<a href="https://colab.research.google.com/github/ALLEN0607/lumaa_recommender/blob/main/movie_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import pandas as pd
import numpy as np
import sklearn

print("pandas", pd.__version__)
print("numpy", np.__version__)
print("sklearn", sklearn.__version__)

pandas 2.2.2
numpy 1.26.4
sklearn 1.6.1


In [54]:
# Full Dataset
df_movies = pd.read_csv("data/tmdb_5000_movies.csv")
# Small Dataset
#df_movies = df_movies.sample(n=500, random_state=42).reset_index(drop=True)

df_movies.info()

print(df_movies.head(n=10))
print(df_movies.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'data/tmdb_5000_movies.csv'

In [None]:
import ast

# 解析并拼接 genres, keywords
def parse_json_column(json_str):
  try:
    data_list = ast.literal_eval(json_str)   # string -> python object(list or dict)
    names = [d['name'] for d in data_list]
    return " ".join(names)
  except:
    return ""

# 给 df_movies 增加解析列
df_movies["parsed_genres"] = df_movies["genres"].apply(parse_json_column)
df_movies["parsed_keywords"] = df_movies["keywords"].apply(parse_json_column)

# combine overview, genres, keywords
df_movies["combined"] = (
    df_movies["overview"].fillna("") + " " +
    df_movies["parsed_genres"].fillna("") + " " +
    df_movies["parsed_keywords"].fillna("")
)

print(df_movies["combined"])

0      When the Switchblade, the most sophisticated p...
1      In a futuristic London, the rising sea levels ...
2      On a beautiful college campus, something ugly ...
3      Two girlfriends on a summer holiday in Spain b...
4      As Harry begins his sixth year at Hogwarts, he...
                             ...                        
495    The Big Apple's in big trouble, as indestructi...
496    Dr. John Dolittle the beloved doctor is back, ...
497    Orson Welles, as judge Rauch, holds a lengthy ...
498    Everyone deserves a chance to follow their dre...
499    An amphibious shark-like monster terrorizes an...
Name: combined, Length: 500, dtype: object


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# stop_words(the, is, and, ...)
vectorizer = TfidfVectorizer(stop_words='english')
# 对 combined 列做向量化
tfidf_matrix = vectorizer.fit_transform(df_movies["combined"])

# Print tfidf_martix shape (num_movies, num_features)
print("TF-IDF matrix shape: ", tfidf_matrix.shape)


TF-IDF matrix shape:  (500, 7014)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# 推荐电影
def recommend_movies(user_input, df, vectorizer, tfidf_matrix, top_n=5):
  # 将user_input转成TF-IDE向量
  user_vec = vectorizer.transform([user_input])

  similarity_scores = cosine_similarity(user_vec, tfidf_matrix)

  scores = similarity_scores.flatten()

  top_indices = scores.argsort()[::-1][:top_n]

  res = []
  for idx in top_indices:
    movie_title = df.iloc[idx]["title"]
    score = scores[idx]
    res.append((movie_title, score))

  return res

In [None]:
import sys

if __name__ == '__main__':
    if len(sys.argv) > 1:
        user_query = sys.argv[1]
    else:
        user_query = "I love thrilling action movies set in space, with a comedic twist."

    recommendations = recommend_movies(
        user_query,
        df_movies,
        vectorizer,
        tfidf_matrix,
        top_n=5
    )

    for title, score in recommendations:
        print(f"{title}: {score:.4f}")


Jack and Jill: 0.0000
The World Is Not Enough: 0.0000
Exodus: Gods and Kings: 0.0000
Star Trek: 0.0000
Spider-Man: 0.0000
