In [1]:
import pandas as pd
import joblib
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from surprise import Dataset, Reader, SVD

In [11]:
import os

# Make sure the "models" output directory exists before anything is saved
# into it; exist_ok=True makes this a no-op when it is already there.
os.makedirs("models", exist_ok=True)

In [3]:
# Wall-clock reference point: the SVD training cell subtracts this from
# time.time() to report the total elapsed time of the run.
start_time = time.time()

In [5]:
# Read the merged IMDb CSV and substitute the literal "Unknown" wherever
# the genres value is missing, so the column holds no NaN entries.
movies_df = (
    pd.read_csv("merged_imdb_data.csv")
    .assign(genres=lambda frame: frame["genres"].fillna("Unknown"))
)

## Content-Based Training

In [7]:
# Keep only the highest-rated titles. The ranking key is averageRating --
# not a popularity measure such as vote count -- and the log line says so.
TOP_N_MOVIES = 10_000  # number of rows kept for training

print(f"[INFO] Reducing dataset to top {TOP_N_MOVIES:,} movies by average rating...")
movies_df = (
    movies_df
    .dropna(subset=["averageRating"])  # unrated rows cannot be ranked
    .sort_values(by="averageRating", ascending=False)
    .head(TOP_N_MOVIES)
)
# NOTE(review): the original row labels are kept (no reset_index); the SVD
# cell derives its synthetic user ids from them.

[INFO] Reducing dataset to top 10,000 movies by popularity...


In [13]:
# Content-based model: describe every movie by the TF-IDF vector of its
# genre string, then pre-compute each movie's nearest neighbours under
# cosine distance and persist the vectorizer and the neighbour table.
N_NEIGHBORS = 50  # neighbours requested per movie

print("[INFO] Starting TF-IDF vectorization...")
# NOTE(review): the default token pattern splits on punctuation and English
# stop words are dropped, so a hyphenated genre such as "Sci-Fi" would be
# split into separate tokens -- confirm this is the intended genre vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df["genres"])
print(f"[DONE] TF-IDF shape: {tfidf_matrix.shape}")

print("[INFO] Fitting NearestNeighbors model...")
# kneighbors() raises ValueError when more neighbours are requested than
# there are fitted samples, so cap the request for small datasets.
n_neighbors = min(N_NEIGHBORS, tfidf_matrix.shape[0])
nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine", algorithm="brute")
nn_model.fit(tfidf_matrix)

print("[INFO] Computing all neighbors at once...")
# Row i of `indices` holds 0-based row positions of tfidf_matrix (i.e. the
# current row order of movies_df), not DataFrame index labels. Because the
# query set is the training set, a movie can appear in its own neighbour list.
distances, indices = nn_model.kneighbors(tfidf_matrix)

# Create model directory if it doesn't exist
os.makedirs("models", exist_ok=True)

print("[INFO] Saving content-based models...")
# NOTE(review): only the vectorizer and the positional neighbour table are
# saved; consumers presumably need the same movie ordering to interpret the
# positions -- verify against the loading code.
joblib.dump(tfidf_vectorizer, "models/tfidf_vectorizer.pkl")
joblib.dump(indices, "models/nearest_neighbors_indices.pkl")

[INFO] Starting TF-IDF vectorization...
[DONE] TF-IDF shape: (10000, 27)
[INFO] Fitting NearestNeighbors model...
[INFO] Computing all neighbors at once...
[INFO] Saving content-based models...


['models/nearest_neighbors_indices.pkl']

## Collaborative Filtering (SVD)

In [17]:
# Collaborative-filtering model: fit a Surprise SVD on (user, item, rating)
# triples and persist it, then report the total run time.
SVD_RANDOM_STATE = 42  # fixed seed so the saved model is reproducible

print("[INFO] Preparing SVD collaborative filtering...")
# NOTE(review): there is no real user data here -- user ids are synthesised
# from the row label modulo 500 and each "rating" is the movie's
# averageRating, so every (user, movie) pair occurs exactly once.
movies_df["user_id"] = movies_df.index % 500

reader = Reader(rating_scale=(0, 10))
# load_from_df expects the columns in the order user, item, rating.
data = Dataset.load_from_df(movies_df[["user_id", "primaryTitle", "averageRating"]], reader)
trainset = data.build_full_trainset()

print("[INFO] Training SVD model...")
# SVD initialises its factors randomly; without a seed each run would save
# a different model.
svd_model = SVD(n_factors=50, n_epochs=10, random_state=SVD_RANDOM_STATE)
svd_model.fit(trainset)

print("[INFO] Saving SVD model...")
joblib.dump(svd_model, "models/svd_model.pkl")

# start_time was captured near the top of the notebook, so this covers the
# whole run, not just the SVD fit.
elapsed_time = time.time() - start_time
print(f"Training complete in {elapsed_time:.2f} seconds with {len(movies_df)} movies.")

[INFO] Preparing SVD collaborative filtering...
[INFO] Training SVD model...
[INFO] Saving SVD model...
Training complete in 279.39 seconds with 10000 movies.
