In [9]:
#imports
import re, string
import joblib
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize

from sentence_transformers import SentenceTransformer

In [10]:
#preprocessing
def clean_series(s: pd.Series) -> pd.Series:
    """Normalize a string Series for text matching.

    Steps, applied element-wise: lowercase, replace every ASCII punctuation
    character with a space, collapse each run of whitespace into a single
    space, and trim leading/trailing whitespace.
    """
    # Character class matching any single character from string.punctuation.
    punct_pattern = f"[{re.escape(string.punctuation)}]"

    lowered = s.str.lower()
    without_punct = lowered.str.replace(punct_pattern, " ", regex=True)
    single_spaced = without_punct.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()


In [11]:
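# MovieFeatureizer builds one feature vector per movie from two parts:
# overview: concatenates the cleaned overview with a director-name tag,
#   runs it through a pre-trained SentenceTransformer to get an embedding,
#   then normalizes the embedding and scales it by weight a
# director: vectorizes the cleaned director name with character-trigram TF-IDF
#   (reduced with TruncatedSVD), then normalizes it and scales it by weight b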

class MovieFeatureizer(BaseEstimator, TransformerMixin):
    """Turn movie rows into dense vectors mixing overview and director signals.

    Each row is mapped to the horizontal concatenation of two parts:

    * a SentenceTransformer embedding of
      ``"<overview> [SEP] director: <director>"``, row-normalized and
      multiplied by ``a``;
    * a character-trigram TF-IDF encoding of the director name, reduced with
      TruncatedSVD, row-normalized and multiplied by ``b``.

    Parameters
    ----------
    model_name : str
        Model name handed to ``SentenceTransformer`` in ``fit``.
    a : float
        Weight applied to the normalized overview embedding.
    b : float
        Weight applied to the normalized director vector.
    svd_dims : int
        Requested number of SVD components for the director vector. At fit
        time it is capped at the size of the TF-IDF vocabulary.

    The input DataFrame must have ``"Overview"`` and ``"Director"`` columns.
    Missing values in either column are treated as empty strings.

    NOTE(review): character trigrams score director names that share
    substrings (e.g. a common surname suffix) as similar, and with b > a the
    director part dominates the combined vector. Confirm this is intended.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", a=1.0, b=5.0, svd_dims=50):
        self.model_name = model_name
        self.a = a
        self.b = b
        self.svd_dims = svd_dims

    @staticmethod
    def _clean_column(X: pd.DataFrame, column: str) -> pd.Series:
        """Return the cleaned text of one column, mapping missing values to ''."""
        # NaN would otherwise propagate through string concatenation and be
        # rejected by both the sentence encoder and the TF-IDF vectorizer.
        return clean_series(X[column].fillna("").astype(str))

    def fit(self, X: pd.DataFrame, y=None):
        """Load the sentence encoder and fit TF-IDF + SVD on the director names.

        Returns ``self``. ``y`` is ignored.
        """
        self.sbert = SentenceTransformer(self.model_name)
        self.tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3, 3))

        # fit_transform builds the vocabulary and the matrix in a single pass.
        dir_tfidf = self.tfidf.fit_transform(self._clean_column(X, "Director"))

        # TruncatedSVD (randomized solver) rejects n_components > n_features,
        # which happens on small datasets with few distinct trigrams.
        n_components = min(self.svd_dims, dir_tfidf.shape[1])
        self.svd = TruncatedSVD(n_components, random_state=42)
        self.svd.fit(dir_tfidf)
        return self

    def transform(self, X: pd.DataFrame):
        """Return the stacked ``[overview embedding | director vector]`` array.

        The result has one row per row of ``X``.
        """
        over = self._clean_column(X, "Overview")
        dirs = self._clean_column(X, "Director")
        texts = (over + " [SEP] director: " + dirs).tolist()

        emb = self.sbert.encode(texts, show_progress_bar=False)
        emb = normalize(emb) * self.a

        D = self.svd.transform(self.tfidf.transform(dirs))
        D = normalize(D) * self.b

        return np.hstack([emb, D])


In [21]:
# Combines feature extraction and nearest-neighbour search in a single class, which makes training, evaluating, and saving the model easier to manage
class MovieRecommender:
    """Content-based movie recommender: MovieFeatureizer + cosine k-NN.

    Parameters
    ----------
    k : int
        Default number of neighbours used by ``precision_at_k`` and
        ``recommend``.
    a, b : float
        Weights forwarded to ``MovieFeatureizer``.

    After ``fit``, ``self.df`` holds the training rows (rows with a missing
    ``Overview`` or ``Director`` removed, index reset to 0..n-1), so row
    positions in ``self.df`` match the indices returned by the k-NN index.
    """

    def __init__(self, k=10, a=1.0, b=5.0):
        self.k = k
        self.featureizer = MovieFeatureizer(a=a, b=b)
        # k + 1: a query that is itself part of the index is returned as one
        # of its own neighbours and has to be discarded afterwards.
        self.knn = NearestNeighbors(n_neighbors=k + 1, metric="cosine", algorithm="brute")

    def fit(self, df: pd.DataFrame):
        """Fit the featureizer and the k-NN index on ``df``; returns ``self``."""
        df = df.dropna(subset=["Overview", "Director"]).reset_index(drop=True)
        self.df = df.copy()
        feats = self.featureizer.fit_transform(df)
        self.knn.fit(feats)
        return self

    def precision_at_k(self, k=None):
        """Mean share of each movie's ``k`` nearest neighbours with the same director.

        ``k`` defaults to ``self.k``. Every fitted movie is used as a query.
        """
        k = self.k if k is None else k
        # With no X, kneighbors queries the indexed points themselves and does
        # not count a point as its own neighbour. This avoids re-encoding the
        # whole dataset and avoids assuming that column 0 is always the query
        # (which fails when several rows are at distance 0).
        _, idx = self.knn.kneighbors(n_neighbors=k)
        directors = self.df["Director"].to_numpy()
        hits = directors[idx] == directors[:, None]
        return hits.mean()

    def recommend(self, title: str, n=None):
        """Return up to ``n`` titles most similar to ``title``.

        The title is matched case-insensitively against ``Series_Title``; the
        first matching row is used as the query. ``n`` defaults to ``self.k``.
        Fewer than ``n`` titles are returned when the index holds fewer than
        ``n + 1`` movies.

        Raises
        ------
        ValueError
            If no row matches ``title`` or ``n`` is negative.
        """
        n = self.k if n is None else n
        if n < 0:
            raise ValueError(f"n must be non-negative, got {n}.")

        titles = self.df["Series_Title"]
        mask = (titles.str.lower() == title.lower()).fillna(False).to_numpy(dtype=bool)
        if not mask.any():
            raise ValueError(f"Title '{title}' not found.")
        pos = int(np.flatnonzero(mask)[0])

        qf = self.featureizer.transform(self.df.iloc[[pos]])
        # Ask for one extra neighbour (the query itself), but never more than
        # the index contains.
        n_query = min(n + 1, len(self.df))
        nbrs = self.knn.kneighbors(qf, n_neighbors=n_query,
                                   return_distance=False)[0]
        # Remove the query by position rather than dropping column 0: with
        # tied distances the query is not guaranteed to come first.
        keep = [int(j) for j in nbrs if int(j) != pos][:n]
        return titles.iloc[keep].tolist()

    def save(self, path: str):
        """Serialize the whole recommender to ``path`` with joblib."""
        joblib.dump(self, path)

    @classmethod
    def load(cls, path: str):
        """Load a recommender previously written by ``save``.

        Raises
        ------
        TypeError
            If the file does not contain an instance of this class.
        """
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only load files from a trusted source.
        obj = joblib.load(path)
        if not isinstance(obj, cls):
            raise TypeError(
                f"Expected a {cls.__name__} in '{path}', got {type(obj).__name__}."
            )
        return obj


In [22]:
# Fit the recommender on the CSV at csv_path, then report the share of each
# movie's 10 nearest neighbours that have the same director.
csv_path = "imdb_top_1000.csv"
df = pd.read_csv(csv_path)

mr = MovieRecommender(k=10)
mr.fit(df)
print(f"Precision@10: {mr.precision_at_k():.4f}")


Precision@10: 0.2280


In [23]:
# Persist the fitted recommender to disk.
mr.save(path="movie_recommender.pkl")
print("Model saved to movie_recommender.pkl")


Model saved to movie_recommender.pkl


In [24]:
# Round-trip check: reload the saved recommender and query it by title.
mr2 = MovieRecommender.load(path="movie_recommender.pkl")
print(mr2.recommend(title="The Matrix", n=10))


['Pirates of the Caribbean: The Curse of the Black Pearl', 'Chinatown', 'Trois couleurs: Rouge', 'Repulsion', 'Le locataire', 'Trois couleurs: Blanc', 'Trois couleurs: Bleu', 'La double vie de Véronique', "Rosemary's Baby", 'The Pianist']
