In [13]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Configuration ---
DATA_PATH      = '../data/TMDB_all_movies.csv'
MODELS_DIR     = '../models'
POSTER_BASE_URL = "https://image.tmdb.org/t/p/w500"

os.makedirs(MODELS_DIR, exist_ok=True)
print("Setup complete. Artifacts will be saved to:", MODELS_DIR)


Setup complete. Artifacts will be saved to: ../models


In [14]:
try:
    df_raw = pd.read_csv(DATA_PATH)
    print("Loaded:", df_raw.shape)
except FileNotFoundError as e:
    raise e
df_raw.head(2)


Loaded: (1082514, 28)


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,...,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path
0,2,Ariel,7.111,346.0,Released,1988-10-21,0.0,73.0,0.0,tt0094675,...,suomi,"Turo Pajala, Matti Jaaranen, Marja Packalén, J...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Aki Kaurismäki,,7.4,9122.0,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg
1,3,Shadows in Paradise,7.293,409.0,Released,1986-10-17,0.0,74.0,0.0,tt0092149,...,"suomi, English, svenska","Esko Nikkari, Mari Rantasila, Marina Martinoff...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Mika Kaurismäki,,7.4,7937.0,/nj01hspawPof0mJmlgfjuLyJuRN.jpg


In [15]:
"""
    -> Data Cleaning and Preprocessing

"""
cols = ['id','title','overview','genres','cast','director','poster_path','release_date']
df = df_raw[cols].copy()

# --- Safe fillna handling ---
df[['overview','genres','cast','director']] = \
    df[['overview','genres','cast','director']].fillna('')

df = df.dropna(subset=['title'])

# --- Poster URLs ---
df['poster_url'] = df['poster_path'].apply(
    lambda p: f"{POSTER_BASE_URL}{p}" if pd.notna(p) and p else None
)

# --- Normalizer ---
def normalize_list_field(text: str) -> list[str]:
    if not isinstance(text, str) or not text.strip():
        return []
    return [t.strip().lower().replace(' ', '-') for t in text.split(',') if t.strip()]

for col in ['genres', 'cast', 'director']:
    df[col] = df[col].apply(normalize_list_field)

print("Preprocessing done. Shape:", df.shape)
df.head(2)


Preprocessing done. Shape: (1082501, 9)


Unnamed: 0,id,title,overview,genres,cast,director,poster_path,release_date,poster_url
0,2,Ariel,A Finnish man goes to the city to find a job a...,"[comedy, drama, romance, crime]","[turo-pajala, matti-jaaranen, marja-packalén, ...",[aki-kaurismäki],/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,1988-10-21,https://image.tmdb.org/t/p/w500/ojDg0PGvs6R9xY...
1,3,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","[comedy, drama, romance]","[esko-nikkari, mari-rantasila, marina-martinof...",[aki-kaurismäki],/nj01hspawPof0mJmlgfjuLyJuRN.jpg,1986-10-17,https://image.tmdb.org/t/p/w500/nj01hspawPof0m...


In [17]:
"""
    -> the requimendation token
"""
def build_tokens(row: pd.Series) -> str:
    genres = [f"genre={g}" for g in row['genres']]
    cast = [f"actor={a}" for a in row['cast'][:5]]
    director = [f"director={d}" for d in row['director'][:1]]
    overview = str(row['overview'] or '')
    # --- make higher weight --- 
    structured = genres + cast + director 
    return " ".join(structured * 2) + " " + overview
df['text'] = df.apply(build_tokens, axis=1)

# --- TF-IDF vectorizer --- 
tfidf = TfidfVectorizer(stop_words='english' , sublinear_tf=True , min_df=3)
tfidf_matrix = tfidf.fit_transform(df['text'])
print("TF-IDF matrix shape:", tfidf_matrix.shape)




TF-IDF matrix shape: (1082501, 281697)


In [None]:
def recommend_by_filters(
    genres: List[str] | None = None , 
    actors: li
)