In [3]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Configuration ---
DATA_PATH      = '../data/TMDB_all_movies.csv'
MODELS_DIR     = '../models'
POSTER_BASE_URL = "https://image.tmdb.org/t/p/w500"

os.makedirs(MODELS_DIR, exist_ok=True)
print("Setup complete. Artifacts will be saved to:", MODELS_DIR)


Setup complete. Artifacts will be saved to: ../models


In [4]:
try:
    df_raw = pd.read_csv(DATA_PATH)
    print("Loaded:", df_raw.shape)
except FileNotFoundError as e:
    raise e
df_raw.head(2)


Loaded: (1082514, 28)


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,...,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path
0,2,Ariel,7.111,346.0,Released,1988-10-21,0.0,73.0,0.0,tt0094675,...,suomi,"Turo Pajala, Matti Jaaranen, Marja Packalén, J...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Aki Kaurismäki,,7.4,9122.0,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg
1,3,Shadows in Paradise,7.293,409.0,Released,1986-10-17,0.0,74.0,0.0,tt0092149,...,"suomi, English, svenska","Esko Nikkari, Mari Rantasila, Marina Martinoff...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Mika Kaurismäki,,7.4,7937.0,/nj01hspawPof0mJmlgfjuLyJuRN.jpg


In [5]:
"""
    -> Data Cleaning and Preprocessing

"""
cols = ['id','title','overview','genres','cast','director','poster_path','release_date']
df = df_raw[cols].copy()

# --- Safe fillna handling ---
df[['overview','genres','cast','director']] = \
    df[['overview','genres','cast','director']].fillna('')

df = df.dropna(subset=['title'])

# --- Poster URLs ---
df['poster_url'] = df['poster_path'].apply(
    lambda p: f"{POSTER_BASE_URL}{p}" if pd.notna(p) and p else None
)

# --- Normalizer ---
def normalize_list_field(text: str) -> list[str]:
    if not isinstance(text, str) or not text.strip():
        return []
    return [t.strip().lower().replace(' ', '-') for t in text.split(',') if t.strip()]

for col in ['genres', 'cast', 'director']:
    df[col] = df[col].apply(normalize_list_field)

print("Preprocessing done. Shape:", df.shape)
df.head(2)


Preprocessing done. Shape: (1082501, 9)


Unnamed: 0,id,title,overview,genres,cast,director,poster_path,release_date,poster_url
0,2,Ariel,A Finnish man goes to the city to find a job a...,"[comedy, drama, romance, crime]","[turo-pajala, matti-jaaranen, marja-packalén, ...",[aki-kaurismäki],/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,1988-10-21,https://image.tmdb.org/t/p/w500/ojDg0PGvs6R9xY...
1,3,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","[comedy, drama, romance]","[esko-nikkari, mari-rantasila, marina-martinof...",[aki-kaurismäki],/nj01hspawPof0mJmlgfjuLyJuRN.jpg,1986-10-17,https://image.tmdb.org/t/p/w500/nj01hspawPof0m...


In [6]:
"""
    -> the requimendation token
"""
def build_tokens(row: pd.Series) -> str:
    genres = [f"genre={g}" for g in row['genres']]
    cast = [f"actor={a}" for a in row['cast'][:5]]
    director = [f"director={d}" for d in row['director'][:1]]
    overview = str(row['overview'] or '')
    # --- make higher weight --- 
    structured = genres + cast + director 
    return " ".join(structured * 2) + " " + overview
df['text'] = df.apply(build_tokens, axis=1)

# --- TF-IDF vectorizer --- 
tfidf = TfidfVectorizer(stop_words='english' , sublinear_tf=True , min_df=3)
tfidf_matrix = tfidf.fit_transform(df['text'])
print("TF-IDF matrix shape:", tfidf_matrix.shape)




TF-IDF matrix shape: (1082501, 281697)


In [7]:
def recommend_by_filters(
    genres: list[str] | None = None,
    actors: list[str] | None = None,
    keywords: list[str] | None = None,
    k: int = 5
) -> pd.DataFrame:
    """
    Return k movies as a DataFrame that best match the provided filters.
    Each list element should already be lower-cased & hyphenated (like our tokens).
    """
    tokens = (
        [f"genre={g}" for g in (genres or [])] +
        [f"actor={a}" for a in (actors or [])] +
        (keywords or [])
    )
    if not tokens:
        raise ValueError("Provide at least one of genres, actors, or keywords")

    query = " ".join(tokens + tokens) 
    q_vec = tfidf.transform([query])
    scores = (tfidf_matrix @ q_vec.T).toarray().ravel()

    top_idx = np.argpartition(scores, -k)[-k:]
    top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]

    cols_needed = ['id','title','release_date','poster_url','genres','cast']
    return df.iloc[top_idx][cols_needed].reset_index(drop=True)

In [8]:
recommend_by_filters(genres=['action','drama'],actors=['tom-hanks'],keywords=['war'])

Unnamed: 0,id,title,release_date,poster_url,genres,cast
0,1447933,The Americas,,https://image.tmdb.org/t/p/w500/3xoXA77BOjN50q...,[],[tom-hanks]
1,1116589,Die Tom Hanks Story,2021-07-14,,[documentary],[tom-hanks]
2,1322493,crumbs,,,[],[tom-hanks]
3,638302,Major Matt Mason,,,"[adventure, action, family]",[tom-hanks]
4,1198548,A Timeless Call,2008-08-27,https://image.tmdb.org/t/p/w500/eSIJDlKvMt1Oll...,"[documentary, war]",[tom-hanks]


In [11]:
# --- Test al-pacino and robert-de-niro --- 
recommendation_df = recommend_by_filters(
    genres = ['crime','drama'],
    actors = ['al-pacino','robert-de-niro'],
    keywords = ['mafia']
)

recommendation_df.head(5)






Unnamed: 0,id,title,release_date,poster_url,genres,cast
0,1458013,Scarface 3mk w3m 3yalk,,https://image.tmdb.org/t/p/w500/m73Z89kkGQqKPS...,[],"[3zeef, al-pacino]"
1,1006912,Pacino and De Niro: The Conversation,2005-02-22,https://image.tmdb.org/t/p/w500/geTzP43feBnimS...,[documentary],"[jon-voight, robert-de-niro, michael-mann, al-..."
2,651724,The Irishman: In Conversation,2019-11-27,https://image.tmdb.org/t/p/w500/fF704pR8xsNjI7...,[documentary],"[martin-scorsese, robert-de-niro, joe-pesci, a..."
3,328733,Babbleonia,2005-07-12,https://image.tmdb.org/t/p/w500/1VsfOjGGDy6bfi...,[documentary],[al-pacino]
4,926616,Becoming Al Pacino,2022-02-06,https://image.tmdb.org/t/p/w500/sOWjfvdokidKwB...,"[documentary, history, tv-movie]","[al-pacino, sarah-jane-sauvegrain, robert-de-n..."
