In [1]:
import pandas as pd
import numpy as np

# --- Step 1: Define Path and Load Data ---

DATA_PATH = '../data/TMDB_all_movies.csv'

# Only the read itself is attempted inside `try`; the success message lives in
# the `else` branch so it runs solely when the CSV was read without error.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: The file was not found at the specified path: {DATA_PATH}")
    print("Please make sure the dataset is in the 'data/' folder and the filename is correct.")
    # A missing file leaves `df` as None so later code can test for it.
    df = None
else:
    print("Dataset loaded successfully!")

# --- Step 2: Initial Data Inspection ---

# Inspection is skipped entirely when loading failed.
if df is not None:
    # (rows, columns) of the raw frame.
    print(f"\nDataFrame shape: {df.shape}\n")

    # Preview of the first five rows; display() renders the rich table in Jupyter.
    print("First 5 rows of the DataFrame (df.head()):")
    first_rows = df.head()
    display(first_rows)

    # Column names, non-null counts and dtypes: the summary the cleaning
    # steps are based on.
    print("\nDataFrame Info (df.info()):")
    df.info()

Dataset loaded successfully!

DataFrame shape: (1113651, 28)

First 5 rows of the DataFrame (df.head()):


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,...,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path
0,2,Ariel,7.1,346.0,Released,1988-10-21,0.0,73.0,0.0,tt0094675,...,suomi,"Merja Pulkkinen, Eetu Hilkamo, Turo Pajala, Es...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Aki Kaurismäki,,7.4,9232.0,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg
1,3,Shadows in Paradise,7.293,409.0,Released,1986-10-17,0.0,74.0,0.0,tt0092149,...,"suomi, English, svenska","Esko Nikkari, Mari Rantasila, Marina Martinoff...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Mika Kaurismäki,,7.4,8068.0,/nj01hspawPof0mJmlgfjuLyJuRN.jpg
2,5,Four Rooms,5.9,2698.0,Released,1995-12-09,4257354.0,98.0,4000000.0,tt0113101,...,English,"Antonio Banderas, Paul Skemp, Lana McKissack, ...","Allison Anders, Alexandre Rockwell, Quentin Ta...","Andrzej Sekula, Guillermo Navarro, Phil Parmet...","Allison Anders, Alexandre Rockwell, Quentin Ta...","Lawrence Bender, Alexandre Rockwell, Quentin T...",Combustible Edison,6.7,114449.0,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg
3,6,Judgment Night,6.5,351.0,Released,1993-10-15,12136938.0,109.0,21000000.0,tt0107286,...,English,"Deirdre Kelly, Peter Greene, Will Zahrn, Jerem...",Stephen Hopkins,Peter Levy,"Jere Cunningham, Lewis Colick","Marilyn Vance, Lloyd Segan, Gene Levy",Alan Silvestri,6.6,20058.0,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg
4,8,Life in Loops (A Megacities RMX),7.5,27.0,Released,2006-01-01,0.0,80.0,42000.0,tt0825671,...,"English, हिन्दी, 日本語, Pусский, Español",,Timo Novotny,Wolfgang Thaler,"Michael Glawogger, Timo Novotny","Ulrich Gehmacher, Timo Novotny",,8.1,285.0,/7ln81BRnPR2wqxuITZxEciCe1lc.jpg



DataFrame Info (df.info()):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113651 entries, 0 to 1113650
Data columns (total 28 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   id                       1113651 non-null  int64  
 1   title                    1113631 non-null  object 
 2   vote_average             1113649 non-null  float64
 3   vote_count               1113649 non-null  float64
 4   status                   1113649 non-null  object 
 5   release_date             983635 non-null   object 
 6   revenue                  1113649 non-null  float64
 7   runtime                  1113649 non-null  float64
 8   budget                   1113649 non-null  float64
 9   imdb_id                  624738 non-null   object 
 10  original_language        1113649 non-null  object 
 11  original_title           1113632 non-null  object 
 12  overview                 919811 non-null   object 
 13  popularity   

In [2]:
# This cell performs the initial data cleaning and preparation.

# Fail fast with a readable message: the loading cell leaves `df` as None when
# the CSV is missing, and indexing None below would raise an opaque TypeError.
if df is None:
    raise RuntimeError(
        "The raw dataset was not loaded (df is None). "
        "Fix the data path in the loading cell and re-run it before cleaning."
    )

# --- 1. Select Relevant Columns ---
# We select only the columns we will need for features and final output.
columns_to_keep = [
    'id', 'title', 'overview', 'genres', 'cast', 'director',
    'vote_average', 'vote_count', 'release_date', 'poster_path', 'status'
]

# --- 2. Filter for Released Movies / 3. Drop Rows with Critical Missing Information ---
# Only released movies are kept, and a movie needs a title, an overview and
# genres to be usable by the content-based system.
# The chain ends with .copy() so every later column assignment operates on an
# independent frame rather than on a filtered slice of `df` (this is what
# actually avoids SettingWithCopyWarning).
df_clean = (
    df[columns_to_keep]
    .loc[lambda frame: frame['status'] == 'Released']
    .dropna(subset=['title', 'overview', 'genres'])
    .copy()
)


# --- 4. Correct Data Types ---
# Convert date to datetime objects for potential future use; unparseable
# dates become NaT.
df_clean['release_date'] = pd.to_datetime(df_clean['release_date'], errors='coerce')

# Ensure numeric columns are numeric, filling missing/unparseable values with 0.
df_clean['vote_count'] = pd.to_numeric(df_clean['vote_count'], errors='coerce').fillna(0)
df_clean['vote_average'] = pd.to_numeric(df_clean['vote_average'], errors='coerce').fillna(0)


# --- 5. Drop Undated Rows and Reset the Index ---
# Only release_date can still be null here (the vote columns were just filled
# with 0), so it is the only column that needs checking.
# The index is reset to be clean and contiguous before proceeding.
df_clean = df_clean.dropna(subset=['release_date']).reset_index(drop=True)


print("Initial cleaning and pre-processing complete.")
print(f"DataFrame shape after initial cleaning: {df_clean.shape}")
display(df_clean.head())

Initial cleaning and pre-processing complete.
DataFrame shape after initial cleaning: (644474, 11)


Unnamed: 0,id,title,overview,genres,cast,director,vote_average,vote_count,release_date,poster_path,status
0,2,Ariel,A Finnish man goes to the city to find a job a...,"Comedy, Drama, Romance, Crime","Merja Pulkkinen, Eetu Hilkamo, Turo Pajala, Es...",Aki Kaurismäki,7.1,346.0,1988-10-21,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,Released
1,3,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","Comedy, Drama, Romance","Esko Nikkari, Mari Rantasila, Marina Martinoff...",Aki Kaurismäki,7.293,409.0,1986-10-17,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,Released
2,5,Four Rooms,It's Ted the Bellhop's first night on the job....,Comedy,"Antonio Banderas, Paul Skemp, Lana McKissack, ...","Allison Anders, Alexandre Rockwell, Quentin Ta...",5.9,2698.0,1995-12-09,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,Released
3,6,Judgment Night,"Four young friends, while taking a shortcut en...","Action, Crime, Thriller","Deirdre Kelly, Peter Greene, Will Zahrn, Jerem...",Stephen Hopkins,6.5,351.0,1993-10-15,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,Released
4,8,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,Documentary,,Timo Novotny,7.5,27.0,2006-01-01,/7ln81BRnPR2wqxuITZxEciCe1lc.jpg,Released


In [3]:
# This cell defines helper functions to parse text-based feature columns.

def parse_comma_separated_list(text_data, top_n=None):
    """Parses a comma-separated string into a list of trimmed, non-empty strings.

    Args:
        text_data: The raw cell value. Anything that is not a ``str``
            (e.g. NaN for a missing value) yields an empty list.
        top_n: Optional maximum number of items to return. ``None`` (the
            default) returns every item; ``0`` returns an empty list.

    Returns:
        A list of the items in their original order, with surrounding
        whitespace removed and blank entries dropped.
    """
    if not isinstance(text_data, str):
        return []
    stripped = (item.strip() for item in text_data.split(','))
    # Blank entries (from "", "a,,b" or a trailing comma) carry no information
    # and would otherwise show up as '' in the resulting feature lists.
    items = [item for item in stripped if item]
    # Compare against None explicitly so that top_n=0 means "no items"
    # instead of being mistaken for "no limit".
    return items if top_n is None else items[:top_n]

def clean_names_in_list(name_list):
    """Normalizes every entry of a list to a lowercase token without spaces.

    Each entry is converted with ``str()``, lowercased, and stripped of all
    space characters. Any input that is not a ``list`` produces ``[]``.
    """
    cleaned = []
    if isinstance(name_list, list):
        for raw_name in name_list:
            # Splitting on ' ' and re-joining removes every space character.
            cleaned.append("".join(str(raw_name).lower().split(' ')))
    return cleaned

print("Helper functions for parsing defined successfully.")

Helper functions for parsing defined successfully.


In [4]:
# This cell aggressively filters the dataset to create a high-quality pool.
# NOTE: it reassigns `df_clean`, so it is not idempotent. Running it a second
# time recomputes the quantile on the already-filtered pool and filters again.
# Re-run the cleaning cell first if this cell needs to be repeated.

# --- 1. Apply Feature Parsing ---
# We apply the functions defined above to create structured feature lists.
# We only take the top 3 cast members. Director and genres are fully parsed.
# Cast and director names are additionally lowercased with spaces removed;
# genres are kept as written.
df_clean['genres_list'] = df_clean['genres'].apply(parse_comma_separated_list)
df_clean['cast_list'] = df_clean['cast'].apply(lambda x: clean_names_in_list(parse_comma_separated_list(x, top_n=3)))
df_clean['director_list'] = df_clean['director'].apply(lambda x: clean_names_in_list(parse_comma_separated_list(x)))

# --- 2. Filter by Vote Count Quantile ---
# We only keep movies that are in the top 10% by number of votes.
m = df_clean['vote_count'].quantile(0.90)
print(f"Original number of movies: {len(df_clean)}")
print(f"Minimum votes required to be included (90th percentile): {m}")

# .copy() makes the filtered pool an independent frame, so the later column
# assignments and de-duplication never act on a slice of the previous frame.
df_clean = df_clean[df_clean['vote_count'] >= m].copy()
print(f"Number of movies after vote count filtering: {len(df_clean)}")

# --- 3. Drop Duplicate Titles and Reset Index ---
# This ensures each title is unique (the first occurrence wins) and the index
# is clean and contiguous for model generation.
df_clean = (
    df_clean
    .drop_duplicates(subset='title', keep='first')
    .reset_index(drop=True)
)
print(f"Final number of unique, high-quality movies: {len(df_clean)}")

Original number of movies: 644474
Minimum votes required to be included (90th percentile): 16.0
Number of movies after vote count filtering: 66331
Final number of unique, high-quality movies: 60666


In [5]:
# This cell calculates the IMDb Weighted Rating (WR) for our high-quality movie pool.

# C: the average of `vote_average` over every movie left in the filtered pool.
pool_ratings = df_clean['vote_average']
C = pool_ratings.mean()
print(f"Mean rating for the pool (C): {C:.2f}")

# m: the 90th-percentile vote-count threshold, reused as computed by the
# filtering cell (it is not recalculated here).
print(f"Vote threshold (m): {m}")

def weighted_rating(x, m=m, C=C):
    """Computes the weighted rating WR = v/(v+m) * R + m/(v+m) * C.

    Args:
        x: Mapping-like object exposing 'vote_count' (v) and 'vote_average' (R).
        m: Vote-count weight given to the prior; defaults to the module-level
            `m` as it was when this function was defined.
        C: Prior (pool mean) rating; defaults to the module-level `C` as it
            was when this function was defined.

    Returns:
        The rating R pulled towards C, more strongly the fewer votes there are.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total_weight = votes + m
    rating_part = votes / total_weight * rating
    prior_part = m / total_weight * C
    return rating_part + prior_part

# Create the new 'wr' (Weighted Rating) column.
# weighted_rating only does arithmetic on x['vote_count'] and x['vote_average'],
# so it can be handed the whole DataFrame and evaluated column-wise in a single
# vectorized pass instead of being called once per row through apply(axis=1).
df_clean['wr'] = weighted_rating(df_clean)

print("\nWeighted Rating calculated. Top 10 movies by WR:")
# Display the results to verify
display(df_clean[['title', 'vote_count', 'vote_average', 'wr']].sort_values('wr', ascending=False).head(10))

Mean rating for the pool (C): 6.19
Vote threshold (m): 16.0

Weighted Rating calculated. Top 10 movies by WR:


Unnamed: 0,title,vote_count,vote_average,wr
58295,The Way to the Heart,142.0,9.9,9.524021
48612,Nude,162.0,9.414,9.123951
53945,BTS World Tour: Love Yourself - Japan Edition,326.0,9.2,9.05905
44720,"What's New, Scooby-Doo? Vol. 7: Ready to Scare",47.0,10.0,9.031671
45659,What's New Scooby-Doo? Vol. 3: Halloween Boos ...,52.0,9.9,9.026401
49001,What's New Scooby-Doo? Vol. 4: Merry Scary Hol...,46.0,10.0,9.016053
39582,Scooby-Doo! and the Werewolves,62.0,9.7,8.979426
46895,What's New Scooby-Doo? Vol. 10: Monstrous Tails,54.0,9.8,8.974218
44719,"What's New, Scooby-Doo? Vol. 5: Sports Spookta...",48.0,9.9,8.971801
44506,"What's New, Scooby-Doo? Vol. 7: Ghosts on the Go!",48.0,9.9,8.971801


In [6]:
# --- Phase 3: Building the Collaborative Filtering Engine ---
# Cell 1: Load and Prepare MovieLens Data for the 'Surprise' library

from surprise import Reader, Dataset
import pandas as pd
from pathlib import Path

print("--- Loading MovieLens 'small' dataset ---")

# Define path to the data
DATA_DIR = Path('../data/')
ratings_path = DATA_DIR / 'ml-latest-small' / 'ratings.csv'

# Load the ratings data. Only the file read is guarded, so an unrelated error
# while building the Surprise dataset is not mistaken for a missing file.
try:
    ratings_df = pd.read_csv(ratings_path)
except FileNotFoundError:
    print(f"Error: Ratings file not found at {ratings_path}.")
    print("Please make sure you have downloaded and placed the 'ml-latest-small' folder in the 'data/' directory.")
    # Leave an explicit marker so later cells can detect that loading failed.
    data = None
else:
    # The 'surprise' library needs to know the format of your data.
    # MovieLens 'latest' ratings use half-star steps from 0.5 to 5.0, and
    # Surprise clips predictions to the declared scale, so the lower bound
    # must be 0.5 rather than 1.
    reader = Reader(rating_scale=(0.5, 5.0))

    # Load the data from the pandas DataFrame into the surprise Dataset object.
    # The columns MUST be in the order: user, item, rating
    data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

    print("MovieLens data loaded and prepared for Surprise successfully.")
    display(ratings_df.head())

--- Loading MovieLens 'small' dataset ---
MovieLens data loaded and prepared for Surprise successfully.


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# =================================================================================
# === FINAL & COMPLETE ARTIFACT GENERATION CELL (BGE + SVD) ===
# This single cell contains all logic to create the final models and artifacts.
# =================================================================================
import os
import pandas as pd
import numpy as np
import joblib
import torch
from sentence_transformers import SentenceTransformer
from surprise import SVD, Reader, Dataset
from surprise.model_selection import cross_validate
from pathlib import Path

# --- A. CONTENT ENGINE (BGE) ---

print("--- Part A: Building upgraded Content-Based Engine with BGE ---")

# --- 1. Create the "Narrative Document" for each movie ---
# Each movie's features are merged into one short paragraph of plain sentences,
# which is the text that gets encoded into an embedding further down.
print("\nCreating narrative documents for semantic encoding...")

def _non_empty_strings(value):
    """Returns the non-blank string entries of a list/tuple, or [] for anything else."""
    if not isinstance(value, (list, tuple)):
        return []
    return [item for item in value if isinstance(item, str) and item.strip()]

def _text_or_empty(value):
    """Returns `value` unchanged when it is a non-blank string, otherwise ''."""
    return value if isinstance(value, str) and value.strip() else ""

def create_narrative_document(row):
    """Creates a natural language paragraph combining key movie features.

    Args:
        row: Mapping-like object (anything with ``.get``, e.g. a dict or a
            pandas Series) that may provide 'title', 'overview',
            'genres_list', 'director_list' and 'cast_list'.

    Returns:
        A single string made of up to five sentences (title, genres, director,
        cast, plot) joined by spaces. A sentence is left out when its field is
        missing, empty, or not of the expected type (for example NaN), so the
        text never contains fragments such as "It is a  film." or "nan".
    """
    # NOTE(review): the cell that builds cast_list/director_list lowercases the
    # names and removes their spaces, so they appear here as tokens such as
    # "akikaurismäki" rather than natural names. Confirm this is intended for a
    # sentence-embedding model.
    directors = ' and '.join(_non_empty_strings(row.get('director_list', [])))
    genres = ' and '.join(_non_empty_strings(row.get('genres_list', [])))
    cast_list = _non_empty_strings(row.get('cast_list', []))
    title = _text_or_empty(row.get('title', ''))
    overview = _text_or_empty(row.get('overview', ''))

    # English list formatting: "a", "a and b", "a, b and c".
    if len(cast_list) > 1:
        cast_str = ', '.join(cast_list[:-1]) + ' and ' + cast_list[-1]
    elif cast_list:
        cast_str = cast_list[0]
    else:
        cast_str = ""

    title_sentence = f"The movie is titled {title}." if title else ""
    genre_sentence = f"It is a {genres} film." if genres else ""
    director_sentence = f"This film was directed by {directors}." if directors else ""
    cast_sentence = f"It stars {cast_str}." if cast_str else ""
    overview_sentence = f"The plot is as follows: {overview}" if overview else ""

    # filter(None, ...) drops the sentences that were left empty above.
    narrative = ' '.join(filter(None, [title_sentence, genre_sentence, director_sentence, cast_sentence, overview_sentence]))
    return narrative

# Apply the function to the final cleaned DataFrame
# (Assuming df_clean is the final output of your data cleaning cells)
df_clean['narrative_text'] = df_clean.apply(create_narrative_document, axis=1)
print("Narrative documents created successfully.")


# --- 2. Generate BGE Embeddings from the new narrative text ---
# Configure Environment
# The cache location can be overridden by exporting SENTENCE_TRANSFORMERS_HOME
# before starting the kernel; the original drive path remains the fallback so
# existing setups keep working.
cache_dir = os.environ.get('SENTENCE_TRANSFORMERS_HOME') or "F:/huggingface_cache"
os.environ['HF_HOME'] = cache_dir
os.environ['SENTENCE_TRANSFORMERS_HOME'] = cache_dir
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"\nUsing device: {device}")

# Load BGE Model
# cache_folder is passed explicitly because the environment variables above are
# assigned after the libraries were already imported, so they may not be
# honoured by code that read them at import time.
print("Loading BGE model (BAAI/bge-base-en-v1.5)...")
bge_model = SentenceTransformer('BAAI/bge-base-en-v1.5', device=device, cache_folder=cache_dir)
print("BGE model loaded.")

# Encode the newly created 'narrative_text' column.
# The list keeps DataFrame row order, so embedding row i belongs to df_clean row i.
print("\nGenerating BGE embeddings from narrative text...")
sentences = df_clean['narrative_text'].tolist()
bge_embeddings = bge_model.encode(sentences, show_progress_bar=True, device=device)
print(f"BGE embeddings generated. Shape: {bge_embeddings.shape}")


# --- B. COLLABORATIVE FILTERING ENGINE (SVD) ---

print("\n\n--- Part B: Building Collaborative Filtering Engine with SVD ---")
# (Assuming 'data' object for Surprise was created from MovieLens ratings in a previous cell)
# The loading cell sets `data = None` when the ratings file is missing, so the
# name merely existing is not enough: treat both "undefined" and "None" as
# "no data available" instead of crashing on None.build_full_trainset().
if globals().get('data') is not None:
    algo_svd = SVD(n_factors=100, n_epochs=20, random_state=42)
    print("\nTraining SVD model on the full dataset...")
    # No hold-out split: the model is fitted on every available rating.
    trainset = data.build_full_trainset()
    algo_svd.fit(trainset)
    print("SVD model trained successfully.")
else:
    print("\nWarning: 'data' object for Surprise not found. Skipping SVD model training.")
    # Marker for the saving step that there is no model to persist.
    algo_svd = None


# --- C. SAVE ALL FINAL ARTIFACTS ---

print("\n\n--- Part C: Saving all synchronized artifacts ---")
# Create final, synchronized DataFrame and Index Map from the df_clean
columns_to_save = ['id', 'title', 'overview', 'poster_path', 'vote_count', 'vote_average', 'wr']
final_api_df = df_clean[columns_to_save].copy()
# Maps each title to its row label in final_api_df.
final_indices_map = pd.Series(final_api_df.index, index=final_api_df['title'])

# The artifacts are saved as one "synchronized" set: presumably embedding row i
# is meant to describe DataFrame row i, so refuse to write a set whose row
# counts differ (e.g. df_clean was re-filtered after the embeddings were built).
if len(bge_embeddings) != len(final_api_df):
    raise ValueError(
        f"Embeddings ({len(bge_embeddings)} rows) and movies DataFrame "
        f"({len(final_api_df)} rows) are out of sync; regenerate the embeddings before saving."
    )

# Define save paths
SAVE_DIR = Path('../movie_recommender/saved_models/')
os.makedirs(SAVE_DIR, exist_ok=True)
BGE_EMBEDDINGS_PATH = SAVE_DIR / 'bge_embeddings.npy'
DF_PATH = SAVE_DIR / 'movies_df.joblib'
INDICES_PATH = SAVE_DIR / 'indices_map.joblib'
SVD_MODEL_PATH = SAVE_DIR / 'svd_model.joblib'

# Save the files
np.save(BGE_EMBEDDINGS_PATH, bge_embeddings)
print(f"BGE embeddings saved to: {BGE_EMBEDDINGS_PATH}")

joblib.dump(final_api_df, DF_PATH)
print(f"DataFrame saved to: {DF_PATH}")

joblib.dump(final_indices_map, INDICES_PATH)
print(f"Indices map saved to: {INDICES_PATH}")

# The SVD model is optional: it is None when training was skipped.
if algo_svd is not None:
    joblib.dump(algo_svd, SVD_MODEL_PATH)
    print(f"SVD model saved to: {SVD_MODEL_PATH}")
    print("\n--- SUCCESS! All final artifacts are generated and ready for the web app. ---")
else:
    # Do not claim that everything was written when the SVD model is missing.
    print("\n--- Content-based artifacts saved, but the SVD model was skipped and NOT saved. ---")

--- Part A: Building upgraded Content-Based Engine with BGE ---

Creating narrative documents for semantic encoding...
Narrative documents created successfully.

Using device: cuda
Loading BGE model (BAAI/bge-base-en-v1.5)...
BGE model loaded.

Generating BGE embeddings from narrative text...


Batches:   0%|          | 0/1896 [00:00<?, ?it/s]

BGE embeddings generated. Shape: (60666, 768)


--- Part B: Building Collaborative Filtering Engine with SVD ---

Training SVD model on the full dataset...
SVD model trained successfully.


--- Part C: Saving all synchronized artifacts ---
BGE embeddings saved to: ..\movie_recommender\saved_models\bge_embeddings.npy
DataFrame saved to: ..\movie_recommender\saved_models\movies_df.joblib
Indices map saved to: ..\movie_recommender\saved_models\indices_map.joblib
SVD model saved to: ..\movie_recommender\saved_models\svd_model.joblib

--- SUCCESS! All final artifacts are generated and ready for the web app. ---
