# Movie Recommender System

In [2]:
pip install pandas numpy scikit-learn nltk tqdm

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
import ast
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Data

In [3]:
movies_path = "movies_metadata.csv"
credits_path = "credits.csv"

movies = pd.read_csv(movies_path, low_memory=False)
credits = pd.read_csv(credits_path)

print("Movies shape:", movies.shape)
print("Credits shape:", credits.shape)

Movies shape: (45466, 24)
Credits shape: (45476, 3)


### CLEAN & PREP IDs, MERGE

In [6]:
# movies id sometimes non-numeric; coerce to numeric
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')

# Drop rows where id is NaN
movies = movies.dropna(subset=['id'])
credits = credits.dropna(subset=['id'])

# Convert to int for safe merging
movies['id'] = movies['id'].astype(int)
credits['id'] = credits['id'].astype(int)

# Merge on 'id'
df = movies.merge(credits, on='id', how='left')
print("Merged df shape:", df.shape)

# FIX: Select only existing columns safely 
available_cols = df.columns.tolist()
required_cols = ['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']

# Filter to keep only the ones that actually exist
selected_cols = [col for col in required_cols if col in available_cols]
df = df[selected_cols]

print("Columns in use:", selected_cols)
print(df.head(2).T)


Merged df shape: (45539, 26)
Columns in use: ['id', 'title', 'overview', 'genres', 'cast', 'crew']
                                                          0  \
id                                                      862   
title                                             Toy Story   
overview  Led by Woody, Andy's toys live happily in his ...   
genres    [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
cast      [{'cast_id': 14, 'character': 'Woody (voice)',...   
crew      [{'credit_id': '52fe4284c3a36847f8024f49', 'de...   

                                                          1  
id                                                     8844  
title                                               Jumanji  
overview  When siblings Judy and Peter discover an encha...  
genres    [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...  
cast      [{'cast_id': 1, 'character': 'Alan Parrish', '...  
crew      [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...  


### PARSERS: genres, keywords, cast, crew

In [8]:
# Helper: safely parse JSON-like string column to Python list/dict 
def parse_json_field(x):
    if pd.isna(x):
        return []
    if isinstance(x, list):
        return x
    try:
        return ast.literal_eval(x)
    except Exception:
        try:
            s = x.replace("'", '"')
            return ast.literal_eval(s)
        except Exception:
            return []

# Extract genre names from genres column 
def extract_genres(genres_field):
    parsed = parse_json_field(genres_field)
    return [g.get('name', '') for g in parsed if isinstance(g, dict) and g.get('name')]

# Extract keywords (only if available) 
def extract_keywords(keywords_field):
    parsed = parse_json_field(keywords_field)
    return [k.get('name', '') for k in parsed if isinstance(k, dict) and k.get('name')]

# Extract top N cast names (actors) 
def extract_cast(cast_field, top_n=3):
    parsed = parse_json_field(cast_field)
    names = []
    for c in parsed:
        if isinstance(c, dict) and c.get('name'):
            names.append(c['name'])
        if len(names) >= top_n:
            break
    return names

# Extract director from crew 
def extract_director(crew_field):
    parsed = parse_json_field(crew_field)
    for c in parsed:
        if isinstance(c, dict) and c.get('job') and c.get('name'):
            if c.get('job').lower() == 'director':
                return c.get('name')
    return ''

# Apply parsers safely 
tqdm.pandas()

# Apply only on columns that exist
if 'genres' in df.columns:
    df['genres_parsed'] = df['genres'].progress_apply(extract_genres)

if 'keywords' in df.columns:
    df['keywords_parsed'] = df['keywords'].progress_apply(extract_keywords)
else:
    df['keywords_parsed'] = [[] for _ in range(len(df))]  # empty placeholder

if 'cast' in df.columns:
    df['cast_parsed'] = df['cast'].progress_apply(extract_cast)

if 'crew' in df.columns:
    df['director'] = df['crew'].progress_apply(extract_director)

#  Quick check 
print(df[['title', 'genres_parsed', 'keywords_parsed', 'cast_parsed', 'director']].head())


100%|██████████| 45539/45539 [00:02<00:00, 21497.20it/s]
100%|██████████| 45539/45539 [00:32<00:00, 1413.53it/s]
100%|██████████| 45539/45539 [00:25<00:00, 1796.67it/s]


                         title                 genres_parsed keywords_parsed  \
0                    Toy Story   [Animation, Comedy, Family]              []   
1                      Jumanji  [Adventure, Fantasy, Family]              []   
2             Grumpier Old Men             [Romance, Comedy]              []   
3            Waiting to Exhale      [Comedy, Drama, Romance]              []   
4  Father of the Bride Part II                      [Comedy]              []   

                                         cast_parsed         director  
0                [Tom Hanks, Tim Allen, Don Rickles]    John Lasseter  
1     [Robin Williams, Jonathan Hyde, Kirsten Dunst]     Joe Johnston  
2         [Walter Matthau, Jack Lemmon, Ann-Margret]    Howard Deutch  
3  [Whitney Houston, Angela Bassett, Loretta Devine]  Forest Whitaker  
4         [Steve Martin, Diane Keaton, Martin Short]    Charles Shyer  


### BUILD "tags" (bag of words)

In [9]:
# Text cleaning helper
stop_words = set(stopwords.words('english'))

def clean_and_join(list_of_strings):
    # keep only alphanumeric, remove spaces in multi-word tokens by joining with ''
    # lower, remove punctuation
    cleaned = []
    for s in list_of_strings:
        if not isinstance(s, str):
            continue
        # remove punctuation
        s2 = re.sub(r'[^a-zA-Z0-9\s]', '', s)
        # lowercase, strip
        s2 = s2.lower().strip()
        # split to tokens and remove stopwords
        tokens = [tok for tok in s2.split() if tok not in stop_words]
        # join tokens to single token to preserve multi-word as single token
        if tokens:
            cleaned.append(''.join(tokens))
    return " ".join(cleaned)

# For overview, do a mild cleaning (keep words)
def clean_overview(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    text = text.lower()
    tokens = [tok for tok in text.split() if tok not in stop_words]
    return " ".join(tokens)

# Create tags column: genres + keywords + cast + director + overview
def create_tags(row):
    parts = []
    # genres_parsed: list of genre names
    parts.extend(row['genres_parsed'] if isinstance(row['genres_parsed'], list) else [])
    # keywords_parsed
    parts.extend(row['keywords_parsed'] if isinstance(row['keywords_parsed'], list) else [])
    # cast_parsed (top actors)
    parts.extend(row['cast_parsed'] if isinstance(row['cast_parsed'], list) else [])
    # director
    if row['director']:
        parts.append(row['director'])
    # overview (just raw cleaned words)
    overview = clean_overview(row['overview'])
    # Now clean and join token lists
    tag_text = clean_and_join(parts)
    if overview:
        # include overview too (space separated tokens)
        tag_text = tag_text + " " + overview
    return tag_text

# Compute tags (may take some time)
df['tags'] = df.progress_apply(create_tags, axis=1)

# Basic stats
print("Tags sample:")
print(df[['title','tags']].head(5).T)

100%|██████████| 45539/45539 [00:03<00:00, 12670.08it/s]

Tags sample:
                                                       0  \
title                                          Toy Story   
tags   animation comedy family tomhanks timallen rick...   

                                                       1  \
title                                            Jumanji   
tags   adventure fantasy family robinwilliams jonatha...   

                                                       2  \
title                                   Grumpier Old Men   
tags   romance comedy waltermatthau jacklemmon annmar...   

                                                       3  \
title                                  Waiting to Exhale   
tags   comedy drama romance whitneyhouston angelabass...   

                                                       4  
title                        Father of the Bride Part II  
tags   comedy stevemartin dianekeaton martinshort cha...  





### VECTORIZE & COMPUTE SIMILARITY

In [11]:
# Limit to smaller subset (for memory safety)
df_small = df.head(5000).copy()  # Adjust this depending on your RAM

cv = CountVectorizer(max_features=5000, stop_words='english')
tags_matrix = cv.fit_transform(df_small['tags'].fillna(''))

print("Tags matrix shape:", tags_matrix.shape)

# Use sparse output to save RAM
cosine_sim = cosine_similarity(tags_matrix, tags_matrix, dense_output=False)
print("Cosine similarity computed (sparse), shape:", cosine_sim.shape)


Tags matrix shape: (5000, 5000)
Cosine similarity computed (sparse), shape: (5000, 5000)


### HELPER: MAP TITLE -> INDEX

In [13]:
# Some titles are duplicated; we'll keep the first occurrence mapping.
df = df.reset_index(drop=True)
title_to_idx = pd.Series(df.index, index=df['title'].str.lower()).drop_duplicates()

def get_index_by_title(title):
    """
    Return index for a title (case-insensitive). If exact title not found, try fuzzy match later.
    """
    title_lower = title.lower()
    if title_lower in title_to_idx:
        return int(title_to_idx[title_lower])
    else:
        return None

### RECOMMEND FUNCTION

In [14]:
def recommend(movie_title, n_recommendations=10):
    idx = get_index_by_title(movie_title)
    if idx is None:
        # try relaxed matching: substring search
        possible = df[df['title'].str.lower().str.contains(movie_title.lower(), na=False)]
        if len(possible) > 0:
            idx = int(possible.index[0])
            print(f"Note: exact match not found. Using close match: '{df.loc[idx, 'title']}'")
        else:
            raise ValueError(f"Movie title '{movie_title}' not found in database.")
    # get similarity scores for this movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    # sort by score descending, skip the movie itself
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # collect top n_recommendations (excluding itself)
    results = []
    count = 0
    for i, score in sim_scores:
        if i == idx:
            continue
        results.append((df.loc[i, 'title'], df.loc[i, 'id'], float(score)))
        count += 1
        if count >= n_recommendations:
            break
    return results


### USAGE EXAMPLES

In [15]:
print("\nExample recommendations for 'Toy Story':")
try:
    recs = recommend("Toy Story", n_recommendations=10)
    for title, mid, score in recs:
        print(f"{title} (id={mid}) — similarity={score:.3f}")
except Exception as e:
    print("Error:", e)


Example recommendations for 'Toy Story':


### Streamlit app

In [17]:
# streamlit run app.py