In [1]:
import os
import pandas as pd

BASE_DIR = os.path.dirname(os.path.abspath("__file__"))

def pick(*candidates):
    for rel in candidates:
        p = os.path.join(BASE_DIR, rel)
        if os.path.exists(p):
            return p
    raise FileNotFoundError(candidates)

# files
MOVIES_FILE       = pick("movies_metadata.csv", "data/movies_metadata.csv")
RATINGS_FILE      = pick("ratings.csv", "data/ratings.csv")
RATINGS_SMALL     = pick("ratings_small.csv", "data/ratings_small.csv")
CREDITS_FILE      = pick("credits.csv", "data/credits.csv")
KEYWORDS_FILE     = pick("keywords.csv", "data/keywords.csv")
LINKS_FILE        = pick("links.csv", "data/links.csv")
LINKS_SMALL       = pick("links_small.csv", "data/links_small.csv")

# load data
movies        = pd.read_csv(MOVIES_FILE, low_memory=False)
ratings       = pd.read_csv(RATINGS_FILE)
ratings_small = pd.read_csv(RATINGS_SMALL)
credits       = pd.read_csv(CREDITS_FILE)
keywords      = pd.read_csv(KEYWORDS_FILE)
links         = pd.read_csv(LINKS_FILE)
links_small   = pd.read_csv(LINKS_SMALL)

print("✅ Loaded:")
for name, df in [
    ("movies", movies), ("ratings", ratings), ("ratings_small", ratings_small),
    ("credits", credits), ("keywords", keywords),
    ("links", links), ("links_small", links_small),
]:
    print(f"{name:<14} {df.shape}")


✅ Loaded:
movies         (45466, 24)
ratings        (26024289, 4)
ratings_small  (100004, 4)
credits        (45476, 3)
keywords       (46419, 2)
links          (45843, 3)
links_small    (9125, 3)


In [2]:
import re, ast
import numpy as np

def safe_list_from_jsonlike(val, key="name"):
    if pd.isna(val): 
        return []
    if isinstance(val, list):
        return [d.get(key, "") for d in val if isinstance(d, dict)]
    if isinstance(val, str):
        try:
            data = ast.literal_eval(val)
            if isinstance(data, list):
                return [d.get(key, "") for d in data if isinstance(d, dict)]
        except Exception:
            pass
    return []

def normalize(s):
    s = (s or "").lower()
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

# fix IDs
movies["id"] = movies["id"].astype(str)
keywords["id"] = keywords["id"].astype(str)

# merge keywords
if "id" in keywords.columns and "keywords" in keywords.columns:
    movies = movies.merge(keywords[["id", "keywords"]], on="id", how="left")

# build lists
movies["genres_list"]   = movies["genres"].apply(safe_list_from_jsonlike) if "genres" in movies else [[]]*len(movies)
movies["keywords_list"] = movies["keywords"].apply(safe_list_from_jsonlike) if "keywords" in movies else [[]]*len(movies)

# build strings
movies["genres_str"]   = movies["genres_list"].apply(lambda xs: " ".join(x.replace(" ", "") for x in xs))
movies["keywords_str"] = movies["keywords_list"].apply(lambda xs: " ".join(x.replace(" ", "") for x in xs))
movies["overview_str"] = movies.get("overview", "").astype(str).map(normalize)

# final tags
movies["tags"] = (movies["genres_str"] + " " + movies["keywords_str"] + " " + movies["overview_str"]).str.strip()
movies["Tags"] = movies["tags"]

display(movies[["title", "genres_list", "keywords_list", "tags"]].head(3))


Unnamed: 0,title,genres_list,keywords_list,tags
0,Toy Story,"[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...",Animation Comedy Family jealousy toy boy frien...
1,Jumanji,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...",Adventure Fantasy Family boardgame disappearan...
2,Grumpier Old Men,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o...",Romance Comedy fishing bestfriend duringcredit...


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

SIMILARITY_PICKLE = "similarity.pkl"

if os.path.exists(SIMILARITY_PICKLE):
    with open(SIMILARITY_PICKLE, "rb") as f:
        similarity = pickle.load(f)
    print("📂 Loaded existing similarity.pkl:", similarity.shape)
else:
    cv = CountVectorizer(max_features=5000, stop_words="english")
    vectors = cv.fit_transform(movies["tags"])
    similarity = cosine_similarity(vectors).astype("float16")
    with open(SIMILARITY_PICKLE, "wb") as f:
        pickle.dump(similarity, f)
    print("💾 Computed and saved similarity.pkl:", similarity.shape)

print("movies:", movies.shape, "| similarity:", similarity.shape)


📂 Loaded existing similarity.pkl: (10000, 10000)
movies: (46486, 32) | similarity: (10000, 10000)


In [4]:
def recommend(title, n=10, data=movies, sim=similarity):
    if "original_title" in data.columns:
        title_col = "original_title"
    elif "title" in data.columns:
        title_col = "title"
    else:
        raise ValueError("No title column found.")

    matches = data[data[title_col].str.lower() == title.lower()]
    if matches.empty:
        return f"❌ '{title}' not found."

    idx = matches.index[0]
    if idx >= sim.shape[0]:
        return f"⚠️ Movie index {idx} is out of range for similarity matrix of size {sim.shape[0]}."

    distances = list(enumerate(sim[idx]))
    distances = sorted(distances, key=lambda x: x[1], reverse=True)[1:n+1]

    rec_idxs = [i for i, _ in distances]
    cols = [c for c in [title_col, "release_date", "vote_average"] if c in data.columns]

    return data.loc[rec_idxs, cols].reset_index(drop=True)

# test
recommend("Avatar", 5)


'⚠️ Movie index 14581 is out of range for similarity matrix of size 10000.'

In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [9]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [14]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  10000 non-null  object 
 1   belongs_to_collection  1421 non-null   object 
 2   budget                 10000 non-null  object 
 3   genres                 10000 non-null  object 
 4   homepage               662 non-null    object 
 5   id                     10000 non-null  object 
 6   imdb_id                9999 non-null   object 
 7   original_language      10000 non-null  object 
 8   original_title         10000 non-null  object 
 9   overview               9971 non-null   object 
 10  popularity             10000 non-null  object 
 11  poster_path            9969 non-null   object 
 12  production_companies   10000 non-null  object 
 13  production_countries   10000 non-null  object 
 14  release_date           9995 non-null   object 
 15  rev

In [15]:
movies.isna().sum()

adult                       0
belongs_to_collection    8579
budget                      0
genres                      0
homepage                 9338
id                          0
imdb_id                     1
original_language           0
original_title              0
overview                   29
popularity                  0
poster_path                31
production_companies        0
production_countries        0
release_date                5
revenue                     0
runtime                     6
spoken_languages            0
status                      8
tagline                  3108
title                       0
video                       0
vote_average                0
vote_count                  0
genres_str                  0
dtype: int64

In [16]:
movies = movies.iloc[movies['overview'].dropna().index]


In [17]:
movies = movies.fillna(' ')

  movies = movies.fillna(' ')


In [18]:
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')


In [19]:
movies = movies.sort_values(by=['release_date'], ascending=False)

In [20]:
movies = movies.fillna(' ')
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')
movies = movies.sort_values(by=['release_date'], ascending=False)


In [21]:
movies.dropna(inplace=True)

In [24]:
movies.reset_index(inplace=True)

In [26]:
def lemmatize_sentence(sentence):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(sentence)
    pos_tags = pos_tag(tokens)

    def get_wordnet_pos(tag):
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag[0].upper(), wordnet.NOUN)

    return " ".join([lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags])


In [27]:
print(movies.columns)

Index(['index', 'adult', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'genres_list', 'genres_str', 'Tags'],
      dtype='object')


In [28]:
movies.columns = movies.columns.str.lower().str.strip()
print(movies.columns)


Index(['index', 'adult', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'genres_list', 'genres_str', 'tags'],
      dtype='object')


In [29]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
movies['tags_'] = movies['overview'].astype(str).fillna("").apply(lemmatize_sentence)

In [32]:
# Step 1: Import everything you need
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 2: Vectorize your movie tags column
vectorizer = CountVectorizer(max_features=5000, stop_words='english')
vectors = vectorizer.fit_transform(movies['tags']).toarray()  # now 'vectors' exists

# Step 3: Build similarity matrix
similarity = cosine_similarity(vectors)

print("Vectors shape:", vectors.shape)
print("Similarity shape:", similarity.shape)


Vectors shape: (10000, 5000)
Similarity shape: (10000, 10000)


In [33]:
similarity = cosine_similarity(vectors) 

In [34]:
similarity.shape

(10000, 10000)

In [35]:
def recommend(title, n=10, data=movies, sim=similarity):
    """
    Recommend top-n similar movies given a title.
    """
    # choose title column
    if "original_title" in data.columns:
        title_col = "original_title"
    elif "title" in data.columns:
        title_col = "title"
    else:
        raise ValueError("No title column found.")

    # case-insensitive search
    matches = data[data[title_col].str.lower() == title.lower()]
    if matches.empty:
        return f"❌ '{title}' not found."

    idx = matches.index[0]

    # get distances
    distances = list(enumerate(sim[idx]))
    distances = sorted(distances, key=lambda x: x[1], reverse=True)[1 : n + 1]

    rec_idxs = [i for i, _ in distances]
    cols = [c for c in [title_col, "release_date", "vote_average"] if c in data.columns]

    return data.loc[rec_idxs, cols].reset_index(drop=True)

# 🔍 example
recommend("Avatar", 5)


"❌ 'Avatar' not found."

In [36]:
movies['original_title'][:100].values


array(['Avatar 2', 'The Other Side of the Wind', 'Bad Boys for Life',
       'Mary Shelley', 'Mobile Homes', 'Iron Sky: The Coming Race',
       'Sly Cooper', "The King's Daughter",
       'How to Talk to Girls at Parties', 'Pitch Perfect 3', 'Machines',
       'Sweet Virginia', 'Justice League', '78/52',
       'Call Me by Your Name', 'Thor: Ragnarok', 'Dina', 'Beyond Skyline',
       'Resurrecting Hassan', 'Leatherface', 'Porto', "God's Own Country",
       'Science Fiction Volume One: The Osiris Child',
       'The Trip to Spain', 'LEGO DC Super Hero Girls: Brain Drain',
       'Patti Cake$', 'What Happened to Monday', 'Ingrid Goes West',
       'Good Time', 'The Glass Castle', 'A Gray State', 'Kidnap',
       'Chronically Metropolitan', 'Columbus', 'The Dark Tower',
       'London Town', 'Wind River', 'Hostages',
       'Cop and a Half: New Recruit', 'S.W.A.T.: Under Siege',
       '東京喰種 トーキョーグール', 'Detroit', 'The Emoji Movie',
       'An Inconvenient Sequel: Truth to Power',
     

In [37]:
def recommendation(title, data, similarity=similarity):
    try:
        # find index of the movie
        movie_index = data[data['original_title'].str.lower() == title.lower()].index[0]
    except IndexError:
        return "Movie not currently in the database"
    
    # get similarity scores
    distances = list(enumerate(similarity[movie_index]))
    distances = sorted(distances, key=lambda x: x[1], reverse=True)[1:6]
    
    # return top 5 similar movies
    return data['original_title'].iloc[[i[0] for i in distances]].tolist()


In [38]:
recommendation("Jumanji", data=movies)

'Movie not currently in the database'

In [39]:
recommendation("Copycat", data=movies)

'Movie not currently in the database'

In [40]:
movies.to_csv('data.csv', index=False)

In [41]:
import pickle
pickle.dump(similarity, open('similarity.pkl','wb'))

In [42]:
import nltk
nltk.data.path.append(r"C:\Users\kazia\AppData\Roaming\nltk_data")



In [43]:
import os
os.listdir("data")



['.ipynb_checkpoints',
 'credits.csv',
 'keywords.csv',
 'links.csv',
 'links_small.csv',
 'movies_metadata.csv',
 'ratings.csv',
 'ratings_small.csv']

In [44]:
import pandas as pd

# Load the available CSV files
movies = pd.read_csv("data/movies_metadata.csv", low_memory=False)
credits = pd.read_csv("data/credits.csv")
keywords = pd.read_csv("data/keywords.csv")
ratings = pd.read_csv("data/ratings.csv")

print(movies.shape, credits.shape, keywords.shape, ratings.shape)

(45466, 24) (45476, 3) (46419, 2) (26024289, 4)


In [45]:
import pandas as pd

# load the movie metadata
movies = pd.read_csv("data/movies_metadata.csv", low_memory=False)
print(movies.shape)
movies.head()


(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [46]:
ratings = pd.read_csv("data/ratings.csv")
print(ratings.shape)
ratings.head()


(26024289, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [47]:
keywords = pd.read_csv("data/keywords.csv")
print(keywords.shape)
keywords.head()


(46419, 2)


Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [48]:
import os
os.listdir("data")


['.ipynb_checkpoints',
 'credits.csv',
 'keywords.csv',
 'links.csv',
 'links_small.csv',
 'movies_metadata.csv',
 'ratings.csv',
 'ratings_small.csv']

In [49]:
import pandas as pd

# Load metadata files
movies = pd.read_csv("data/movies_metadata.csv", low_memory=False)
credits = pd.read_csv("data/credits.csv")
keywords = pd.read_csv("data/keywords.csv")
links = pd.read_csv("data/links.csv")

# Load ratings
ratings = pd.read_csv("data/ratings.csv")

print("Movies:", movies.shape)
print("Credits:", credits.shape)
print("Keywords:", keywords.shape)
print("Ratings:", ratings.shape)
print("Links:", links.shape)


Movies: (45466, 24)
Credits: (45476, 3)
Keywords: (46419, 2)
Ratings: (26024289, 4)
Links: (45843, 3)


In [50]:
import os
print(os.getcwd())


C:\Users\kazia\Downloads\Netflix-movie-recommender


In [51]:
import pandas as pd
df = pd.read_csv("movies_metadata.csv", low_memory=False)
print(df.shape)




(45466, 24)


In [52]:
import re
import numpy as np
import ast

def safe_list_from_jsonlike(val, key="name"):
    """Parse strings like "[{'id': 28, 'name': 'Action'}, ...]" → ['Action', ...]."""
    if pd.isna(val): 
        return []
    if isinstance(val, list):
        return [d.get(key, "") for d in val if isinstance(d, dict)]
    if isinstance(val, str):
        try:
            data = ast.literal_eval(val)
            if isinstance(data, list):
                return [d.get(key, "") for d in data if isinstance(d, dict)]
        except Exception:
            pass
    return []

def normalize(s):
    s = (s or "").lower()
    s = re.sub(r"[^a-z0-9]+", " ", s)
    return re.sub(r"\s+", " ", s).strip()

# --- FIX: make IDs same type ---
movies["id"] = movies["id"].astype(str)
keywords["id"] = keywords["id"].astype(str)

# merge keywords into movies
if "id" in keywords.columns and "keywords" in keywords.columns:
    movies = movies.merge(keywords[["id", "keywords"]], on="id", how="left")

# build lists
movies["genres_list"]   = movies["genres"].apply(safe_list_from_jsonlike) if "genres" in movies else [[]]*len(movies)
movies["keywords_list"] = movies["keywords"].apply(safe_list_from_jsonlike) if "keywords" in movies else [[]]*len(movies)

# make strings
movies["genres_str"]   = movies["genres_list"].apply(lambda xs: " ".join(x.replace(" ", "") for x in xs))
movies["keywords_str"] = movies["keywords_list"].apply(lambda xs: " ".join(x.replace(" ", "") for x in xs))
movies["overview_str"] = movies.get("overview", "").astype(str).map(normalize)

# final text field
movies["tags"] = (movies["genres_str"] + " " + movies["keywords_str"] + " " + movies["overview_str"]).str.strip()
movies["Tags"] = movies["tags"]  # keep old cells that expect 'Tags'

# sanity peek
display(movies[["title", "genres_list", "keywords_list", "tags"]].head(3))


Unnamed: 0,title,genres_list,keywords_list,tags
0,Toy Story,"[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva...",Animation Comedy Family jealousy toy boy frien...
1,Jumanji,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...",Adventure Fantasy Family boardgame disappearan...
2,Grumpier Old Men,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o...",Romance Comedy fishing bestfriend duringcredit...


In [53]:
links_file = os.path.join(BASE_DIR, "links.csv")
links = pd.read_csv(links_file)

print("Links:", links.shape)
print(links.head())


Links: (45843, 3)
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [54]:
movies = pd.read_csv(
    r"C:\Users\kazia\Downloads\Netflix-movie-recommender\movies_metadata.csv",
    low_memory=False
)
ratings = pd.read_csv(
    r"C:\Users\kazia\Downloads\Netflix-movie-recommender\ratings_small.csv"
)


In [None]:
# 🔍 Final sanity check
try:
    print("Movies dataframe shape:", movies.shape)
    print("Similarity matrix shape:", similarity.shape)

    if movies.shape[0] != similarity.shape[0]:
        print("⚠️ WARNING: movies and similarity sizes do not match!")
    else:
        print("✅ movies and similarity are aligned.")
except NameError as e:
    print("❌ You need to run the data loading and similarity cells first!", e)


In [3]:
import pickle
import os

# Save similarity matrix to file for Streamlit app
output_file = os.path.join(os.getcwd(), "similarity.pkl")

with open(output_file, "wb") as f:
    pickle.dump(similarity, f)

print(f"✅ similarity.pkl saved at: {output_file}")
print(f"Shape: {similarity.shape}")


NameError: name 'similarity' is not defined

In [None]:

# 🔹 Compute Similarity Matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Example: using 'soup' column if available, else combine keywords/genres
if "soup" in movies.columns:
    cv = CountVectorizer(stop_words='english')
    count_matrix = cv.fit_transform(movies['soup'].fillna(""))
    similarity = cosine_similarity(count_matrix, count_matrix)
else:
    print("⚠️ No 'soup' column found. Please build features before similarity.")


In [None]:

# 🔹 Optionally save similarity.pkl (not required for running)
import pickle, os
output_file = os.path.join(os.getcwd(), "similarity.pkl")
with open(output_file, "wb") as f:
    pickle.dump(similarity, f)
print(f"✅ similarity.pkl saved at: {output_file}")


In [None]:

# 🔹 Final sanity check
print("Movies dataframe shape:", movies.shape)
print("Similarity matrix shape:", similarity.shape)
if movies.shape[0] != similarity.shape[0]:
    print("⚠️ WARNING: movies and similarity sizes do not match!")
else:
    print("✅ Movies and similarity are aligned.")
