In [5]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer
import json

# Download nltk data (just in case)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print("1. Loading Data...")
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# --- STEP 1: MERGE & CLEAN (Matching the Notebook Logic) ---

print("2. Merging Datasets...")
# The notebook merges on 'title'
movies = movies.merge(credits, on='title')

# Selecting the exact columns used in the notebook
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

print("3. Pre-processing columns...")

# Helper function to convert stringified lists to python lists
def convert(obj):
    L = []
    # ast.literal_eval is safer/better than json.loads for this dataset
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

# Apply transformations
movies.dropna(inplace=True) # Drop missing values
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert3)
movies['crew'] = movies['crew'].apply(fetch_director)

# Convert overview from string to list
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# --- STEP 2: COLLAPSE & TAGS ---

print("4. Collapsing spaces (Sam Worthington -> SamWorthington)...")
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

# Create 'tags' column by concatenating everything
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create new dataframe
new_df = movies[['movie_id', 'title', 'tags']]

# Convert tags list back to string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

# --- STEP 3: STEMMING & VECTORIZATION ---

print("5. Applying Stemming (PorterStemmer)...")
ps = PorterStemmer()

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

new_df['tags'] = new_df['tags'].apply(stem)

print("6. Vectorizing (Bag of Words - 5000 features)...")
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

print("7. Calculating Cosine Similarity...")
similarity = cosine_similarity(vectors)

# --- STEP 4: GENERATE JSON OUTPUT ---

print("8. Creating JSON Database...")
recommendation_db = {}

# Deduplicate movies by title to prevent errors
new_df = new_df.drop_duplicates(subset=['title'])
new_df = new_df.reset_index(drop=True)

for index, row in new_df.iterrows():
    try:
        title = row['title']

        # Find similar movies
        distances = similarity[index]
        movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:10] # Top 9 recommendations

        # Extract titles
        recs = []
        for i in movies_list:
            # i[0] is the index in the dataframe
            recommended_title = new_df.iloc[i[0]].title
            recs.append(recommended_title)

        recommendation_db[title] = recs
    except Exception as e:
        print(f"Error processing {row['title']}: {e}")
        continue

# Save to file
output_file = 'movie_recs_campusx.json'
with open(output_file, 'w') as f:
    json.dump(recommendation_db, f)

print(f"SUCCESS! Model logic applied. '{output_file}' has been created.")
print("Place this file in the 'public' folder of your React app.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


1. Loading Data...
2. Merging Datasets...
3. Pre-processing columns...
4. Collapsing spaces (Sam Worthington -> SamWorthington)...
5. Applying Stemming (PorterStemmer)...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


6. Vectorizing (Bag of Words - 5000 features)...
7. Calculating Cosine Similarity...
8. Creating JSON Database...
Error processing The Pink Panther: single positional indexer is out-of-bounds
Error processing Hollywood Homicide: single positional indexer is out-of-bounds
Error processing I, Frankenstein: single positional indexer is out-of-bounds
Error processing Duplex: single positional indexer is out-of-bounds
Error processing Jersey Girl: single positional indexer is out-of-bounds
Error processing The Three Stooges: single positional indexer is out-of-bounds
Error processing There Will Be Blood: single positional indexer is out-of-bounds
Error processing The Mighty Ducks: single positional indexer is out-of-bounds
Error processing Sinister 2: single positional indexer is out-of-bounds
Error processing Partition: single positional indexer is out-of-bounds
Error processing Zipper: single positional indexer is out-of-bounds
Error processing Network: single positional indexer is out-of

In [None]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB
