### Setup and Imports

In [9]:
import pandas as pd
import numpy as np
from nltk.stem.snowball import SnowballStemmer

In [10]:
# Read the three source tables from disk
movies_df = pd.read_csv('../Data/NewMoviesMetadata.csv', low_memory=False)
credits_df = pd.read_csv('../Data/NewCredits.csv')
keywords_df = pd.read_csv('../Data/NewKeywords.csv')

# Cast the join key to int in every table so the merges compare like with like
for frame in (credits_df, keywords_df, movies_df):
    frame['id'] = frame['id'].astype(int)

# Join credits, then keywords, onto the movie metadata using the shared 'id' key
# (default merge type, so only ids present in all three tables survive)
merged_df = movies_df.merge(credits_df, on='id').merge(keywords_df, on='id')

print("Data successfully loaded and merged.")
print(f"Shape of merged dataframe: {merged_df.shape}")

Data successfully loaded and merged.
Shape of merged dataframe: (45453, 41)


In [11]:
def get_director(row):
    """
    Return the name of the first crew member whose job is exactly 'Director'.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'job_crew' and 'name_crew'. Both are treated as
        ', '-separated strings whose items line up position by position
        (assumption: the n-th job belongs to the n-th name -- confirm against
        how the crew columns were flattened).

    Returns
    -------
    str or float
        The matching name, or np.nan when either field is missing, when no
        job equals 'Director', or when the names list is too short to hold
        the director's position.
    """
    job_field = row['job_crew']
    name_field = row['name_crew']

    # Nothing to parse if either side of the pairing is missing
    if pd.isna(job_field) or pd.isna(name_field):
        return np.nan

    jobs = job_field.split(', ')
    names = name_field.split(', ')

    # zip() stops at the shorter list, so a names list that is shorter than
    # the jobs list yields NaN instead of raising IndexError mid-apply.
    return next(
        (name for job, name in zip(jobs, names) if job == 'Director'),
        np.nan,
    )

# Build the 'director' column by running get_director over every row
merged_df['director'] = merged_df.apply(get_director, axis='columns')

### Processing Text Features
For the recommender, we'll combine several text-based features: genres, keywords, top cast members, and the director. To make these features useful, we need to clean and standardize them.

The steps are:

- Filter keywords: Keep only keywords that appear more than once to remove noise.

- Stem keywords: Reduce words to their root form (e.g., "jealousy" becomes "jealousi").

- Process all features: Convert all names and terms to lowercase and remove spaces to create unique tokens (e.g., "Tom Hanks" becomes "tomhanks").

- Combine features: Aggregate all processed text into a single string for each movie.

In [12]:
# --- Keyword Processing ---
# Flatten every movie's ', '-separated keyword string into one long Series
all_keywords = (
    merged_df['name_keywords']
    .dropna()
    .str.split(', ')
    .explode()
)
# Tally occurrences and keep the keywords seen more than once
keyword_counts = all_keywords.value_counts()
common_keywords = keyword_counts.loc[keyword_counts.gt(1)].index.tolist()

# Function to filter, stem, and clean keywords
stemmer = SnowballStemmer('english')

# Set view of common_keywords: process_keywords tests membership once per
# keyword per movie, and a set lookup avoids scanning the whole list each time.
common_keyword_set = set(common_keywords)

def process_keywords(text):
    """
    Turn one movie's ', '-separated keyword string into a list of tokens.

    Keywords not present in `common_keywords` are dropped. Each remaining
    keyword is passed through the Snowball stemmer, lowercased, and has its
    spaces removed so that it forms a single token.

    Parameters
    ----------
    text : str or NaN
        The raw keyword string for one movie.

    Returns
    -------
    list of str
        Processed tokens in their original order; [] when `text` is missing.
    """
    if pd.isna(text):
        return []
    return [
        stemmer.stem(kw).lower().replace(' ', '')
        for kw in text.split(', ')
        if kw in common_keyword_set
    ]

merged_df['keywords_processed'] = merged_df['name_keywords'].apply(process_keywords)


# --- Cast, Genre, and Director Processing ---
# Shared helper: tokenise a ', '-separated field and cap how many items are kept
def clean_and_limit(text, limit=3):
    """
    Split `text` on ', ', keep the first `limit` items, and normalise each one
    to lowercase with spaces removed.

    Returns [] when `text` is missing or is the literal string '[]'.
    """
    nothing_to_parse = pd.isna(text) or text == '[]'
    if nothing_to_parse:
        return []

    tokens = []
    for raw_item in text.split(', ')[:limit]:
        tokens.append(raw_item.lower().replace(' ', ''))
    return tokens

# Apply cleaning to other features
merged_df['cast_processed'] = merged_df['name_cast'].apply(clean_and_limit, limit=5) # Take top 5 actors
merged_df['genres_processed'] = merged_df['name_genres'].apply(clean_and_limit, limit=5)
# Detect a missing director with pd.isna on the raw value rather than comparing
# the stringified value to 'nan': the string round-trip would drop a director
# literally named 'nan' and would turn None/pd.NA into junk tokens.
merged_df['director_processed'] = merged_df['director'].apply(
    lambda name: [] if pd.isna(name) else [str(name).lower().replace(' ', '')]
)

In [13]:
# Merge the four processed token lists of a row into one space-separated string
def create_feature_soup(row):
    """
    Concatenate the row's keyword, cast, genre, and director token lists
    (in that order) and join the tokens with single spaces.
    """
    source_columns = (
        'keywords_processed',
        'cast_processed',
        'genres_processed',
        'director_processed',
    )
    tokens = []
    for column in source_columns:
        tokens.extend(row[column])
    return ' '.join(tokens)

# Row-wise: build the combined feature string for every movie
merged_df['model_feature'] = merged_df.apply(create_feature_soup, axis='columns')

print("Final 'model_feature' column created. Here's a sample:")
print(merged_df.loc[:, ['title', 'model_feature']].head())

Final 'model_feature' column created. Here's a sample:
                         title  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3            Waiting to Exhale   
4  Father of the Bride Part II   

                                       model_feature  
0  jealousi toy boy friendship friend rivalri boy...  
1  boardgam disappear basedonchildren'sbook newho...  
2  fish bestfriend duringcreditssting oldmen walt...  
3  basedonnovel interracialrelationship singlemot...  
4  babi midlifecrisi confid age daughter motherda...  


In [14]:
# Keep only the columns the recommender consumes and write them out as CSV
# NOTE(review): the inputs were read from '../Data/' but this writes under
# '../data/' -- on a case-sensitive filesystem those are different
# directories; confirm the lowercase path is intended.
recommender_data = merged_df.loc[:, ['id', 'title', 'model_feature']]
recommender_data.to_csv('../data/MovieBasedRecommenderData.csv', index=False)

print("\nRecommender data saved successfully.")


Recommender data saved successfully.
