### Data Loading and Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load all necessary datasets
movies_df = pd.read_csv('../data/NewMoviesMetadata.csv', low_memory=False)
credits_df = pd.read_csv('../data/NewCredits.csv')
keywords_df = pd.read_csv('../data/newKeywords.csv')
links_df = pd.read_csv('../data/links.csv')

# Clean and convert movie IDs to a numeric type, dropping invalid rows
movies_df['id'] = pd.to_numeric(movies_df['id'], errors='coerce')
movies_df.dropna(subset=['id'], inplace=True)
movies_df['id'] = movies_df['id'].astype(int)

# Ensure other IDs are integers
credits_df['id'] = credits_df['id'].astype(int)
keywords_df['id'] = keywords_df['id'].astype(int)

# Merge into a single dataframe
master_df = movies_df.merge(credits_df, on='id')
master_df = master_df.merge(keywords_df, on='id')

# Filter to include only movies with a valid TMDB link
valid_tmdb_ids = links_df[links_df['tmdbId'].notnull()]['tmdbId'].astype('int')
master_df = master_df[master_df['id'].isin(valid_tmdb_ids)].copy()

print(f"Data loaded and merged. Final shape: {master_df.shape}")

Data loaded and merged. Final shape: (45453, 41)


### Calculating a Hybrid Quality Score
To rank our recommendations, we'll create a hybrid score that combines a movie's popularity with its weighted rating. The weighted rating uses the IMDb formula to balance the average vote with the number of votes, preventing movies with few high ratings from dominating.

In [4]:
# Calculate the components for the IMDb weighted rating
vote_averages = master_df['vote_average']
vote_counts = master_df['vote_count']
mean_vote_all_movies = master_df['vote_average'].mean()
min_votes_for_chart = master_df['vote_count'].quantile(0.90)

# Calculate the weighted rating
master_df['weighted_rating'] = (
    (vote_averages * vote_counts + mean_vote_all_movies * min_votes_for_chart) /
    (vote_counts + min_votes_for_chart)
)

# Scale popularity and weighted_rating to be between 0 and 1
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(master_df[['popularity', 'weighted_rating']])
scaled_df = pd.DataFrame(scaled_features, columns=['popularity_scaled', 'rating_scaled'], index=master_df.index)

# Create the final hybrid score (60% popularity, 40% rating)
master_df['score'] = scaled_df['popularity_scaled'] * 0.6 + scaled_df['rating_scaled'] * 0.4

In [7]:
# Load the pre-engineered text features
content_features_df = pd.read_csv('../data/MovieBasedRecommenderData.csv')
content_features_df.head()

Unnamed: 0,id,title,model_feature
0,862,Toy Story,jealousi toy boy friendship friend rivalri boy...
1,8844,Jumanji,boardgam disappear basedonchildren'sbook newho...
2,15602,Grumpier Old Men,fish bestfriend duringcreditssting oldmen walt...
3,31357,Waiting to Exhale,basedonnovel interracialrelationship singlemot...
4,11862,Father of the Bride Part II,babi midlifecrisi confid age daughter motherda...


In [8]:

# Combine the scores and content features into a final dataframe
recommender_df = master_df[['id', 'title', 'score']].merge(content_features_df, on='id', suffixes=(None, '_y'))
recommender_df.drop('title_y', axis=1, inplace=True) # Drop redundant title column

print("Final recommender dataframe created:")
recommender_df.head()

Final recommender dataframe created:


Unnamed: 0,id,title,score,model_feature
0,862,Toy Story,0.35776,jealousi toy boy friendship friend rivalri boy...
1,8844,Jumanji,0.284882,boardgam disappear basedonchildren'sbook newho...
2,15602,Grumpier Old Men,0.206639,fish bestfriend duringcreditssting oldmen walt...
3,31357,Waiting to Exhale,0.178489,basedonnovel interracialrelationship singlemot...
4,11862,Father of the Bride Part II,0.180003,babi midlifecrisi confid age daughter motherda...


In [9]:
# Fill any missing feature text with an empty string
recommender_df['model_feature'] = recommender_df['model_feature'].fillna('')

# Initialize and fit the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    stop_words='english',
    min_df=0.0 # Using original value, consider increasing to 3-5
)
tfidf_matrix = tfidf_vectorizer.fit_transform(recommender_df['model_feature'])

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

TF-IDF matrix shape: (45525, 450099)


In [10]:
# Create a mapping from movie title to dataframe index for quick lookups
title_to_index = pd.Series(recommender_df.index, index=recommender_df['title'])

def get_recommendations(movie_title, similarity_weight=0.8, top_n=10):
    """
    Generates movie recommendations based on content similarity and quality score.
    """
    try:
        # Get the index of the movie that matches the title
        movie_idx = title_to_index[movie_title]
    except KeyError:
        return f"Movie with title '{movie_title}' not found."

    # Calculate cosine similarity for the selected movie against all others
    cosine_sim = cosine_similarity(tfidf_matrix[movie_idx], tfidf_matrix)
    
    # Create a dataframe of similarity scores
    sim_scores_df = pd.DataFrame(cosine_sim[0], columns=['similarity'])
    
    # Combine with the main recommender dataframe
    final_df = pd.concat([recommender_df.reset_index(), sim_scores_df], axis=1)
    
    # Calculate the final hybrid score
    final_df['final_score'] = (
        final_df['score'] * (1 - similarity_weight) +
        final_df['similarity'] * similarity_weight
    )
    
    # Sort by the final score and remove the input movie itself
    recommendations = (
        final_df
        .sort_values(by='final_score', ascending=False)
        .drop(movie_idx) # Drop the movie itself from recommendations
        .head(top_n)
    )
    
    return recommendations[['title', 'score', 'similarity', 'final_score']]

In [11]:
# Get recommendations for 'The Dark Knight'
get_recommendations('The Dark Knight', similarity_weight=0.8, top_n=10)

Unnamed: 0,title,score,similarity,final_score
18293,The Dark Knight Rises,0.350183,0.294034,0.305263
10153,Batman Begins,0.350176,0.250469,0.27041
15549,Batman: Under the Red Hood,0.295955,0.174281,0.198616
1336,Batman Returns,0.25762,0.176985,0.193112
33413,Wonder Woman,0.616032,0.061767,0.17262
30754,Minions,0.829545,0.0,0.165909
11388,The Prestige,0.375166,0.107938,0.161384
26610,Guardians of the Galaxy Vol. 2,0.528302,0.04529,0.141893
26608,Deadpool,0.517795,0.044307,0.139005
26611,Captain America: Civil War,0.446566,0.058658,0.13624


In [14]:
# Get recommendations for 'Superman'
get_recommendations('Jumanji', similarity_weight=0.8, top_n=10)

Unnamed: 0,title,score,similarity,final_score
30754,Minions,0.829545,0.011268,0.174923
33413,Wonder Woman,0.616032,0.020407,0.139532
30304,The Games Maker,0.163558,0.12596,0.13348
14320,Where the Wild Things Are,0.231287,0.108946,0.133414
24714,Tinker Bell and the Lost Treasure,0.216433,0.107788,0.129517
42281,Beauty and the Beast,0.576636,0.009781,0.123152
24500,Big Hero 6,0.57676,0.008135,0.12186
11155,Night at the Museum,0.246599,0.089095,0.120596
23720,Gone Girl,0.519871,0.019702,0.119736
18709,The Lorax,0.224416,0.091637,0.118192
