In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

In [2]:
df_movies = pd.read_csv(r'C:\Users\jaraneses\OneDrive - 2X LLC\Codes\Portfolio Projects_storage\Day 27 - Movie Recommendation System\movies.csv')
df = df_movies.copy()

In [3]:
display(df.head())

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [5]:
# fill the missing values with '' in target_columns

def data_prep(df):

    target_columns = ['genres', 'keywords', 'overview', 'cast', 'director', 'production_companies']
        
    for col in target_columns:
        df[col] = df[col].fillna('')
    return df

df = data_prep(df)

In [6]:
target_columns = ['genres', 'keywords', 'overview', 'cast', 'director', 'production_companies']
df[target_columns].isnull().sum()

genres                  0
keywords                0
overview                0
cast                    0
director                0
production_companies    0
dtype: int64

In [7]:
# convert lists to actual strings eg. [{'id': 28, 'name': 'Action'}]

def clean_list(df):
    df['combined_features'] = (
        df['genres'] + ' ' +
        df['keywords'] + ' ' +
        df['overview'] + ' ' +
        df['cast'] + ' ' +
        df['director'] + ' ' +
        df['production_companies']
    )
    # there might be extra spaces
    df['combined_features'] = df['combined_features'].str.replace('  ', ' ').str.strip()

    return df
df = clean_list(df)

In [8]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combined_features
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,Action Adventure Fantasy Science Fiction cultu...
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,Adventure Fantasy Action ocean drug abuse exot...
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,Action Adventure Crime spy based on novel secr...
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan,Action Crime Drama Thriller dc comics crime fi...
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton,Action Adventure Science Fiction based on nove...


In [9]:
# Convert text to numbers and calculate movie similarities

def content_similarity(df):

    tfidf = TfidfVectorizer(stop_words= 'english', max_features = 5000)
    
    # convert text to number

    tfidf_matrix = tfidf.fit_transform(df['combined_features'])

    # calculate cosine similarity between all movies
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    return cosine_sim

cosine_sim = content_similarity(df)

In [10]:
df['popularity'].describe()

count    4803.000000
mean       21.492301
std        31.816650
min         0.000000
25%         4.668070
50%        12.921594
75%        28.313505
max       875.581305
Name: popularity, dtype: float64

In [11]:
# Create popularity scores for hybrid recommendations

def popularity (df):

    # Normalize popularity score and rating to 0-1 scale
    df['popularity_norm'] = (df['popularity'] - df['popularity'].min())/ (df['popularity'].max() - df['popularity'].min())
    df['rating_norm'] = df['vote_average'] / 10

    # handle zero division
    if df['vote_count'].max() > df['vote_count'].min():
        df['vote_count_norm'] = (df['vote_count'] - df['vote_count'].min()) / (df['vote_count'].max() - df['vote_count'].min())
    else:
        df['vote_count_norm'] = 0.5

    # Create a weighted popularity score
    df['popularity_score'] = (
        df['popularity_norm'] * 0.4 +
        df['rating_norm'] * 0.4 +
        df['vote_count_norm'] * 0.2
    )

    return df

df = popularity(df)

In [12]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,vote_average,vote_count,cast,crew,director,combined_features,popularity_norm,rating_norm,vote_count_norm,popularity_score
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,Action Adventure Fantasy Science Fiction cultu...,0.171815,0.72,0.858057,0.528337
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,Adventure Fantasy Action ocean drug abuse exot...,0.158846,0.69,0.327225,0.404983
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,Action Adventure Crime spy based on novel secr...,0.122635,0.63,0.324753,0.366004
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan,Action Crime Drama Thriller dc comics crime fi...,0.128272,0.76,0.662158,0.487741
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton,Action Adventure Science Fiction based on nove...,0.050169,0.61,0.15445,0.294958


In [13]:
def movie_indices (df):
    """ Create mappings of movies to indices"""

    indices = pd.Series(df.index, index = df['title']).drop_duplicates()
    print(f'Create indices for {len(indices)} movies')

    print("Sample mappings:")
    sample_titles = list(indices.index[:3])
    for title in sample_titles:
        print(f"  '{title}' -> Index {indices[title]}")

    return indices 

indices = movie_indices(df)

Create indices for 4803 movies
Sample mappings:
  'Avatar' -> Index 0
  'Pirates of the Caribbean: At World's End' -> Index 1
  'Spectre' -> Index 2


In [None]:
# create the recommendation system

def recommendation_system (title, cosine_sim, indices, df, n_recommendations = 10, content_weight = 0.7, popularity_weight = 0.3):
    """Hybrid reco = content similarity + popularity"""

    print(f'Getting recommendations for {title}')

    # Check if the movie exists

    if title not in indices:
        print("Movie can't be found")

        similar_titles = [t for t in indices.index if title.lower() in t.lower()]
        if similar_titles:
            print("You mean?:")
            for t in similar_titles[:5]:
                print(f'  - {t}')
            return None
    
    # Get the movie index
    idx = indices[title]
    print(f'Found {title} at index {idx}')

    # Get the similarity scores of the movie
    content_scores = list(enumerate(cosine_sim[idx]))

    # Calculate the hybrid score
    recommendations = []

    for i, content_score in content_scores:
        if i == idx: # Skip the movie itself
            continue

        movie_data = df.iloc[i]
        pop_score = movie_data['popularity_score']
        
        # Combine content similarity and popularity
        hybrid_score = (content_score * content_weight) + (pop_score * popularity_weight)

        recommendations.append({
            'index' : i,
            'title': movie_data['title'],
            'genres': movie_data['genres'],
            'vote_average': movie_data['vote_average'],
            'popularity': movie_data['popularity'],
            'director': movie_data['director'],
            'content_similarity': content_score,
            'popularity_score': pop_score,
            'hybrid_score': hybrid_score
        })

    recommendations.sort(key = lambda x: x['hybrid_score'], reverse= True)
    top_recommendations = recommendations[:n_recommendations]

    return top_recommendations



In [None]:
def test_recommendation_system():
    test_movies = ["The Dark Knight"]
    
    for movie in test_movies:
        print(f"RECOMMENDATIONS FOR: {movie}")
        print("-" * 50)
        
        recommendations = recommendation_system(
            movie, 
            cosine_sim, 
            indices, 
            df,
            n_recommendations=5,
            content_weight=0.7,
            popularity_weight=0.3
        )
        
        if recommendations:
            for i, rec in enumerate(recommendations, 1):
                print(f"{i}. {rec['title']}")
                print(f"   ⭐ Rating: {rec['vote_average']}/10")
                print(f"   🎭 Genres: {rec['genres'][:50]}...")
                print(f"   📈 Popularity: {rec['popularity']:.1f}")
                print(f"   🎬 Director: {rec['director']}")
                print(f"   🔍 Match Score: {rec['hybrid_score']:.3f}")
                print()
        else:
            print("No recommendations found")

# Run the test
test_recommendation_system()

RECOMMENDATIONS FOR: The Dark Knight
--------------------------------------------------
Getting recommendations for The Dark Knight
Found The Dark Knight at index 65
1. Batman Begins
   ⭐ Rating: 7.5/10
   🎭 Genres: Action Crime Drama...
   📈 Popularity: 115.0
   🎬 Director: Christopher Nolan
   🔍 Match Score: 0.535

2. The Dark Knight Rises
   ⭐ Rating: 7.6/10
   🎭 Genres: Action Crime Drama Thriller...
   📈 Popularity: 112.3
   🎬 Director: Christopher Nolan
   🔍 Match Score: 0.518

3. Batman: The Dark Knight Returns, Part 2
   ⭐ Rating: 7.9/10
   🎭 Genres: Action Animation...
   📈 Popularity: 25.9
   🎬 Director: Jay Oliva
   🔍 Match Score: 0.424

4. Suicide Squad
   ⭐ Rating: 5.9/10
   🎭 Genres: Action Adventure Crime Fantasy Science Fiction...
   📈 Popularity: 90.2
   🎬 Director: David Ayer
   🔍 Match Score: 0.382

5. Batman v Superman: Dawn of Justice
   ⭐ Rating: 5.7/10
   🎭 Genres: Action Adventure Fantasy...
   📈 Popularity: 155.8
   🎬 Director: Zack Snyder
   🔍 Match Score: 0.3