<a href="https://colab.research.google.com/github/Felicia197/Movie-Recommendation-System/blob/main/Hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the movie metadata CSV and keep only the columns that are required
df = pd.read_csv(
    'Desktop/AIM Assignment/data/movies_metadata.csv/movies_metadata.csv',
    low_memory=False,
)[['title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count', 'id']]

# Preview the first rows
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,id
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,862
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,8844
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,15602
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,31357
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,11862


In [None]:
# Load the credits (cast/crew) file and the keywords file, in that order
cred_df, key_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Desktop/AIM Assignment/data/credits.csv/credits.csv',
        'Desktop/AIM Assignment/data/keywords.csv/keywords.csv',
    )
)

In [None]:
# Function to convert all non-integer IDs to NaN
def clean_ids(x):
    """Convert a raw ID to ``int``, or return NaN when it is not a valid integer.

    Args:
        x: Raw value from an ID column (for example a numeric string or a number).

    Returns:
        ``int(x)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return int(x)
    # Catch only conversion failures (non-numeric text, None/unsupported types,
    # NaN or infinite floats). A bare ``except`` would also swallow
    # KeyboardInterrupt and SystemExit.
    except (ValueError, TypeError, OverflowError):
        return np.nan

# Run every id through clean_ids; values that cannot be parsed become NaN
df['id'] = df['id'].map(clean_ids)

# Keep only the rows whose id is not null
df = df.loc[df['id'].notna()]
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,id
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,862.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,8844.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,15602.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,31357.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,11862.0


In [None]:
# Cast the id column of all three frames to integer before joining on it
df['id'] = df['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')

# Join credits first, then keywords, onto the main metadata frame using id
df = df.merge(cred_df, on='id').merge(key_df, on='id')

# Display the head of the merged frame
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,id,cast,crew,keywords
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,862,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,15602,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,31357,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,11862,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [None]:
# Parse the stringified columns back into native Python objects
from ast import literal_eval

features = ['genres', 'cast', 'crew', 'keywords']

for feature in features:
    # Each cell holds the text form of a Python literal; evaluate it safely
    df[feature] = df[feature].map(literal_eval)

In [None]:
# Extract the director's name. If no director is listed, return NaN
def get_director(x):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        x: Iterable of crew dicts, each with 'job' and 'name' keys.

    Returns:
        The matching member's 'name', or ``np.nan`` when nobody has that job.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    # next() stops at the first match; the default covers "no director found"
    return next(director_names, np.nan)

In [None]:
# Derive the director column from each movie's crew list
df['director'] = df['crew'].map(get_director)

In [None]:
# Returns the names of the first 3 elements, or of the entire list if it is shorter.
def generate_list(x):
    """Extract up to three 'name' values from a list of dicts.

    Args:
        x: A list of dicts that each carry a 'name' key, or any other value
            when the data is missing or malformed.

    Returns:
        A list with the 'name' of the first three elements (fewer if the list
        is shorter); an empty list when ``x`` is not a list.
    """
    # Missing/malformed data: anything that is not a list yields an empty list
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep at most the first three
    names = [ele['name'] for ele in x]
    return names[:3]

In [None]:
# Apply generate_list to genres, cast and keywords
for feature in ('genres', 'cast', 'keywords'):
    df[feature] = df[feature].apply(generate_list)

In [None]:
# Function to sanitize data to prevent ambiguity.
# Removes spaces and converts to lowercase
def sanitize(x):
    """Strip spaces and lowercase a string or every string in a list.

    Args:
        x: A list of strings, a single string, or any other value.

    Returns:
        A list of cleaned strings for a list, a cleaned string for a string,
        and an empty string for anything else.
    """
    if isinstance(x, list):
        return [i.replace(" ", "").lower() for i in x]

    if isinstance(x, str):
        return x.replace(" ", "").lower()

    # Neither list nor string (e.g. NaN for a missing director)
    return ''

In [None]:
# Apply the sanitize function to cast, director, genres and keywords
for feature in ('cast', 'director', 'genres', 'keywords'):
    df[feature] = df[feature].map(sanitize)

In [None]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,id,cast,crew,keywords,director
0,Toy Story,"[animation, comedy, family]",1995-10-30,81.0,7.7,5415.0,862,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy]",johnlasseter
1,Jumanji,"[adventure, fantasy, family]",1995-12-15,104.0,6.9,2413.0,8844,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgame, disappearance, basedonchildren'sbook]",joejohnston
2,Grumpier Old Men,"[romance, comedy]",1995-12-22,101.0,6.5,92.0,15602,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, bestfriend, duringcreditsstinger]",howarddeutch
3,Waiting to Exhale,"[comedy, drama, romance]",1995-12-22,127.0,6.1,34.0,31357,"[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[basedonnovel, interracialrelationship, single...",forestwhitaker
4,Father of the Bride Part II,[comedy],1995-02-10,106.0,5.7,173.0,11862,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlifecrisis, confidence]",charlesshyer


In [None]:
# Function that creates a soup out of the desired metadata
def create_soup(x):
    """Build the metadata "soup" string for one movie row.

    Args:
        x: Row-like mapping with list-valued 'keywords', 'cast' and 'genres'
            and a string-valued 'director'.

    Returns:
        Keywords, cast, director and genres (in that order) joined by single spaces.
    """
    pieces = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(pieces)

In [None]:
# Build the soup string for every row (create_soup receives one row at a time)
df['soup'] = df.apply(create_soup, axis='columns')

In [None]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Define a new CountVectorizer object and create count vectors for the soup.
# The default token_pattern treats punctuation as a separator, which would split
# sanitized names such as "ann-margret" or "basedonchildren'sbook" into pieces
# and undo the space-stripping done by sanitize. Tokenizing on whitespace keeps
# every sanitized name as a single token.
count = CountVectorizer(stop_words='english', token_pattern=r'(?u)\S+')
count_matrix = count.fit_transform(df['soup'])

In [None]:
# Import the cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

# Compute the pairwise cosine similarity between all movies.
# count_matrix holds raw, un-normalized token counts, so a plain dot product
# (linear_kernel) is not a cosine score and favours movies with more tokens.
# cosine_similarity normalizes each vector before taking the dot product.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Renumber the rows 0..n-1 so that row positions in df can be used to index cosine_sim.
# drop=True discards the old index instead of inserting it as an extra 'index'
# column; without it, re-running this cell fails because that column already exists.
df = df.reset_index(drop=True)

# Reverse mapping: movie title -> row position in df.
# NOTE(review): titles are not guaranteed to be unique; a repeated title maps to
# several positions, so indices[title] can return a Series instead of a scalar.
indices = pd.Series(df.index, index=df['title'])

In [None]:
# Build the SVD-based collaborative filter.
# Import the required classes and methods from the surprise library.
# NOTE(review): KFold is imported but not used in the cells visible here.
from surprise import SVD, Reader, Dataset
from surprise.model_selection import KFold

# Define a Reader object.
# The Reader object helps in parsing the file or dataframe containing ratings.
# NOTE(review): Reader() is created with its default settings — confirm the
# default rating scale matches the range of ratings in the CSV.
reader = Reader()

# Read the ratings data
ratings = pd.read_csv('Desktop/AIM Assignment/data/ratings_small.csv/ratings_small.csv', low_memory=False)

# Convert the pandas DataFrame to a Surprise Dataset.
# NOTE(review): the whole frame is passed as-is; presumably its columns are
# user id, item id and rating in that order — verify against the CSV layout.
data = Dataset.load_from_df(ratings, reader)

# Define the SVD algorithm object.
# NOTE(review): no random_state is given, so estimates may vary between runs — TODO confirm.
svd = SVD()

# Train on the full dataset (no hold-out split). The fit call is the last
# expression of the cell, so its return value is what the notebook displays.
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x227fa1be2e0>

In [None]:
# Build a mapping from tmdbId to movieId: load the links file without its imdbId column
id_map = pd.read_csv('Desktop/AIM Assignment/data/links.csv/links.csv').drop(columns='imdbId')

# Index by tmdbId so a row can be looked up by that id
# (despite its name, id_to_title holds movieId values, not titles)
id_to_title = id_map.set_index("tmdbId")
id_to_title.head()

Unnamed: 0_level_0,movieId
tmdbId,Unnamed: 1_level_1
862.0,1
8844.0,2
15602.0,3
31357.0,4
11862.0,5


In [None]:
def hybrid_recommender(title, userId, n_candidates=25, top_n=10):
    """Recommend movies similar to ``title``, ranked by the user's predicted rating.

    The similarity matrix ``cosine_sim`` selects the ``n_candidates`` movies most
    similar to ``title``. The ``svd`` model then estimates the rating ``userId``
    would give each candidate, and the ``top_n`` highest estimates are returned.

    Args:
        title: Movie title to look up in ``indices``.
        userId: User id passed to ``svd.predict``.
        n_candidates: Number of most-similar movies to score (default 25).
        top_n: Number of recommendations to return (default 10).

    Returns:
        DataFrame with the columns title, release_date, runtime, vote_count,
        vote_average, id and ``est`` (the predicted rating), sorted by ``est``
        in decreasing order. Candidates whose id has no entry in ``id_to_title``
        get a NaN ``est`` and therefore sort last.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Obtain the row position of the movie that matches the title
    idx = indices[title]
    # A title shared by several rows yields a Series of positions; use the first
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other movie with its similarity to the queried movie. The
    # queried movie is excluded by position rather than by dropping the first
    # sorted entry, because a tie can place a different movie first.
    sim_scores = [pair for pair in enumerate(cosine_sim[idx]) if pair[0] != idx]

    # Sort by similarity (highest first) and keep the closest candidates
    sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)[:n_candidates]

    # Get the movie indices
    movie_indices = [position for position, _ in sim_scores]

    # Extract the metadata of the candidates; copy so adding a column below
    # never writes into a view of df
    movies = df.iloc[movie_indices][
        ['title', 'release_date', 'runtime', 'vote_count', 'vote_average', 'id']
    ].copy()

    def _estimate(tmdb_id):
        """Predicted rating of userId for one id, or NaN if the id is unmapped."""
        try:
            movie_id = id_to_title.loc[tmdb_id, 'movieId']
        except KeyError:
            return np.nan
        # A repeated tmdbId in the mapping yields a Series; use its first entry
        if isinstance(movie_id, pd.Series):
            movie_id = movie_id.iloc[0]
        return svd.predict(userId, movie_id).est

    # Compute the predicted ratings using the SVD filter
    movies['est'] = movies['id'].apply(_estimate)

    # Sort the movies in decreasing order of predicted rating (NaN goes last)
    movies = movies.sort_values('est', ascending=False)

    # Return the top recommendations
    return movies.head(top_n)

In [None]:
# Assume that user 1 has just finished watching Star Wars
hybrid_recommender(title='Star Wars', userId=1)

Unnamed: 0,title,release_date,runtime,vote_count,vote_average,id,est
1177,Raiders of the Lost Ark,1981-06-12,115.0,3949.0,7.7,85,3.33695
1188,Return of the Jedi,1983-05-23,135.0,4763.0,7.9,1892,3.139334
1175,The Empire Strikes Back,1980-05-17,124.0,5998.0,8.2,1891,3.121595
452,The Fugitive,1993-08-06,130.0,1240.0,7.2,5503,3.042384
1267,Indiana Jones and the Last Crusade,1989-05-24,127.0,3221.0,7.6,89,3.041738
1329,Star Trek: First Contact,1996-11-21,111.0,671.0,7.0,199,3.035857
767,Independence Day,1996-06-25,145.0,3334.0,6.7,602,2.879253
26770,Star Wars: The Force Awakens,2015-12-15,136.0,7993.0,7.5,140607,2.854965
1276,Forbidden Planet,1956-03-15,98.0,238.0,7.2,830,2.836627
10157,Star Wars: Episode III - Revenge of the Sith,2005-05-17,140.0,4200.0,7.1,1895,2.696023


In [None]:
# Same movie, recommendations for user 2
hybrid_recommender(title='Star Wars', userId=2)

Unnamed: 0,title,release_date,runtime,vote_count,vote_average,id,est
1267,Indiana Jones and the Last Crusade,1989-05-24,127.0,3221.0,7.6,89,4.04325
452,The Fugitive,1993-08-06,130.0,1240.0,7.2,5503,3.903611
1175,The Empire Strikes Back,1980-05-17,124.0,5998.0,8.2,1891,3.837369
1177,Raiders of the Lost Ark,1981-06-12,115.0,3949.0,7.7,85,3.776365
1188,Return of the Jedi,1983-05-23,135.0,4763.0,7.9,1892,3.705181
1276,Forbidden Planet,1956-03-15,98.0,238.0,7.2,830,3.689167
1329,Star Trek: First Contact,1996-11-21,111.0,671.0,7.0,199,3.554461
10157,Star Wars: Episode III - Revenge of the Sith,2005-05-17,140.0,4200.0,7.1,1895,3.45212
11758,Babylon 5: The Gathering,1993-02-22,89.0,57.0,6.4,10942,3.440726
18223,Radioactive Dreams,1986-09-19,98.0,7.0,6.0,47342,3.440726
