# ML Mini Project - Building a basic Movie Recommender System 
 ---
## Phase 2

In [1]:
#importing the basic libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading the dataset
movie = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
links = pd.read_csv("links.csv")
tags = pd.read_csv("tags.csv")

In [3]:
# general data perprocessing
ratings.drop(columns='timestamp',inplace=True)
tags.drop(columns='timestamp',inplace=True)
movie['Year'] = movie['title'].str.extract('.*\((.*)\).*',expand = False)
movie.replace('2006–2007','2007', inplace = True)

In [4]:
movie.head()

Unnamed: 0,movieId,title,genres,Year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [41]:
#creating copies of the datasets and working on the copies
movie1 = movie.copy()
tag1 = tags.copy()

In [42]:
tag1.drop('userId', inplace = True, axis = 1)
tag1.duplicated().sum()

104

In [43]:
tag1.drop_duplicates(inplace = True)
tag1.duplicated().sum()

0

In [44]:
#preprocessing on tags
tag1['tag'] = tag1['tag'].apply(lambda x : x.replace(' ', ''))
tag1['tag'] = tag1['tag'].apply(lambda x : x.lower())

In [45]:
temp_tag = pd.DataFrame(tag1.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))

In [46]:
movie1['genres'] = movie1['genres'].apply(lambda x: x.split('|'))
movie1['genres'] = movie1['genres'].apply(lambda x: ' '.join(x))
movie1['genres'] = movie1['genres'].apply(lambda x: x.lower())

In [47]:
df = pd.merge(movie1,temp_tag,on = 'movieId')
df['Year'] = df['Year'].replace(np.nan,'0')
df.head()

Unnamed: 0,movieId,title,genres,Year,tag
0,1,Toy Story (1995),adventure animation children comedy fantasy,1995,pixar fun
1,2,Jumanji (1995),adventure children fantasy,1995,fantasy magicboardgame robinwilliams game
2,3,Grumpier Old Men (1995),comedy romance,1995,moldy old
3,5,Father of the Bride Part II (1995),comedy,1995,pregnancy remake
4,7,Sabrina (1995),comedy romance,1995,remake


In [48]:
# creating a new column keywords that includes the contents of the generes, year and tag columns
df['keywords'] = df['genres'] +" "+ df['Year'] + " "+df['tag']

In [49]:
df.head()

Unnamed: 0,movieId,title,genres,Year,tag,keywords
0,1,Toy Story (1995),adventure animation children comedy fantasy,1995,pixar fun,adventure animation children comedy fantasy 19...
1,2,Jumanji (1995),adventure children fantasy,1995,fantasy magicboardgame robinwilliams game,adventure children fantasy 1995 fantasy magicb...
2,3,Grumpier Old Men (1995),comedy romance,1995,moldy old,comedy romance 1995 moldy old
3,5,Father of the Bride Part II (1995),comedy,1995,pregnancy remake,comedy 1995 pregnancy remake
4,7,Sabrina (1995),comedy romance,1995,remake,comedy romance 1995 remake


In [50]:
df.dtypes

movieId      int64
title       object
genres      object
Year        object
tag         object
keywords    object
dtype: object

#### Content Based Similarity

In [51]:
#creating the final dataframe that we will be working on for item to item based
final_df = df[['movieId','title','keywords']]

In [52]:
final_df.head()

Unnamed: 0,movieId,title,keywords
0,1,Toy Story (1995),adventure animation children comedy fantasy 19...
1,2,Jumanji (1995),adventure children fantasy 1995 fantasy magicb...
2,3,Grumpier Old Men (1995),comedy romance 1995 moldy old
3,5,Father of the Bride Part II (1995),comedy 1995 pregnancy remake
4,7,Sabrina (1995),comedy romance 1995 remake


In [53]:
#performing feature extraction using counter vectorizer and then computing cosine similarity

In [54]:
#importing the libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
cv = CountVectorizer()
vectors = cv.fit_transform(final_df['keywords']).toarray()
similarity = cosine_similarity(vectors)

In [56]:
#seeing the shape of the similarity array that we have made
similarity.shape

(1572, 1572)

In [57]:
#function to recommend the movies
def recommend(movie):
    movie_index = final_df[final_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse = True, key = lambda x:x[1])[1:11]
    
    for i in movies_list:
        #list.append(final_df.iloc[i[0]].title)
        print(final_df.iloc[i[0]].title)
    return list

In [60]:
movie_name = input("Enter movie name (full): ")

Enter movie name (full): Bug's Life, A (1998)


In [61]:
recommend(movie_name)

Toy Story (1995)
Wallace & Gromit in The Curse of the Were-Rabbit (2005)
Toy Story 2 (1999)
Aladdin (1992)
Who Framed Roger Rabbit? (1988)
Grand Day Out with Wallace and Gromit, A (1989)
Shrek (2001)
Holes (2003)
Finding Nemo (2003)
Shrek 2 (2004)


list

#### Coldstart problem
show the users the highest rating count with the average number of movies

In [68]:
merged = pd.merge(ratings,movie,on = 'movieId')
merged.head()

Unnamed: 0,userId,movieId,rating,title,genres,Year
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995


In [69]:
avg_rating = merged.groupby('title')['rating'].mean().sort_values(ascending = False)
rating_count = merged.groupby('title')['rating'].count().sort_values(ascending = True)

In [81]:
average_rating = pd.DataFrame(merged.groupby('title')['rating'].mean())
average_rating['rating_count'] = rating_count
average_rating.reset_index()

Unnamed: 0,title,rating,rating_count
0,'71 (2014),4.000000,1
1,'Hellboy': The Seeds of Creation (2004),4.000000,1
2,'Round Midnight (1986),3.500000,2
3,'Salem's Lot (2004),5.000000,1
4,'Til There Was You (1997),4.000000,2
...,...,...,...
9714,eXistenZ (1999),3.863636,22
9715,xXx (2002),2.770833,24
9716,xXx: State of the Union (2005),2.000000,5
9717,¡Three Amigos! (1986),3.134615,26


#### concidering user based similarity based on other movies liked by the users who watched a given movie

In [26]:
df_pivot = ratings.pivot(index ='movieId',columns='userId',values='rating')
df_pivot = df_pivot.dropna(thresh = 10, axis = 0)
df_pivot.fillna(0,inplace = True)
df_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0


In [27]:
# counting rated movies and users who have rated movies
users_rated = ratings.groupby('movieId')['rating'].agg('count')
movies_rated = ratings.groupby('userId')['rating'].agg('count')

df_pivot = df_pivot.loc[users_rated[users_rated > 10].index,:]
df_pivot = df_pivot.loc[:,movies_rated[movies_rated > 10].index]
df_pivot

In [28]:
#concidering users which have rated more than 10 movies only
df_pivot = df_pivot.loc[:, movies_rated[movies_rated > 10].index]

In [29]:
#importing the libraries
#creating a compressed matrix aka spare matrix
#scipy is an opensource library for mathemnatics, science and engineering
from scipy.sparse import csr_matrix
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors

In [30]:
rating_movieid_matrix = csr_matrix(df_pivot.values)
df_pivot.reset_index(inplace = True)
knn_model = NearestNeighbors(n_neighbors=10,metric ='cosine',algorithm = 'auto',n_jobs=-1)
knn_model.fit(rating_movieid_matrix) 

NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=10)

In [43]:
def get_movie_recommendation(movie_name):
    n = 10
    movie_list = movie[movie['title'].str.contains(movie_name,case = False)]  
    if len(movie_list):        
        movie_idx= movie_list.iloc[0]['movieId']
        movie_idx = df_pivot[df_pivot['movieId'] == movie_idx].index[0]
        distances , indices = knn_model.kneighbors(rating_movieid_matrix[movie_idx],n_neighbors=n+1)    
        rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
        recommend_frame = []
        for val in rec_movie_indices:
            movie_idx = df_pivot.iloc[val[0]]['movieId']
            idx = movie[movie['movieId'] == movie_idx].index
            recommend_frame.append({'Title':movie.iloc[idx]['title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame,index=range(1,n+1))
        return df
    else:
        return "No movie found :( Check input" 

In [45]:
get_movie_recommendation(str(input("Enter movie name")))

Enter movie nameavatar


Unnamed: 0,Title,Distance
1,I Am Legend (2007),0.447392
2,Sherlock Holmes (2009),0.444272
3,"Avengers, The (2012)",0.440447
4,Harry Potter and the Half-Blood Prince (2009),0.426318
5,Kung Fu Panda (2008),0.413092
6,"Hangover, The (2009)",0.398295
7,WALL·E (2008),0.397135
8,District 9 (2009),0.388001
9,Iron Man (2008),0.375732
10,Up (2009),0.365445


### User Based Recommender system

In [48]:
#importing surprise package to find movie rating for the user for movie which he hasn't watched
from surprise import KNNWithMeans,Dataset,accuracy,Reader
from surprise.model_selection import train_test_split

In [49]:
#fitting algorithm 
alg=KNNWithMeans(k=50,sim_options={"name":"cosine","user_based":True})

In [None]:
#taking input user id
userID = int(input("Enter User Id:"))
#finding the names of unique movies in the dataset
iids = final_df["movieId"].unique()

#preparing the testset
testset = [[userID,movie_id,2] for movie_id in iids]
prediction = alg.test(testset)

#preparing the dataframe consisting of top 10 movies
usid=[]
rate = []
for i in prediction:
    usid.append(i.iid)
    rate.append(i.est) #inbuilt function to find the estimated rating of the movie
predDf = pd.DataFrame({"MovieID":usid,"Ratings":rate})
predDf=predDf.set_index("MovieID")
topten = predDf.sort_values(by="Ratings",ascending=False).head(10)
top_10_movie_id = pd.Series(topten.index)
top_10_movie_id

In [None]:
top10_movies_recommended = movies[movies["movieId"].isin(top_10_movie_id)]["title"]
print("movies to be recommended are", sep= "\n" )
print(top10_movies_recommended)

In [5]:
import pickle 

In [6]:
pickle.dump(movie,open('movie_list.pkl','wb'))

In [8]:
movie['title'].values

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Flint (2017)', 'Bungo Stray Dogs: Dead Apple (2018)',
       'Andrew Dice Clay: Dice Rules (1991)'], dtype=object)