# ML Mini Project - Building a basic Movie Recommender System 
 ---
## Phase 2

In [1]:
#importing the basic libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
#reading the dataset
movie = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
links = pd.read_csv("links.csv")
tags = pd.read_csv("tags.csv")

In [3]:
# general data perprocessing
ratings.drop(columns='timestamp',inplace=True)
tags.drop(columns='timestamp',inplace=True)
movie['Year'] = movie['title'].str.extract('.*\((.*)\).*',expand = False)
movie.replace('2006â€“2007','2007', inplace = True)

In [4]:
movie.head()

Unnamed: 0,movieId,title,genres,Year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [5]:
#creating copies of the datasets and working on the copies
movie1 = movie.copy()
tag1 = tags.copy()

In [6]:
tag1.drop('userId', inplace = True, axis = 1)
tag1.duplicated().sum()

104

In [7]:
tag1.drop_duplicates(inplace = True)
tag1.duplicated().sum()

0

In [8]:
#preprocessing on tags
tag1['tag'] = tag1['tag'].apply(lambda x : x.replace(' ', ''))
tag1['tag'] = tag1['tag'].apply(lambda x : x.lower())

In [9]:
temp_tag = pd.DataFrame(tag1.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))

In [10]:
movie1['genres'] = movie1['genres'].apply(lambda x: x.split('|'))
movie1['genres'] = movie1['genres'].apply(lambda x: ' '.join(x))
movie1['genres'] = movie1['genres'].apply(lambda x: x.lower())

In [11]:
df = pd.merge(movie1,temp_tag,on = 'movieId')
df['Year'] = df['Year'].replace(np.nan,'0')
df.head()

Unnamed: 0,movieId,title,genres,Year,tag
0,1,Toy Story (1995),adventure animation children comedy fantasy,1995,pixar fun
1,2,Jumanji (1995),adventure children fantasy,1995,fantasy magicboardgame robinwilliams game
2,3,Grumpier Old Men (1995),comedy romance,1995,moldy old
3,5,Father of the Bride Part II (1995),comedy,1995,pregnancy remake
4,7,Sabrina (1995),comedy romance,1995,remake


In [12]:
# creating a new column keywords that includes the contents of the generes, year and tag columns
df['keywords'] = df['genres'] +" "+ df['Year'] + " "+df['tag']

In [13]:
df.head()

Unnamed: 0,movieId,title,genres,Year,tag,keywords
0,1,Toy Story (1995),adventure animation children comedy fantasy,1995,pixar fun,adventure animation children comedy fantasy 19...
1,2,Jumanji (1995),adventure children fantasy,1995,fantasy magicboardgame robinwilliams game,adventure children fantasy 1995 fantasy magicb...
2,3,Grumpier Old Men (1995),comedy romance,1995,moldy old,comedy romance 1995 moldy old
3,5,Father of the Bride Part II (1995),comedy,1995,pregnancy remake,comedy 1995 pregnancy remake
4,7,Sabrina (1995),comedy romance,1995,remake,comedy romance 1995 remake


In [14]:
df.dtypes

movieId      int64
title       object
genres      object
Year        object
tag         object
keywords    object
dtype: object

#### Content Based Similarity

In [15]:
#creating the final dataframe that we will be working on for item to item based
final_df = df[['movieId','title','keywords']]

In [16]:
final_df.head()

Unnamed: 0,movieId,title,keywords
0,1,Toy Story (1995),adventure animation children comedy fantasy 19...
1,2,Jumanji (1995),adventure children fantasy 1995 fantasy magicb...
2,3,Grumpier Old Men (1995),comedy romance 1995 moldy old
3,5,Father of the Bride Part II (1995),comedy 1995 pregnancy remake
4,7,Sabrina (1995),comedy romance 1995 remake


In [17]:
#performing feature extraction using counter vectorizer and then computing cosine similarity

In [18]:
#importing the libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
cv = CountVectorizer()
vectors = cv.fit_transform(final_df['keywords']).toarray()
similarity = cosine_similarity(vectors)

In [20]:
#seeing the shape of the similarity array that we have made
similarity.shape

(1572, 1572)

In [21]:
#function to recommend the movies
def recommend(movie):
    movie_index = final_df[final_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse = True, key = lambda x:x[1])[1:11]
    
    for i in movies_list:
        #list.append(final_df.iloc[i[0]].title)
        print(final_df.iloc[i[0]].title)
    return list

In [22]:
movie_name = input("Enter movie name (full): ")

Enter movie name (full): Toy Story (1995)


In [23]:
recommend(movie_name)

Bug's Life, A (1998)
Toy Story 2 (1999)
Wallace & Gromit: A Close Shave (1995)
Shrek (2001)
Who Framed Roger Rabbit? (1988)
Space Jam (1996)
Sinbad: Legend of the Seven Seas (2003)
Cat Returns, The (Neko no ongaeshi) (2002)
Jumanji (1995)
Wallace & Gromit in The Curse of the Were-Rabbit (2005)


list

#### Coldstart problem
show the users the highest rating count with the average number of movies

In [24]:
mean_rat = ratings.groupby('movieId').rating.mean()
num_users = ratings.groupby('movieId').userId.count()
mean_rat_movie_temp = pd.merge(mean_rat, movie, how='inner', on='movieId')
mean_rat_movie = pd.merge(mean_rat_movie_temp, num_users, how='inner', on='movieId')

In [25]:
mean_rat_movie.drop(columns='genres', inplace=True)
mean_rat_movie.rename(columns={'rating':'mean_ratings','userId':'num_users'}, inplace=True)

In [26]:
mean_rat_movie.head()

Unnamed: 0,movieId,mean_ratings,title,Year,num_users
0,1,3.92093,Toy Story (1995),1995,215
1,2,3.431818,Jumanji (1995),1995,110
2,3,3.259615,Grumpier Old Men (1995),1995,52
3,4,2.357143,Waiting to Exhale (1995),1995,7
4,5,3.071429,Father of the Bride Part II (1995),1995,49


In [27]:
popular_movies.head()

NameError: name 'popular_movies' is not defined

In [None]:
popular_movies = mean_rat_movie[mean_rat_movie["num_users"]>50].sort_values('mean_ratings',ascending=False)

In [None]:
#function to display the highest movies with more than 50 ratings as a cold start problem
def popular_movies:
    for i in range
    

#### concidering user based similarity based on other movies liked by the users who watched a given movie

In [None]:
df_pivot = ratings.pivot(index ='movieId',columns='userId',values='rating')
df_pivot = df_pivot.dropna(thresh = 10, axis = 0)
df_pivot.fillna(0,inplace = True)
df_pivot.head()

In [None]:
# counting rated movies and users who have rated movies
users_rated = ratings.groupby('movieId')['rating'].agg('count')
movies_rated = ratings.groupby('userId')['rating'].agg('count')

df_pivot = df_pivot.loc[users_rated[users_rated > 10].index,:]
df_pivot = df_pivot.loc[:,movies_rated[movies_rated > 10].index]
df_pivot

In [None]:
#concidering users which have rated more than 10 movies only
df_pivot = df_pivot.loc[:, movies_rated[movies_rated > 10].index]

In [None]:
#importing the libraries
#creating a compressed matrix aka spare matrix
#scipy is an opensource library for mathemnatics, science and engineering
from scipy.sparse import csr_matrix
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors

In [None]:
rating_movieid_matrix = csr_matrix(df_pivot.values)
df_pivot.reset_index(inplace = True)
knn_model = NearestNeighbors(n_neighbors=10,metric ='cosine',algorithm = 'auto',n_jobs=-1)
knn_model.fit(rating_movieid_matrix) 

In [None]:
def get_movie_recommendation(movie_name):
    n = 10
    movie_list = movie[movie['title'].str.contains(movie_name,case = False)]  
    if len(movie_list):        
        movie_idx= movie_list.iloc[0]['movieId']
        movie_idx = df_pivot[df_pivot['movieId'] == movie_idx].index[0]
        distances , indices = knn_model.kneighbors(rating_movieid_matrix[movie_idx],n_neighbors=n+1)    
        rec_movie_indices = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
        recommend_frame = []
        for val in rec_movie_indices:
            movie_idx = df_pivot.iloc[val[0]]['movieId']
            idx = movie[movie['movieId'] == movie_idx].index
            recommend_frame.append({'Title':movie.iloc[idx]['title'].values[0],'Distance':val[1]})
        df = pd.DataFrame(recommend_frame,index=range(1,n+1))
        return df
    else:
        return "No movie found :( Check input" 

In [None]:
get_movie_recommendation(str(input("Enter movie name")))

### User Based Recommender system

In [None]:
#importing surprise package to find movie rating for the user for movie which he hasn't watched
from surprise import KNNWithMeans,Dataset,accuracy,Reader
from surprise.model_selection import train_test_split

In [None]:
reader = Reader(rating_scale = (0.5,5))

In [None]:
#fitting algorithm 
alg=KNNWithMeans(k=50,sim_options={"name":"cosine","user_based":True})

In [None]:
#taking input user id
userID = int(input("Enter User Id:"))
#finding the names of unique movies in the dataset
iids = final_df["movieId"].unique()

#preparing the testset
testset = [[userID,movie_id,2] for movie_id in iids]
prediction = alg.test(testset)

#preparing the dataframe consisting of top 10 movies
usid=[]
rate = []
for i in prediction:
    usid.append(i.iid)
    rate.append(i.est) #inbuilt function to find the estimated rating of the movie
predDf = pd.DataFrame({"MovieID":usid,"Ratings":rate})
predDf=predDf.set_index("MovieID")
topten = predDf.sort_values(by="Ratings",ascending=False).head(10)
top_10_movie_id = pd.Series(topten.index)
top_10_movie_id

In [None]:
top10_movies_recommended = movies[movies["movieId"].isin(top_10_movie_id)]["title"]
print("movies to be recommended are", sep= "\n" )
print(top10_movies_recommended)

#### building a user based recommender system

In [None]:
merged_dataset = pd.merge(ratings,movie,how = 'inner',on = 'movieId')
merged_dataset.drop(['genres','Year'],axis = 1, inplace = True)
merged_dataset.head()

In [None]:
refined_dataset = merged_dataset.groupby(by = ['userId','title'],as_index=False).agg({"rating":"mean"})
refined_dataset.head()

In [None]:
# pivot and create movie-user matrix
user_to_movie_df = refined_dataset.pivot(
    index='userId',columns='title',values='rating').fillna(0)
user_to_movie_sparse_df = csr_matrix(user_to_movie_df.values)
user_to_movie_sparse_df

In [None]:
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_to_movie_sparse_df)

In [None]:
#function to  find the n similar users to given input user
def get_similar_users(user,n = 5):
    
    knn_input = np.asarray([user_to_movie_df.values[user-1]])
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
    print("Top",n,"users who are very much similar to the User-",user, "are: ")
    print(" ")
    for i in range(1,len(distances[0])):
        print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
    return indices.flatten()[1:] + 1, distances.flatten()[1:]

In [None]:
user_id = int(input("Enter user id : "))

In [None]:
from pprint import pprint
#user_id = 778
print(" Few of movies seen by the User:")
pprint(list(refined_dataset[refined_dataset['userId'] == user_id]['title'])[:10])
similar_user_list, distance_list = get_similar_users(user_id,5)

In [None]:
movie_ratings_sim_users = user_to_movie_df.values[similar_user_list]
movie_ratings_sim_users

In [None]:
movie_list = user_to_movie_df.columns
movie_list

In [None]:
def recommend_movies(n):
    n = min(len(mean_rat_movie),n)
    pprint(list(movie_list[np.argsort(mean_rat_movie)[::-1][:n]]))

In [None]:
print("movies recommended based on similar users are: ")
recommend_movies(10)