# WEEK-3 ASSIGNMENT (Recommender System)

##  Importing Movie,Ratings,Users Dataset

In [1]:
import pandas as pd
import numpy as np

In [5]:
movies=pd.read_csv('movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [7]:
users=pd.read_csv('tags.csv')
users

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078
...,...,...,...,...
465559,138446,55999,dragged,1358983772
465560,138446,55999,Jason Bateman,1358983778
465561,138446,55999,quirky,1358983778
465562,138446,55999,sad,1358983772


In [8]:
ratings=pd.read_csv('ratings.csv')

In [9]:
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580
5,1,112,3.5,1094785740
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826


## Data Cleaning and Preparation

In [11]:
#Dropping the timestamp column
ratings.drop('timestamp',axis=1,inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [19]:
#Count of the number of ratings which the user has given 
temp=ratings.groupby('userId').rating.count().sort_values(ascending=False)
temp

userId
118205    9254
8405      7515
82418     5646
121535    5520
125794    5491
          ... 
21882       20
68139       20
20578       20
61692       20
39574       20
Name: rating, Length: 138493, dtype: int64

In [20]:
#Converting the above series into a DataFrame with number of ratings as num_ratings
good_users=pd.DataFrame(temp.values,temp.index)
good_users.rename(columns={0:'num_ratings'},inplace=True)
good_users

Unnamed: 0_level_0,num_ratings
userId,Unnamed: 1_level_1
118205,9254
8405,7515
82418,5646
121535,5520
125794,5491
...,...
21882,20
68139,20
20578,20
61692,20


In [21]:
#Filtering out only those users who have given more than 500 ratings to movies
good_users=good_users[good_users['num_ratings']>500]
good_users

Unnamed: 0_level_0,num_ratings
userId,Unnamed: 1_level_1
118205,9254
8405,7515
82418,5646
121535,5520
125794,5491
...,...
99716,501
19706,501
124220,501
13205,501


In [22]:
#Taking only those ratings in which the users have rated more than 500 movies 
ratings=ratings[ratings['userId'].isin(good_users.index)]
ratings

Unnamed: 0,userId,movieId,rating
960,11,1,4.5
961,11,10,2.5
962,11,19,3.5
963,11,32,5.0
964,11,39,4.5
...,...,...,...
19998294,138474,5401,1.0
19998295,138474,5449,4.0
19998296,138474,5459,4.0
19998297,138474,5460,5.0


In [23]:
#Merging the movies and ratings DataFrames which have the same movieID
ratings_with_name=pd.merge(movies,ratings,how='inner',left_on='movieId',right_on='movieId')
ratings_with_name

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,54,4.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,58,5.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,91,4.0
...,...,...,...,...,...
6554411,131254,Kein Bund für's Leben (2007),Comedy,79570,4.0
6554412,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,79570,4.0
6554413,131258,The Pirates (2014),Adventure,28906,2.5
6554414,131260,Rentun Ruusu (2001),(no genres listed),65409,3.0


In [37]:
#Count of the number of movies which has been reviewed and converting it to DataFrame
temp_2=ratings_with_name.groupby('movieId').userId.count().sort_values(ascending=False)
good_movies=pd.DataFrame(temp_2.values,temp_2.index)
good_movies

Unnamed: 0_level_0,0
movieId,Unnamed: 1_level_1
2571,6938
356,6870
480,6824
1270,6807
296,6767
...,...
105081,1
105079,1
105071,1
105068,1


In [38]:
#Renaming the number of views to num_views
good_movies.rename(columns={0:'num_views'},inplace=True)

In [39]:
#Filtering only those movies which have at least 500 views/ratings
good_movies=good_movies[good_movies['num_views']>500]
good_movies

Unnamed: 0_level_0,num_views
movieId,Unnamed: 1_level_1
2571,6938
356,6870
480,6824
1270,6807
296,6767
...,...
56156,502
8908,502
45442,501
58299,501


In [40]:
#Final DataFrame in which there are only users who have at least been rated more than 500 movies and movies
#which have have been reviewed/rated at least 500 times
final_ratings=ratings_with_name[ratings_with_name['movieId'].isin(good_movies.index)]
final_ratings

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,24,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,54,4.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,58,5.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,91,4.0
...,...,...,...,...,...
6542567,112852,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,137277,3.5
6542568,112852,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,137343,4.0
6542569,112852,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,137805,4.0
6542570,112852,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,137879,4.0


## User-to-User Collaborative System Filtering Model

In [42]:
#Making a pivot table for the creation of a utility matrix
movie_matrix=final_ratings.pivot_table(values='rating',columns='userId',index='title')

In [45]:
#Filling all then NaN values as 0
movie_matrix.fillna(0,inplace=True)
movie_matrix

userId,11,24,54,58,91,104,116,134,156,208,...,138270,138301,138307,138325,138382,138397,138406,138411,138437,138474
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,4.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0
*batteries not included (1987),5.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
...And Justice for All (1979),0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You (1999),5.0,0.0,0.0,0.0,3.5,3.0,2.0,3.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.5,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zombieland (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.5,4.0,4.5,0.0,0.0,0.0,0.0,4.0,0.0
Zoolander (2001),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,4.0,3.5,0.0,3.0,0.0,0.0,2.5,0.0,4.0,0.0
eXistenZ (1999),5.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,...,4.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0


In [44]:
from scipy.sparse import csr_matrix

In [46]:
#Making the sparse matrix
movie_utility_matrix=csr_matrix(movie_matrix)
movie_utility_matrix

<3124x7441 sparse matrix of type '<class 'numpy.float64'>'
	with 5257205 stored elements in Compressed Sparse Row format>

In [47]:
from sklearn.neighbors import NearestNeighbors

In [48]:
#Initializing the model
model=NearestNeighbors(algorithm='brute')

In [49]:
model.fit(movie_utility_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [50]:
#Testing the model
distance,suggestion=model.kneighbors(movie_matrix.iloc[-5,:].values.reshape(1,-1),n_neighbors=5)

In [51]:
suggestion

array([[3119, 1535, 2457, 1449, 1223]], dtype=int64)

In [56]:
movie_matrix.index[3119]

'Zombieland (2009)'

In [58]:
#Defining the function for Recommender System
def recommend(moviename):
    m_id=np.where(movie_matrix.index==moviename)[0][0]
    distance,suggestion=model.kneighbors(movie_matrix.iloc[m_id,:].values.reshape(1,-1),n_neighbors=5)
    for i in suggestion[0]:
        print(movie_matrix.index[i])
        

In [61]:
recommend('Zoolander (2001)')

Zoolander (2001)
Dodgeball: A True Underdog Story (2004)
Austin Powers in Goldmember (2002)
Anchorman: The Legend of Ron Burgundy (2004)
Dude, Where's My Car? (2000)


In [66]:
recommend('Captain America: The Winter Soldier (2014)')

Captain America: The Winter Soldier (2014)
X-Men: Days of Future Past (2014)
Guardians of the Galaxy (2014)
Iron Man 3 (2013)
Edge of Tomorrow (2014)


## Item-to-Item Collaborative System Model

In [67]:
def standardize(row):
    new_row=(row-row.mean())/(row.max()-row.min())
    return new_row

In [70]:
movie_matrix_std=movie_matrix.apply(standardize,axis=1)

In [71]:
movie_matrix_std

userId,11,24,54,58,91,104,116,134,156,208,...,138270,138301,138307,138325,138382,138397,138406,138411,138437,138474
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",-0.111853,-0.111853,0.488147,-0.111853,-0.111853,-0.111853,-0.111853,-0.111853,0.488147,-0.111853,...,-0.111853,-0.111853,-0.111853,-0.111853,0.488147,-0.111853,-0.111853,0.288147,-0.111853,0.688147
(500) Days of Summer (2009),-0.139188,-0.139188,-0.139188,-0.139188,-0.139188,-0.139188,-0.139188,-0.139188,-0.139188,-0.139188,...,-0.139188,0.260812,-0.139188,-0.139188,-0.139188,-0.139188,-0.139188,-0.139188,0.760812,-0.139188
*batteries not included (1987),0.942978,-0.057022,-0.057022,-0.057022,0.542978,-0.057022,-0.057022,-0.057022,-0.057022,-0.057022,...,-0.057022,-0.057022,-0.057022,-0.057022,-0.057022,0.742978,-0.057022,-0.057022,-0.057022,-0.057022
...And Justice for All (1979),-0.050517,-0.050517,0.749483,-0.050517,-0.050517,-0.050517,-0.050517,-0.050517,0.749483,-0.050517,...,-0.050517,-0.050517,-0.050517,-0.050517,-0.050517,-0.050517,-0.050517,-0.050517,-0.050517,-0.050517
10 Things I Hate About You (1999),0.739511,-0.260489,-0.260489,-0.260489,0.439511,0.339511,0.139511,0.339511,-0.260489,-0.260489,...,-0.260489,-0.260489,0.539511,-0.260489,-0.260489,-0.260489,-0.260489,-0.260489,0.439511,0.539511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zombieland (2009),-0.162828,-0.162828,-0.162828,-0.162828,-0.162828,-0.162828,-0.162828,-0.162828,-0.162828,-0.162828,...,-0.162828,0.537172,0.637172,0.737172,-0.162828,-0.162828,-0.162828,-0.162828,0.637172,-0.162828
Zoolander (2001),-0.269816,-0.269816,-0.269816,-0.269816,-0.269816,-0.269816,-0.069816,-0.269816,-0.269816,-0.269816,...,0.530184,0.430184,-0.269816,0.330184,-0.269816,-0.269816,0.230184,-0.269816,0.530184,-0.269816
eXistenZ (1999),0.821099,0.421099,-0.178901,-0.178901,-0.178901,0.421099,-0.178901,-0.178901,0.821099,-0.178901,...,0.621099,0.421099,0.421099,-0.178901,-0.178901,-0.178901,-0.178901,-0.178901,-0.178901,-0.178901
xXx (2002),-0.157465,-0.157465,-0.157465,-0.157465,-0.157465,-0.157465,0.242535,-0.157465,-0.157465,-0.157465,...,-0.157465,-0.157465,-0.157465,-0.157465,-0.157465,0.642535,-0.157465,-0.157465,-0.157465,-0.157465


In [72]:
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
item_similarity=cosine_similarity(movie_matrix_std)

In [74]:
item_similarity

array([[ 1.        , -0.05874296,  0.19562191, ...,  0.03775625,
         0.01398616,  0.31211292],
       [-0.05874296,  1.        ,  0.02983842, ...,  0.02523354,
         0.05188119, -0.10841412],
       [ 0.19562191,  0.02983842,  1.        , ...,  0.05552353,
         0.09857733,  0.12278266],
       ...,
       [ 0.03775625,  0.02523354,  0.05552353, ...,  1.        ,
         0.08652456, -0.01368733],
       [ 0.01398616,  0.05188119,  0.09857733, ...,  0.08652456,
         1.        ,  0.0430187 ],
       [ 0.31211292, -0.10841412,  0.12278266, ..., -0.01368733,
         0.0430187 ,  1.        ]])

In [75]:
item_similarity_df=pd.DataFrame(item_similarity,index=movie_matrix_std.index,columns=movie_matrix_std.index)
item_similarity_df

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zack and Miri Make a Porno (2008),Zelig (1983),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.000000,-0.058743,0.195622,0.049851,0.117242,0.007373,0.085219,0.128579,0.043407,0.001783,...,0.015103,-0.004966,-0.030372,0.091073,-0.021621,-0.024426,0.056357,0.037756,0.013986,0.312113
(500) Days of Summer (2009),-0.058743,1.000000,0.029838,-0.023804,0.098423,0.157227,0.026645,-0.041274,0.024447,0.045312,...,0.365706,0.073790,0.320763,-0.041645,0.377384,0.471626,0.144213,0.025234,0.051881,-0.108414
*batteries not included (1987),0.195622,0.029838,1.000000,0.015554,0.088953,0.068397,0.086743,0.130191,0.073934,0.000517,...,0.049507,0.047609,0.008399,0.018694,0.017090,0.079328,0.108357,0.055524,0.098577,0.122783
...And Justice for All (1979),0.049851,-0.023804,0.015554,1.000000,-0.049461,-0.027421,0.008002,0.045367,0.023374,0.128570,...,-0.017157,0.104201,0.034852,0.062717,0.008764,-0.046200,-0.053464,0.041050,-0.030079,0.058935
10 Things I Hate About You (1999),0.117242,0.098423,0.088953,-0.049461,1.000000,0.058934,0.186032,0.132902,0.125756,-0.054386,...,0.103428,-0.072345,-0.000006,0.035006,0.016977,0.052582,0.203284,0.014604,0.155816,0.079113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zombieland (2009),-0.024426,0.471626,0.079328,-0.046200,0.052582,0.251491,0.032982,-0.036572,0.021510,0.012646,...,0.413400,0.024469,0.283946,-0.063027,0.356198,1.000000,0.202927,0.045705,0.141669,-0.069651
Zoolander (2001),0.056357,0.144213,0.108357,-0.053464,0.203284,0.081659,0.088025,0.029351,0.065523,-0.005101,...,0.201272,0.053759,0.068373,0.018389,0.168990,0.202927,1.000000,0.068039,0.220427,0.106849
eXistenZ (1999),0.037756,0.025234,0.055524,0.041050,0.014604,0.009548,-0.045704,-0.050534,-0.017715,0.008366,...,0.042291,0.070797,0.023579,0.143652,0.079822,0.045705,0.068039,1.000000,0.086525,-0.013687
xXx (2002),0.013986,0.051881,0.098577,-0.030079,0.155816,0.156140,0.120945,0.019999,0.109830,-0.051198,...,0.145485,-0.025673,0.053173,-0.002071,0.081333,0.141669,0.220427,0.086525,1.000000,0.043019


In [80]:
def get_similar_movies(moviename,user_rating):
    similar_score=item_similarity_df[moviename]*(user_rating-2.5)
    similar_score=similar_score.sort_values(ascending=False)
    similar_score=similar_score.head()
    return similar_score
print(get_similar_movies('Captain America: The Winter Soldier (2014)',3))

title
Captain America: The Winter Soldier (2014)    0.500000
X-Men: Days of Future Past (2014)             0.365000
Guardians of the Galaxy (2014)                0.342194
Iron Man 3 (2013)                             0.325329
Edge of Tomorrow (2014)                       0.318207
Name: Captain America: The Winter Soldier (2014), dtype: float64
