In [1]:
import pandas as pd
rating_df = pd.read_csv("ratings.csv")

In [2]:
rating_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
rating_df.drop('timestamp',axis=1,inplace=True)


In [4]:
len(rating_df.userId.unique())

610

In [5]:
len(rating_df.movieId.unique())

9724

In [6]:
user_movies_df = rating_df.pivot(index='userId',
columns='movieId',values="rating").reset_index(drop=True)
user_movies_df.index=rating_df.userId.unique()

In [7]:
user_movies_df.iloc[0:5,0:15]

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
1,4.0,,4.0,,,4.0,,,,,,,,,
2,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,
5,4.0,,,,,,,,,,,,,,


In [8]:
user_movies_df.fillna(0,inplace=True)
user_movies_df.iloc[0:5,0:10]

movieId,1,2,3,4,5,6,7,8,9,10
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation
user_sim = 1 - pairwise_distances(user_movies_df.values,metric='cosine')
#store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)
#set the index and column names to user ids (0 to 671)
user_sim_df.index = rating_df.userId.unique()
user_sim_df.columns = rating_df.userId.unique()

In [10]:
user_sim_df.iloc[0:5,0:5]

Unnamed: 0,1,2,3,4,5
1,1.0,0.027283,0.05972,0.194395,0.12908
2,0.027283,1.0,0.0,0.003726,0.016614
3,0.05972,0.0,1.0,0.002251,0.00502
4,0.194395,0.003726,0.002251,1.0,0.128659
5,0.12908,0.016614,0.00502,0.128659,1.0


In [11]:
user_sim_df.shape

(610, 610)

In [12]:
import numpy as np
np.fill_diagonal(user_sim,0)
user_sim_df.iloc[0:5,0:5]

Unnamed: 0,1,2,3,4,5
1,0.0,0.027283,0.05972,0.194395,0.12908
2,0.027283,0.0,0.0,0.003726,0.016614
3,0.05972,0.0,0.0,0.002251,0.00502
4,0.194395,0.003726,0.002251,0.0,0.128659
5,0.12908,0.016614,0.00502,0.128659,0.0


In [13]:
user_sim_df.idxmax(axis=1)[0:5]

1    266
2    366
3    313
4    391
5    470
dtype: int64

In [14]:
user_sim_df.iloc[1:2,330:340]

Unnamed: 0,331,332,333,334,335,336,337,338,339,340
2,0.199366,0.073652,0.050674,0.053668,0.073991,0.046544,0.018408,0.074145,0.111447,0.03063


In [15]:
movies_df = pd.read_csv("movies.csv")

In [16]:
movies_df[0:5]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [17]:
movies_df.drop('genres', axis=1,inplace=True)

In [18]:
def get_user_similar_movies(user1,user2):
    # inner join between movies watched between two users will give the common movies watched.
    common_movies = rating_df[rating_df.userId == user1].merge(rating_df[rating_df.userId ==user2],
        on = "movieId",how = "inner")
    #join the above result with movies details
    return common_movies.merge(movies_df, on='movieId')

In [19]:
common_movies = get_user_similar_movies(2,338)

In [20]:
common_movies[(common_movies.rating_x >=4.0) &
((common_movies.rating_y>=4.0))]

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
1,2,6874,4.0,338,4.5,Kill Bill: Vol. 1 (2003)


In [21]:
common_movies = get_user_similar_movies(3,332)
common_movies

Unnamed: 0,userId_x,movieId,rating_x,userId_y,rating_y,title
0,3,527,0.5,332,4.0,Schindler's List (1993)
1,3,849,5.0,332,2.0,Escape from L.A. (1996)
2,3,1093,0.5,332,4.0,"Doors, The (1991)"
3,3,3703,5.0,332,4.0,"Road Warrior, The (Mad Max 2) (1981)"


In [27]:
rating_mat = rating_df.pivot(index='movieId',columns='userId',
values = "rating").reset_index(drop=True)
#fill all Nans with 0
rating_mat.fillna(0, inplace = True)
#find the correlation between movies
movie_sim = 1 -pairwise_distances(rating_mat.values,
metric="correlation")
#fill the diagnol with 0  as it represent the auto-correlatio of movies
movie_sim_df = pd.DataFrame(movie_sim)

In [28]:
movie_sim_df.iloc[0:5, 0:5]

Unnamed: 0,0,1,2,3,4
0,1.0,0.231327,0.173213,-0.028917,0.192474
1,0.231327,1.0,0.191945,0.071269,0.200526
2,0.173213,0.191945,1.0,0.067143,0.370171
3,-0.028917,0.071269,0.067143,1.0,0.16791
4,0.192474,0.200526,0.370171,0.16791,1.0


In [29]:
movie_sim_df.shape

(9724, 9724)

In [33]:
def get_similar_movies(movieid,topN=5):
    movieidx=movies_df[movies_df.movieId==movieid].index[0]
    movies_df['similarity']=movie_sim_df.iloc[movieidx]
    top_n=movies_df.sort_values(['similarity'],ascending=False)[0:topN]
    return top_n


In [34]:
movies_df[movies_df.movieId==858]

Unnamed: 0,movieId,title
843,858,"Godfather, The (1972)"


In [35]:
get_similar_movies(858)

Unnamed: 0,movieId,title,similarity
843,858,"Godfather, The (1972)",1.0
8630,26131,"Battle of Algiers, The (La battaglia di Algeri...",0.866952
8088,8771,Sherlock Holmes: Terror by Night (1946),0.763865
7048,7160,Monster (2003),0.749748
6080,6179,"Prayer for the Dying, A (1987)",0.732239


In [36]:
get_similar_movies(858)

Unnamed: 0,movieId,title,similarity
843,858,"Godfather, The (1972)",1.0
8630,26131,"Battle of Algiers, The (La battaglia di Algeri...",0.866952
8088,8771,Sherlock Holmes: Terror by Night (1946),0.763865
7048,7160,Monster (2003),0.749748
6080,6179,"Prayer for the Dying, A (1987)",0.732239


In [37]:
movies_df[movies_df.movieId == 231]

Unnamed: 0,movieId,title,similarity
228,231,Dumb & Dumber (Dumb and Dumber) (1994),-0.018576


In [38]:
get_similar_movies(231)

Unnamed: 0,movieId,title,similarity
228,231,Dumb & Dumber (Dumb and Dumber) (1994),1.0
444,448,Fearless (1993),0.442149
191,193,Showgirls (1995),0.415626
413,417,Barcelona (1994),0.37703
265,268,Little Odessa (1994),0.372662
