In [1]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [23]:
movies = pd.read_csv('moviesCollab.csv', usecols=['movieId','title','genres'])
ratings = pd.read_csv('ratingsCollab.csv', usecols=['userId','movieId','rating','timestamp'])

In [3]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [4]:
df_movies = movies 
df_ratings = ratings 

In [17]:
print(df_movies.shape)
df_movies.head()

(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
df_movies_ratings=pd.merge(df_movies, df_ratings)

In [6]:
print(df_movies_ratings.shape)
df_movies_ratings

(100836, 6)


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [7]:
ratings_matrix_items = df_movies_ratings.pivot_table(index=['movieId'],columns=['userId'],values='rating')
print(ratings_matrix_items.shape)
ratings_matrix_items

(9724, 610)


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [8]:
ratings_matrix_items.reset_index(drop=True, inplace=True)
ratings_matrix_items

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
0,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
1,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
2,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
3,,,,,,3.0,,,,,...,,,,,,,,,,
4,,,,,,5.0,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,,,,,,,,,,,...,,,,,,,,,,
9720,,,,,,,,,,,...,,,,,,,,,,
9721,,,,,,,,,,,...,,,,,,,,,,
9722,,,,,,,,,,,...,,,,,,,,,,


In [9]:
ratings_matrix_items.fillna( 0, inplace = True )
ratings_matrix_items

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
0,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
1,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9720,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


By default when we call pairwise_distances the metric is euclidean

In [10]:
# movie_similarity = 1 - pairwise_distances(ratings_matrix_items.to_numpy())
# movie_similarity

Here we are finding the distance matrix using cosine metric (that generally means using cosine similarity)

In [11]:
movie_similarity = 1 - pairwise_distances(ratings_matrix_items.to_numpy(), metric="cosine" )
movie_similarity

array([[1.        , 0.41056206, 0.2969169 , ..., 0.        , 0.        ,
        0.        ],
       [0.41056206, 1.        , 0.28243799, ..., 0.        , 0.        ,
        0.        ],
       [0.2969169 , 0.28243799, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [12]:
np.fill_diagonal( movie_similarity, 0 ) #Filling diagonals with 0s for future use when sorting is done
ratings_matrix_items = pd.DataFrame( movie_similarity )
ratings_matrix_items

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
0,0.000000,0.410562,0.296917,0.035573,0.308762,0.376316,0.277491,0.131629,0.232586,0.395573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.410562,0.000000,0.282438,0.106415,0.287795,0.297009,0.228576,0.172498,0.044835,0.417693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.296917,0.282438,0.000000,0.092406,0.417802,0.284257,0.402831,0.313434,0.304840,0.242954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.035573,0.106415,0.092406,0.000000,0.188376,0.089685,0.275035,0.158022,0.000000,0.095598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.308762,0.287795,0.417802,0.188376,0.000000,0.298969,0.474002,0.283523,0.335058,0.218061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
9720,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
9721,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
9722,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


# If you want to check then run otherwise fine.

In [13]:
# corrMatrix = ratings_matrix_items.T.corr(method='pearson')
# print(corrMatrix.shape)
# corrMatrix.head(10)

In [20]:
inp=df_movies[df_movies['title']=="Jumanji (1995)"].index.tolist()
# inp=df_movies[df_movies['title']=="Andrew Dice Clay: Dice Rules (1991)"].index.tolist()
inp=inp[0]
inp

1

In [22]:
df_movies['similarity'] = ratings_matrix_items.iloc[inp]
df_movies.columns = ['movie_id', 'title', 'release_date','similarity']
df_movies.iloc[9720:9730]

Unnamed: 0,movie_id,title,release_date,similarity
9720,189333,Mission: Impossible - Fallout (2018),Action|Adventure|Thriller,0.0
9721,189381,SuperFly (2018),Action|Crime|Thriller,0.0
9722,189547,Iron Soldier (2010),Action|Sci-Fi,0.0
9723,189713,BlacKkKlansman (2018),Comedy|Crime|Drama,0.0
9724,190183,The Darkest Minds (2018),Sci-Fi|Thriller,
9725,190207,Tilt (2011),Drama|Romance,
9726,190209,Jeff Ross Roasts the Border (2017),Comedy,
9727,190213,John From (2015),Drama,
9728,190215,Liquid Truth (2017),Drama,
9729,190219,Bunny (1998),Animation,


In [85]:
def item_similarity(movieName): 
    inp=df_movies[df_movies['title']==movieName].index.tolist()
    inp=inp[0]

    df_movies['similarity'] = ratings_matrix_items.iloc[inp]
    df_movies.columns = ['movie_id', 'title', 'genre','similarity']

So We are just taking the movie that the user rated it in between 4.5 to 5 

In [86]:
user_movie= df_movies_ratings[(df_movies_ratings.userId==50) & df_movies_ratings.rating.isin([5,4.5])][['title']]
user_movie

Unnamed: 0,title
21083,2001: A Space Odyssey (1968)
25518,Lawrence of Arabia (1962)
25748,Apocalypse Now (1979)
27802,8 1/2 (8½) (1963)


We have only single column that's why 0,0 means the the 0th row and 0th column

In [87]:
user_movie=user_movie.iloc[0,0]
user_movie

'2001: A Space Odyssey (1968)'

In [88]:
item_similarity(user_movie)
sorted_movies_as_per_userChoice=df_movies.sort_values( ["similarity"], ascending = False )
sorted_movies_as_per_userChoice

Unnamed: 0,movie_id,title,release_date,similarity
474,541,Blade Runner (1982),Action|Sci-Fi|Thriller,0.670736
914,1213,Goodfellas (1990),Crime|Drama,0.569947
908,1207,To Kill a Mockingbird (1962),Drama,0.569111
901,1199,Brazil (1985),Fantasy|Sci-Fi,0.563665
906,1204,Lawrence of Arabia (1962),Adventure|Drama|War,0.561598
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,
9739,193585,Flint (2017),Drama,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,


In [89]:
gettingMovieId=sorted_movies_as_per_userChoice[sorted_movies_as_per_userChoice['similarity'] >=0.45]['movie_id']
print(gettingMovieId.shape)
gettingMovieId.head()

(59,)


474     541
914    1213
908    1207
901    1199
906    1204
Name: movie_id, dtype: int64

All the movies id that the user have already rated

In [90]:
user2Movies= df_ratings[df_ratings['userId']==50]['movieId']
print(user2Movies.shape)
user2Movies.head()

(310,)


7112      1
7113     32
7114    111
7115    165
7116    296
Name: movieId, dtype: int64

We are taking all those that we have sorted it and the user haven't seen it so we can suggest that movies.

In [91]:
# df_recommended_item = pd.DataFrame()
# for movieId in gettingMovieId:
#         if movieId not in user2Movies:
#             new_item= sorted_movies_as_per_userChoice[(sorted_movies_as_per_userChoice.movie_id==movieId)]
#             df_recommended_item=pd.concat([df_recommended_item,new_item])
# print(df_recommended_item.shape)
# df_recommended_item

In [127]:
df_recommended_item = pd.DataFrame()
for movieId in gettingMovieId:
        if movieId not in user2Movies:
            new_item= df_ratings[(df_ratings.movieId==movieId)]
            df_recommended_item=pd.concat([df_recommended_item,new_item])
print(df_recommended_item.shape)
df_recommended_item

(3296, 4)


Unnamed: 0,userId,movieId,rating,timestamp
1580,16,541,4.5,1377477340
1681,17,541,3.5,1322628857
1818,18,541,4.0,1456745325
2386,19,541,4.0,965703555
3240,21,541,3.5,1452052129
...,...,...,...,...
85285,554,2857,2.0,944898992
97229,605,2857,2.5,1277176095
44632,298,1526,2.0,1466278019
73540,474,1526,0.5,1053021910


In [128]:
df_recommended_item = df_recommended_item.drop_duplicates(subset = 'movieId')
df_recommended_item

Unnamed: 0,userId,movieId,rating,timestamp
1580,16,541,4.5,1377477340
74,1,1213,5.0,964982951
1605,16,1207,4.0,1377476891
379,4,1199,2.0,945078967
4783,29,1204,4.5,1308084015
891,7,750,4.0,1106636810
13298,84,1194,3.0,860398764
70,1,1198,5.0,964981827
1707,17,1221,4.5,1305696752
2492,19,1238,3.0,965705784


In [129]:
best10=df_recommended_item.sort_values(["rating"], ascending = False )[0:10]
best10

Unnamed: 0,userId,movieId,rating,timestamp
9537,64,1096,5.0,1161536367
191,1,2949,5.0,964981888
1456,15,1200,5.0,1510572775
45,1,923,5.0,964981529
15,1,260,5.0,964981680
77,1,1220,5.0,964981909
380,4,1203,5.0,945174025
74,1,1213,5.0,964982951
62,1,1089,5.0,964982951
70,1,1198,5.0,964981827


In [136]:
# df_movies[df_movies['movie_id'] == 1198]['title']

900    Raiders of the Lost Ark (Indiana Jones and the...
Name: title, dtype: object

In [131]:
best10['movieId']

9537    1096
191     2949
1456    1200
45       923
15       260
77      1220
380     1203
74      1213
62      1089
70      1198
Name: movieId, dtype: int64

In [138]:
final_movie_list = pd.DataFrame()
for i in best10['movieId']:
    new_movie = df_movies[(df_movies['movie_id'] == i)]
    final_movie_list = pd.concat([final_movie_list, new_movie])
final_movie_list

Unnamed: 0,movie_id,title,release_date,similarity
835,1096,Sophie's Choice (1982),Drama,0.466874
2220,2949,Dr. No (1962),Action|Adventure|Thriller,0.496031
902,1200,Aliens (1986),Action|Adventure|Horror|Sci-Fi,0.500401
705,923,Citizen Kane (1941),Drama|Mystery,0.457812
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,0.495474
921,1220,"Blues Brothers, The (1980)",Action|Comedy|Musical,0.493115
905,1203,12 Angry Men (1957),Drama,0.484937
914,1213,Goodfellas (1990),Crime|Drama,0.569947
828,1089,Reservoir Dogs (1992),Crime|Mystery|Thriller,0.465971
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,0.526079


In [139]:
final_movie_list['title']

835                                Sophie's Choice (1982)
2220                                        Dr. No (1962)
902                                         Aliens (1986)
705                                   Citizen Kane (1941)
224             Star Wars: Episode IV - A New Hope (1977)
921                            Blues Brothers, The (1980)
905                                   12 Angry Men (1957)
914                                     Goodfellas (1990)
828                                 Reservoir Dogs (1992)
900     Raiders of the Lost Ark (Indiana Jones and the...
Name: title, dtype: object

# After similarity matrix we should run these code

In [357]:
def item_similarity(userMovie):
    userMovieIndex = df_movies[df_movies['title'] == userMovie].index.tolist()[0]
    df_movies['similarity'] = ratings_matrix_items.iloc[userMovieIndex]
    sorted_similar_movies = df_movies.sort_values(['similarity'], ascending = False)
    very_similar_movies = sorted_similar_movies[sorted_similar_movies['similarity'] >= 0.45]
    return very_similar_movies

In [358]:
data = item_similarity('Blade Runner (1982)')
data

Unnamed: 0,movieId,title,genres,similarity
706,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,0.670736
914,1213,Goodfellas (1990),Crime|Drama,0.607595
938,1238,Local Hero (1983),Comedy,0.588224
862,1135,Private Benjamin (1980),Comedy,0.583766
897,1194,Cheech and Chong's Up in Smoke (1978),Comedy,0.582449
...,...,...,...,...
4131,5943,Maid in Manhattan (2002),Comedy|Romance,0.455691
2978,3990,Rugrats in Paris: The Movie (2000),Animation|Children|Comedy,0.453969
902,1200,Aliens (1986),Action|Adventure|Horror|Sci-Fi,0.452029
919,1218,"Killer, The (Die xue shuang xiong) (1989)",Action|Crime|Drama|Thriller,0.451630


In [359]:
def recommended_movie(userId):
    user_movie_dataframe = df_movies_ratings[(df_movies_ratings['userId'] == userId) & df_movies_ratings['rating'].isin([5,4.5])][['title']]
    user_movie = user_movie_dataframe.iloc[0,0]
    very_similar_movies = item_similarity(user_movie)
    
    user_movieId = df_ratings[df_ratings['userId'] == userId]['movieId']
    similar_movieId = very_similar_movies['movieId']
    
    df_recommended_movies = pd.DataFrame()
    final_best10 = pd.DataFrame()
    
    for i in similar_movieId:
        if i not in user_movieId:
            df_new = df_ratings[(df_ratings['movieId'] == i)]
            df_recommended_movies = pd.concat([df_recommended_movies, df_new])
    
    df_recommended_movies = df_recommended_movies.sort_values('rating', ascending = False)
    df_recommended_movies = df_recommended_movies.drop_duplicates(subset = 'movieId')
    
    best10 = df_recommended_movies[0:10]
    
    for i in best10['movieId']:
        new_movie = df_movies[df_movies['movieId'] == i]
        final_best10 = pd.concat([final_best10, new_movie])
        
    return final_best10

In [360]:
user_movie = recommended_movie(50)
user_movie

Unnamed: 0,movieId,title,genres,similarity
520,608,Fargo (1996),Comedy|Crime|Drama|Thriller,0.496841
659,858,"Godfather, The (1972)",Crime|Drama,0.488569
922,1221,"Godfather: Part II, The (1974)",Crime|Drama,0.525002
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,0.451641
1644,2193,Willow (1988),Action|Adventure|Fantasy,0.465824
905,1203,12 Angry Men (1957),Drama,0.484937
742,969,"African Queen, The (1951)",Adventure|Comedy|Romance|War,0.461771
921,1220,"Blues Brothers, The (1980)",Action|Comedy|Musical,0.493115
919,1218,"Killer, The (Die xue shuang xiong) (1989)",Action|Crime|Drama|Thriller,0.512608
1575,2114,"Outsiders, The (1983)",Drama,0.460745


# Summary

First we will use pivot table to get the user rating to particular movie and from their we will find the distance matrix means similarity matrix between the movies.

We will get the title of movie from user whose rating is greater than 4.5 and according to that we will find the similarity matrix for that title and then we will sort them according to similarity so that the most similar movie is in the top.

We will get the movie Id of similar movie whose similarity is greater than 0.45 and we will also get user rated movie Id. Now we will get only those movie in our dataset that the user have not seen but it is similar.

Now we get the similar movie that the user have not seen so now we will sort them according to the other user rating and then we will recommend the top movie.