## Movie Recommendation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the ratings data from Movie.csv (path is relative to the notebook's working directory)
movies_df=pd.read_csv("Movie.csv")

In [3]:
# Preview the first five rating records
movies_df.head(5)

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [4]:
# Count the distinct users present in the ratings data
# (dropna=False keeps the count identical to len(unique()), which counts NaN as a value)
movies_df["userId"].nunique(dropna=False)

4081

In [5]:
# Count the distinct movies present in the ratings data
# (dropna=False keeps the count identical to len(unique()), which counts NaN as a value)
movies_df["movie"].nunique(dropna=False)

10

In [6]:
# Reshape the long ratings table into a user x movie matrix: one row per userId,
# one column per movie, ratings as the cell values. reset_index(drop=True) then
# discards the userId row labels in favour of a plain 0..n-1 positional index.
user_movies_df = (
    movies_df
    .pivot(index="userId", columns="movie", values="rating")
    .reset_index(drop=True)
)

In [7]:
# Show the user x movie ratings matrix; empty cells are movies the user has not rated
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
0,,,,,3.5,,,,,
1,,,4.0,,,,,,,
2,,,,,,,,,4.0,
3,,4.0,,3.0,,,,,,
4,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
4076,4.0,,,,,,,,,
4077,3.5,,,,,,,,4.0,
4078,,3.0,4.0,5.0,,3.0,1.0,,4.0,
4079,,,,,,,,,5.0,


#### In the output above, "movie" is only the name of the column axis; the leftmost values (0, 1, 2, ...) are a positional row index, not user ids. To map each row back to its user id we need the unique user ids again, which the next cell stores in the variable "df_user".

In [8]:
# Distinct user ids as an array, in order of first appearance in the ratings data
df_user = movies_df["userId"].unique()

#### The "df_user" array from the previous cell is now stored in a DataFrame under the column "UserId".


In [9]:
# Wrap the user-id array in a single-column DataFrame so it can be displayed and sorted
df_userid = pd.DataFrame({"UserId": df_user})
df_userid

Unnamed: 0,UserId
0,3
1,6
2,8
3,10
4,11
...,...
4076,7044
4077,7070
4078,7080
4079,7087


#### The values in the "UserId" column above are not in sorted order, so they have to be sorted.

In [10]:
# Order the user ids ascending; each row keeps its original row label
df_user1 = df_userid.sort_values(by="UserId")
df_user1

Unnamed: 0,UserId
2569,1
2974,2
0,3
3294,4
2570,5
...,...
3293,7115
2565,7116
2566,7117
2567,7119


#### Assign the sorted user ids above as the row index of the pivot table created earlier (cell 6), so that the positional index shown under the "movie" header is replaced with the actual user ids.

In [11]:
# Relabel the matrix rows with the ascending user ids.
# NOTE(review): this assumes the pivot produced its rows in ascending userId
# order, so that row i really belongs to the i-th sorted id -- confirm.
user_movies_df.index = pd.Index(df_user1["UserId"].unique())

In [12]:
# Show the ratings matrix again, now with user ids as the row labels
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
1,,,,,3.5,,,,,
2,,,4.0,,,,,,,
3,,,,,,,,,4.0,
4,,4.0,,3.0,,,,,,
5,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
7115,4.0,,,,,,,,,
7116,3.5,,,,,,,,4.0,
7117,,3.0,4.0,5.0,,3.0,1.0,,4.0,
7119,,,,,,,,,5.0,


In [13]:
# Impute the missing ratings (NaN) with 0 so every user has a complete numeric
# rating vector for the similarity computation.
# The result is rebound instead of using inplace=True, so the cell stays a plain
# "input -> output" step and does not silently mutate the frame shown in earlier cells.
user_movies_df = user_movies_df.fillna(0)

In [14]:
# Show the ratings matrix after the missing ratings were replaced with 0
user_movies_df

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
1,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7115,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7116,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
7117,0.0,3.0,4.0,5.0,0.0,3.0,1.0,0.0,4.0,0.0
7119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [15]:
#Calculating cosine similarity between the users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation

In [16]:
# Cosine distance between every pair of user rating vectors, turned into a
# similarity score (1 - distance) so that a larger value means more alike
cosine_dist = pairwise_distances(user_movies_df.values, metric="cosine")
user_sim = 1 - cosine_dist

In [17]:
# Show the raw user-to-user similarity array
user_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 1.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 1.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        1.        ]])

#### We have calculated the similarity between every pair of users, so the size of the matrix is 4081 x 4081 (since there are 4081 unique users).

In [18]:
# Wrap the similarity array in a DataFrame so its axes can be labelled by user id
user_sim_df = pd.DataFrame(data=user_sim)

In [19]:
# Label both the rows and the columns of the similarity table with the sorted user ids
sorted_ids = df_user1["UserId"].unique()
user_sim_df.index = sorted_ids
user_sim_df.columns = sorted_ids

In [20]:
# Preview the top-left 15 x 15 corner of the similarity table
user_sim_df.iloc[:15, :15]

Unnamed: 0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514496,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.390567,0.707107,0.615457,0.0,0.0,0.437595,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.650945,0.0,0.492366,1.0,0.874157,0.58346,0.685994,0.789352,0.0,0.707107
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.615457,0.0,0.388514,0.262557,0.411597,0.0,0.8,0.424264
5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514496,0.0,0.0,0.0
6,0.0,0.390567,0.650945,0.0,0.0,1.0,0.73646,0.56088,0.650945,0.569028,0.83556,0.446544,0.913466,0.0,0.460287
7,0.0,0.707107,0.0,0.0,0.0,0.73646,1.0,0.435194,0.0,0.0,0.618853,0.0,0.434122,0.0,0.0
8,0.0,0.615457,0.492366,0.615457,0.0,0.56088,0.435194,1.0,0.492366,0.669519,0.71819,0.59108,0.38865,0.492366,0.609272
10,0.0,0.0,1.0,0.0,0.0,0.650945,0.0,0.492366,1.0,0.874157,0.58346,0.685994,0.789352,0.0,0.707107
11,0.0,0.0,0.874157,0.388514,0.0,0.569028,0.0,0.669519,0.874157,1.0,0.510036,0.849528,0.690018,0.485643,0.618123


#### As we know, the diagonal values in the above output are 1, because each one is the similarity of a user with itself (1st user with itself, 2nd user with itself, and so on). This is misleading, since every user would show the highest similarity with their own record, so we replace the diagonal values with 0.

In [21]:
# Zero out each user's similarity with themselves so that the "most similar
# user" lookup later cannot return the user's own id.
np.fill_diagonal(user_sim, 0)
# Rebuild the labelled table from the modified array. The original relied on the
# DataFrame sharing memory with user_sim; when pandas copies the array at
# construction time (e.g. with Copy-on-Write) the frame would keep its 1.0 diagonal.
user_sim_df = pd.DataFrame(
    user_sim,
    index=user_sim_df.index,
    columns=user_sim_df.columns,
)
user_sim_df.iloc[0:15, 0:15]

Unnamed: 0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514496,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.390567,0.707107,0.615457,0.0,0.0,0.437595,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.650945,0.0,0.492366,1.0,0.874157,0.58346,0.685994,0.789352,0.0,0.707107
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.615457,0.0,0.388514,0.262557,0.411597,0.0,0.8,0.424264
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514496,0.0,0.0,0.0
6,0.0,0.390567,0.650945,0.0,0.0,0.0,0.73646,0.56088,0.650945,0.569028,0.83556,0.446544,0.913466,0.0,0.460287
7,0.0,0.707107,0.0,0.0,0.0,0.73646,0.0,0.435194,0.0,0.0,0.618853,0.0,0.434122,0.0,0.0
8,0.0,0.615457,0.492366,0.615457,0.0,0.56088,0.435194,0.0,0.492366,0.669519,0.71819,0.59108,0.38865,0.492366,0.609272
10,0.0,0.0,1.0,0.0,0.0,0.650945,0.0,0.492366,0.0,0.874157,0.58346,0.685994,0.789352,0.0,0.707107
11,0.0,0.0,0.874157,0.388514,0.0,0.569028,0.0,0.669519,0.874157,0.0,0.510036,0.849528,0.690018,0.485643,0.618123


In [22]:
# For every user (row), idxmax returns the column label -- i.e. the other
# user's id -- that holds the largest similarity value in that row.
# Only the rows at positions 20..29 are shown.
most_similar = user_sim_df.idxmax(axis="columns")
most_similar.iloc[20:30]

24    2649
26      15
29     686
30     166
31       3
32     101
34     635
38    4553
39      16
41      61
dtype: int64

#### From the output, we can say that user 24 is most similar to user 2649, user 26 is most similar to user 15, and so on (the numbers are user ids, not row positions).

In [23]:
# Pull every rating made by the matched pair, user 38 and user 4553
pair_ids = [38, 4553]
movies_df[movies_df["userId"].isin(pair_ids)]

Unnamed: 0,userId,movie,rating
3730,38,Grumpier Old Men (1995),3.0
4165,4553,Grumpier Old Men (1995),3.0
6471,38,Sabrina (1995),5.0
6918,4553,Sabrina (1995),5.0
7164,38,Tom and Huck (1995),3.0


#### In the above output, the movies "Grumpier Old Men" and "Sabrina" have been watched and rated by both users, but "Tom and Huck" has been watched and rated only by user 38. Since users 38 and 4553 have similar watching and rating patterns, the next time user 4553 logs in, "Tom and Huck" will be recommended to him/her.

#### Here we have computed user-user similarity; the same approach can be applied to movie-movie similarity.