In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Movie.csv')

In [3]:
data.head()

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [5]:
print(data["movie"].unique())

['Toy Story (1995)' 'Jumanji (1995)' 'Grumpier Old Men (1995)'
 'Waiting to Exhale (1995)' 'Father of the Bride Part II (1995)'
 'Heat (1995)' 'Sabrina (1995)' 'Tom and Huck (1995)'
 'Sudden Death (1995)' 'GoldenEye (1995)']


In [6]:
len(data["movie"].unique())

10

In [7]:
#number of unique users in the dataset
len(data.userId.unique())

4081

In [8]:
user_data = data.pivot(index='userId',
                                 columns='movie',
                                 values='rating').reset_index(drop=True)

In [9]:
user_data

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
0,,,,,3.5,,,,,
1,,,4.0,,,,,,,
2,,,,,,,,,4.0,
3,,4.0,,3.0,,,,,,
4,,,,,3.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
4076,4.0,,,,,,,,,
4077,3.5,,,,,,,,4.0,
4078,,3.0,4.0,5.0,,3.0,1.0,,4.0,
4079,,,,,,,,,5.0,


In [15]:
#Impute those NaNs with 0 values
user_data.fillna(0, inplace=True)

In [16]:
user_data

movie,Father of the Bride Part II (1995),GoldenEye (1995),Grumpier Old Men (1995),Heat (1995),Jumanji (1995),Sabrina (1995),Sudden Death (1995),Tom and Huck (1995),Toy Story (1995),Waiting to Exhale (1995)
0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
3,0.0,4.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4076,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4077,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
4078,0.0,3.0,4.0,5.0,0.0,3.0,1.0,0.0,4.0,0.0
4079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0


In [17]:
#Calculating Cosine Similarity between Users
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [18]:
print(user_data.values)

[[0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  4.  0. ]
 ...
 [0.  3.  4.  ... 0.  4.  0. ]
 [0.  0.  0.  ... 0.  5.  0. ]
 [0.  0.  0.  ... 0.  4.5 0. ]]


In [19]:
print(pairwise_distances( user_data.values,metric='cosine'))

[[0.         1.         1.         ... 1.         1.         0.44662843]
 [1.         0.         1.         ... 0.54116853 1.         1.        ]
 [1.         1.         0.         ... 0.54116853 0.         0.37745698]
 ...
 [1.         0.54116853 0.54116853 ... 0.         0.54116853 0.52392946]
 [1.         1.         0.         ... 0.54116853 0.         0.37745698]
 [0.44662843 1.         0.37745698 ... 0.52392946 0.37745698 0.        ]]


In [20]:
user_sim = 1 - pairwise_distances( user_data.values,metric='cosine')

In [21]:
user_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.55337157],
       [0.        , 1.        , 0.        , ..., 0.45883147, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       ...,
       [0.        , 0.45883147, 0.45883147, ..., 1.        , 0.45883147,
        0.47607054],
       [0.        , 0.        , 1.        , ..., 0.45883147, 1.        ,
        0.62254302],
       [0.55337157, 0.        , 0.62254302, ..., 0.47607054, 0.62254302,
        1.        ]])

In [22]:
#Store the results in a dataframe
user_sim_df = pd.DataFrame(user_sim)

In [23]:
user_sim_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4071,4072,4073,4074,4075,4076,4077,4078,4079,4080
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.553372
1,0.0,1.0,0.0,0.0,0.0,0.390567,0.707107,0.615457,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.458831,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.650945,0.0,0.492366,1.0,0.874157,...,0.0,1.0,0.0,0.707107,0.0,0.0,0.752577,0.458831,1.0,0.622543
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.615457,0.0,0.388514,...,0.8,0.0,0.0,0.0,0.989949,0.0,0.0,0.619422,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.553372


In [24]:
#Set the index and column names to user ids 
user_sim_df.index = data.userId.unique()
user_sim_df.columns = data.userId.unique()

In [25]:
user_sim_df.head()

Unnamed: 0,3,6,8,10,11,12,13,14,16,19,...,6975,6979,6993,7030,7031,7044,7070,7080,7087,7105
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.553372
6,0.0,1.0,0.0,0.0,0.0,0.390567,0.707107,0.615457,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.458831,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,0.650945,0.0,0.492366,1.0,0.874157,...,0.0,1.0,0.0,0.707107,0.0,0.0,0.752577,0.458831,1.0,0.622543
10,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.615457,0.0,0.388514,...,0.8,0.0,0.0,0.0,0.989949,0.0,0.0,0.619422,0.0,0.0
11,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.553372


In [26]:
#for 1st five rows and column
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,3,6,8,10,11
3,1.0,0.0,0.0,0.0,1.0
6,0.0,1.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0
10,0.0,0.0,0.0,1.0,0.0
11,1.0,0.0,0.0,0.0,1.0


In [27]:
#diagonally fill 0instead of 1 for similar data(3,3....)
np.fill_diagonal(user_sim, 0)
user_sim_df.iloc[0:5, 0:5]

Unnamed: 0,3,6,8,10,11
3,0.0,0.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0
11,1.0,0.0,0.0,0.0,0.0


In [28]:
user_sim_df.idxmax(axis=1)
#for best recoomendation same value\1 
#so here 3=11 identical /similar view on one object

3         11
6        168
8         16
10      4047
11         3
        ... 
7044      80
7070    1808
7080     708
7087       8
7105    4110
Length: 4081, dtype: int64

In [29]:
data.head()

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
1,6,Toy Story (1995),5.0
2,8,Toy Story (1995),4.0
3,10,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5


In [30]:
#Most Similar Users
user_sim_df.idxmax(axis=1)[0:5]

3       11
6      168
8       16
10    4047
11       3
dtype: int64

In [31]:
data[(data['userId']==6) | (data['userId']==168)]

Unnamed: 0,userId,movie,rating
1,6,Toy Story (1995),5.0
60,168,Toy Story (1995),4.5
3725,6,Grumpier Old Men (1995),3.0
6464,6,Sabrina (1995),5.0


In [32]:
data[(data['userId']==3) | (data['userId']==11)]

Unnamed: 0,userId,movie,rating
0,3,Toy Story (1995),4.0
4,11,Toy Story (1995),4.5
7446,11,GoldenEye (1995),2.5


In [37]:
user_1=data[data['userId']==6]
print(user_1)

      userId                    movie  rating
1          6         Toy Story (1995)     5.0
3725       6  Grumpier Old Men (1995)     3.0
6464       6           Sabrina (1995)     5.0


In [34]:
user_2=data[df['userId']==11]
print(user_2)

NameError: name 'movies_df' is not defined

In [35]:
#on the basis of their likes we can say that the userid 3 and 11 have some overlapping so we suggest both of us their distict movie 
pd.merge(user_1,user_2,on='movie',how='outer')

NameError: name 'user_1' is not defined