In [1]:
import pandas as pd
import numpy as np

# Loading Ratings dataset

In [3]:
rating_df = pd.read_csv(
    "F:/JupyterML/ML_Practice/datasets/ml-100k/u.data",
    sep = "\t",      # the ratings file is tab-separated
    header = None,   # no header row; columns get default integer labels
)

In [4]:
rating_df.head(4)

Unnamed: 0,0,1,2,3
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923


In [5]:
# Replace the default integer column labels with descriptive names.
rating_df.columns = [
    "userid",
    "movieid",
    "rating",
    "timestamp",
]

In [6]:
rating_df.head(4)

Unnamed: 0,userid,movieid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923


In [8]:
rating_df.shape

(100000, 4)

In [10]:
## Number of distinct user ids present in the ratings
rating_df["userid"].nunique( dropna = False )

943

In [11]:
## Number of distinct movie ids present in the ratings
rating_df["movieid"].nunique( dropna = False )

1682

In [12]:
## Drop the timestamp column; the visible cells only use userid, movieid and rating.
## Reassigning (instead of inplace=True) together with errors="ignore" keeps this
## cell idempotent: re-running it no longer raises KeyError once the column is gone.
rating_df = rating_df.drop( "timestamp", axis = 1, errors = "ignore" )

# Loading Movies Data

In [13]:
# The movie file is pipe-delimited. A literal "|" separator replaces the regex
# '\|' (whose "\|" is an invalid string escape and triggered a warning) and lets
# pandas use its default C parser instead of falling back to the python engine.
# NOTE(review): the MovieLens 100k u.item file is understood to be ISO-8859-1
# encoded, so the encoding is stated explicitly rather than left to the platform
# default — confirm against the dataset README.
movies_df = pd.read_csv(
    "F:/JupyterML/ML_Practice/datasets/ml-100k/u.item",
    sep = "|",
    header = None,
    encoding = "latin-1",
)

  """Entry point for launching an IPython kernel.


In [14]:
movies_df.head(4)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Keep only the first two columns (movie id and title). The explicit .copy()
# makes movies_df an independent frame, so adding a 'similarity' column to it
# later does not write into a slice (SettingWithCopyWarning).
movies_df = movies_df.iloc[:, :2].copy()
movies_df.columns = ['movieid', 'title']

In [16]:
movies_df.head(4)

Unnamed: 0,movieid,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)


# Finding User Similarities

In [19]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine,correlation

In [30]:
## Build the user x movie matrix: one row per user, one column per movie.
## Cells hold the rating and are NaN where the user has not rated the movie.
user_movies_df = (
    rating_df
    .pivot( index = 'userid', columns = 'movieid', values = "rating" )
    .reset_index( drop = True )   # row labels become 0-based positions, not user ids
)

In [31]:
user_movies_df.head(4)

movieid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
1,4.0,,,,,,,,,2.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,


# Fill '0' for ratings not given by users

In [32]:
# Treat "not rated" as a rating of 0 so every cell is numeric.
user_movies_df = user_movies_df.fillna( 0 )

In [33]:
user_movies_df.shape

(943, 1682)

In [34]:
user_movies_df.iloc[10:20, 20:30]

movieid,21,22,23,24,25,26,27,28,29,30
10,0.0,4.0,0.0,3.0,3.0,0.0,0.0,5.0,3.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
12,3.0,4.0,5.0,1.0,1.0,0.0,3.0,5.0,2.0,0.0
13,0.0,3.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
15,0.0,5.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,5.0,4.0,0.0,3.0,4.0,0.0,3.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Cosine similarity between every pair of user rating vectors
# (similarity = 1 - cosine distance).
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# it returns the same ndarray and is available on old and new pandas alike.
user_sim = 1 - pairwise_distances( user_movies_df.values, metric = "cosine" )

In [40]:
user_sim_df = pd.DataFrame( user_sim )

In [41]:
user_sim_df[0:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
1,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
2,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
3,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
4,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [42]:
user_sim_df.idxmax(axis=1)[0:5]

0    0
1    1
2    2
3    3
4    4
dtype: int64

The results above show that each user is most similar to themselves, which is expected but not useful. To find the most similar *other* user, we fill the diagonal of the matrix (which represents each user's similarity with themselves) with 0.

In [43]:
# Zero the diagonal of user_sim in place so that a user's similarity with
# themselves no longer wins the row-wise idxmax.
np.fill_diagonal( user_sim, 0 )

In [44]:
user_sim_df = pd.DataFrame( user_sim )

In [45]:
user_sim_df[0:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,0.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
1,0.166931,0.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
2,0.04746,0.110591,0.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
3,0.064358,0.178121,0.344151,0.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
4,0.378475,0.072979,0.021245,0.031804,0.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


# Finding user similarities

In [47]:
user_sim_df.idxmax(axis=1).sample(10,random_state = 10)

544    756
309    246
448    893
628    537
284    413
572    693
225    866
567    311
75     176
726    496
dtype: int64

This shows which users are most similar to each other. Because the index is 0-based, the actual user id is the index number + 1. That means user 545 is most similar to user 757, and so on.

In [48]:
def get_user_similar_movies( user1, user2 ):
  """Return the movies that both `user1` and `user2` have rated.

  Reads the module-level `rating_df` and `movies_df`. The result has one row
  per movie rated by both users: the `_x` columns carry `user1`'s values,
  the `_y` columns carry `user2`'s values, followed by the columns of
  `movies_df` (joined on 'movieid').
  """
  first_user_ratings = rating_df.loc[rating_df["userid"] == user1]
  second_user_ratings = rating_df.loc[rating_df["userid"] == user2]

  # The inner join keeps only the movie ids present for both users.
  rated_by_both = first_user_ratings.merge(
      second_user_ratings,
      how = "inner",
      on = "movieid" )

  with_titles = rated_by_both.merge( movies_df, on = 'movieid' )
  return with_titles

In [50]:
get_user_similar_movies( 545, 757 )

Unnamed: 0,userid_x,movieid,rating_x,userid_y,rating_y,title
0,545,554,3,757,3,Waterworld (1995)
1,545,222,4,757,4,Star Trek: First Contact (1996)
2,545,399,4,757,3,"Three Musketeers, The (1993)"
3,545,210,5,757,4,Indiana Jones and the Last Crusade (1989)
4,545,101,4,757,4,Heavy Metal (1981)
5,545,228,5,757,4,Star Trek: The Wrath of Khan (1982)
6,545,89,3,757,4,Blade Runner (1982)
7,545,229,3,757,3,Star Trek III: The Search for Spock (1984)
8,545,472,5,757,3,Dragonheart (1996)
9,545,226,3,757,3,Die Hard 2 (1990)


In [51]:
get_user_similar_movies(310,247)

Unnamed: 0,userid_x,movieid,rating_x,userid_y,rating_y,title
0,310,258,3,247,5,Contact (1997)
1,310,257,5,247,4,Men in Black (1997)
2,310,251,5,247,4,Shall We Dance? (1996)
3,310,1022,5,247,4,"Fast, Cheap & Out of Control (1997)"
4,310,222,3,247,3,Star Trek: First Contact (1996)
5,310,181,4,247,4,Return of the Jedi (1983)
6,310,50,5,247,5,Star Wars (1977)


# Challenges with User similarity

The challenge with calculating user similarity is that the user needs to have made some prior purchases and rated them. This recommendation technique does not work for new users: the system needs to wait until the user makes some purchases and rates them. Only then can similar users be found and recommendations be made. This is called the cold start problem.
This can be avoided by calculating item similarities based on how users buy these items and rate them together. Here the items are the entities and the users are the dimensions.

In [52]:
# Item-based view: one row per movie, one column per user, cells hold the rating.
rating_mat = (
    rating_df
    .pivot( index = 'movieid', columns = 'userid', values = "rating" )
    .reset_index( drop = True )   # row labels become 0-based positions, not movie ids
)

In [53]:
# Treat "not rated" as a rating of 0 so every cell is numeric.
rating_mat = rating_mat.fillna( 0 )

In [54]:
rating_mat.shape

(1682, 943)

In [55]:
rating_mat.head( 10 )

userid,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,2.0,5.0,3.0,4.0,4.0,...,0.0,0.0,4.0,0.0,4.0,0.0,4.0,4.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
8,5.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,4.0,...,0.0,1.0,4.0,5.0,3.0,5.0,3.0,0.0,0.0,3.0
9,3.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# Correlation-based similarity between every pair of movie rating vectors
# (similarity = 1 - correlation distance).
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
# it returns the same ndarray and is available on old and new pandas alike.
movie_sim = 1 - pairwise_distances( rating_mat.values, metric = "correlation" )

In [57]:
movie_sim.shape

(1682, 1682)

In [58]:
movie_sim_df = pd.DataFrame( movie_sim )

In [59]:
movie_sim_df.head( 10 )

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,1.0,0.234595,0.193362,0.226213,0.12884,0.015113,0.347354,0.25449,0.209502,0.104655,...,0.018215,-0.029676,-0.029676,-0.029676,0.018215,-0.029676,-0.029676,-0.029676,0.034179,0.034179
1,0.234595,1.0,0.190649,0.409044,0.240712,0.030062,0.220022,0.20602,0.077894,0.072906,...,-0.012451,-0.012451,-0.012451,-0.012451,-0.012451,-0.012451,-0.012451,-0.012451,0.071415,0.071415
2,0.193362,0.190649,1.0,0.227849,0.141368,0.065347,0.258855,0.078636,0.146181,0.079608,...,-0.009764,-0.009764,-0.009764,-0.009764,0.023964,-0.009764,-0.009764,-0.009764,-0.009764,0.091421
3,0.226213,0.409044,0.227849,1.0,0.237298,0.021878,0.295489,0.3528,0.229922,0.13822,...,-0.016619,-0.016619,0.088984,0.088984,0.025622,-0.016619,-0.016619,-0.016619,0.046743,0.067863
4,0.12884,0.240712,0.141368,0.237298,1.0,-0.008594,0.205289,0.145866,0.142541,-0.033746,...,-0.009889,-0.009889,-0.009889,-0.009889,-0.009889,-0.009889,-0.009889,-0.009889,-0.009889,0.088618
5,0.015113,0.030062,0.065347,0.021878,-0.008594,1.0,0.054415,0.01233,0.079619,0.166084,...,-0.005159,-0.005159,-0.005159,-0.005159,-0.005159,-0.005159,-0.005159,-0.005159,-0.005159,-0.005159
6,0.347354,0.220022,0.258855,0.295489,0.205289,0.054415,1.0,0.19067,0.286572,0.178505,...,-0.026036,0.03992,-0.026036,-0.026036,0.03992,-0.026036,-0.026036,-0.026036,0.03992,0.03992
7,0.25449,0.20602,0.078636,0.3528,0.145866,0.01233,0.19067,1.0,0.229331,0.152679,...,-0.01723,0.075617,0.057047,0.057047,0.075617,-0.01723,-0.01723,-0.01723,0.075617,-0.01723
8,0.209502,0.077894,0.146181,0.229922,0.142541,0.079619,0.286572,0.229331,1.0,0.158373,...,-0.021125,-0.021125,0.047273,0.047273,0.064372,-0.021125,-0.021125,-0.021125,0.047273,0.064372
9,0.104655,0.072906,0.079608,0.13822,-0.033746,0.166084,0.178505,0.152679,0.158373,1.0,...,-0.010138,-0.010138,0.073967,0.073967,-0.010138,-0.010138,-0.010138,-0.010138,-0.010138,-0.010138


# Finding similar movies to "Toy Story"

In [60]:
# Row 0 of the similarity matrix holds every movie's similarity to movie id 1
# ("Toy Story"). The assignment aligns on the 0-based row index, which both
# frames share. The new column is already named 'similarity', so the column
# labels need no further re-assignment.
movies_df['similarity'] = movie_sim_df.iloc[0]

In [61]:
movies_df.head( 10 )

Unnamed: 0,movieid,title,similarity
0,1,Toy Story (1995),1.0
1,2,GoldenEye (1995),0.234595
2,3,Four Rooms (1995),0.193362
3,4,Get Shorty (1995),0.226213
4,5,Copycat (1995),0.12884
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,0.015113
6,7,Twelve Monkeys (1995),0.347354
7,8,Babe (1995),0.25449
8,9,Dead Man Walking (1995),0.209502
9,10,Richard III (1995),0.104655


In [62]:
movies_df.sort_values( ["similarity"], ascending = False )[1:10]

Unnamed: 0,movieid,title,similarity
49,50,Star Wars (1977),0.457677
120,121,Independence Day (ID4) (1996),0.454544
116,117,"Rock, The (1996)",0.431789
150,151,Willy Wonka and the Chocolate Factory (1971),0.423975
180,181,Return of the Jedi (1983),0.422991
404,405,Mission: Impossible (1996),0.41677
94,95,Aladdin (1992),0.407829
117,118,Twister (1996),0.404908
221,222,Star Trek: First Contact (1996),0.391073


In [63]:
def get_similar_movies( movieid, topN = 5 ):
  """Return the `topN` movies most similar to the movie with id `movieid`.

  Similarities are read from the module-level `movie_sim_df`. Position p of
  that matrix is taken to correspond to movie id p + 1 and to row p of
  `movies_df` (the `movieid - 1` lookup relies on this).

  The shared `movies_df` is left unmodified; the 'similarity' column is
  attached to a copy, so calling this function has no effect on other cells.

  Parameters
  ----------
  movieid : int
      1-based movie id to look up.
  topN : int, default 5
      Number of rows to return. The queried movie itself takes part in the
      ranking, so it normally comes first with similarity 1.0.

  Returns
  -------
  pandas.DataFrame
      Rows of `movies_df` with a 'similarity' column, sorted by descending
      similarity and truncated to `topN` rows.

  Raises
  ------
  IndexError
      If `movieid` is outside 1..len(movie_sim_df). Without this check a
      movieid of 0 would silently wrap around to the last movie.
  """
  row = movieid - 1
  if row < 0 or row >= len( movie_sim_df ):
    raise IndexError(
        "movieid must be between 1 and %d, got %r" % ( len( movie_sim_df ), movieid ) )

  # assign() aligns the similarity row on the index, exactly like column
  # assignment would, but returns a new frame instead of mutating movies_df.
  scored = movies_df.assign( similarity = movie_sim_df.iloc[row] )
  top_n = scored.sort_values( ["similarity"], ascending = False )[0:topN]

  # Name the movie the results refer to (the label used to be printed alone).
  print( "Similar Movies to: " + str( movies_df['title'].iloc[row] ) )
  return top_n

In [64]:
get_similar_movies( 118 )

Similar Movies to: 


Unnamed: 0,movieid,title,similarity
117,118,Twister (1996),1.0
120,121,Independence Day (ID4) (1996),0.629867
404,405,Mission: Impossible (1996),0.542379
545,546,Broken Arrow (1996),0.509549
596,597,Eraser (1996),0.489693


In [65]:
get_similar_movies( 127, 10 )

Similar Movies to: 


Unnamed: 0,movieid,title,similarity
126,127,"Godfather, The (1972)",1.0
186,187,"Godfather: Part II, The (1974)",0.543335
49,50,Star Wars (1977),0.409379
181,182,GoodFellas (1990),0.396741
22,23,Taxi Driver (1976),0.369608
99,100,Fargo (1996),0.345218
179,180,Apocalypse Now (1979),0.334691
191,192,Raging Bull (1980),0.331378
356,357,One Flew Over the Cuckoo's Nest (1975),0.331135
233,234,Jaws (1975),0.325565


In [66]:
get_similar_movies( 228)

Similar Movies to: 


Unnamed: 0,movieid,title,similarity
227,228,Star Trek: The Wrath of Khan (1982),1.0
228,229,Star Trek III: The Search for Spock (1984),0.747498
229,230,Star Trek IV: The Voyage Home (1986),0.723112
226,227,Star Trek VI: The Undiscovered Country (1991),0.685605
175,176,Aliens (1986),0.590461
