In [1]:
import pandas as pd
import numpy as np
import os
from sklearn import metrics
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.utils.extmath import randomized_svd

In [32]:
df = pd.read_csv('./ml-100k/u.data', sep = '\t', names = ['user id', 'movie id', 'rating', 'timestamp'])
df.head()

Unnamed: 0,user id,movie id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
df.shape

(99999, 4)

In [27]:
df_movie = pd.read_csv('./ml-100k/u.item', sep ='|', encoding = 'latin-1', names = ['movie id', 'movie title', 'release date', 'video release date',
              'IMDb URL', 'unknown', 'Action' , 'Adventure' , 'Animation' ,
              'Children\'s', 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
              'Thriller' , 'War' , 'Western'])
df_movie.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


#### Check for data quality

In [33]:
#video release date column has nan values
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie id            1682 non-null   int64  
 1   movie title         1682 non-null   object 
 2   release date        1681 non-null   object 
 3   video release date  0 non-null      float64
 4   IMDb URL            1679 non-null   object 
 5   unknown             1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children's          1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null   int64  
 16  Horror

In [34]:
df.head()

Unnamed: 0,user id,movie id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [99]:
df_join = df.merge(df_movie, on = 'movie id', how ='inner',suffixes = ('_data', '_movie'))
df_join.head()

Unnamed: 0,user id,movie id,rating,timestamp,movie title,release date,video release date,IMDb URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,242,3,875747190,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
2,226,242,5,883888671,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
3,154,242,3,879138235,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
4,306,242,5,876503793,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
df_final = df_join[['user id', 'movie id', 'rating', 'movie title']]
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user id      100000 non-null  int64 
 1   movie id     100000 non-null  int64 
 2   rating       100000 non-null  int64 
 3   movie title  100000 non-null  object
dtypes: int64(3), object(1)
memory usage: 3.8+ MB


#### Create Adjaceny matrix aka user movie matrix of ratings 

In [51]:
data = df_final['rating']
row_indexes = df_final['user id']
col_indexes = df_final['movie id']
row_len = max(set(row_indexes))+ 1
col_len = max(set(col_indexes)) + 1
print(row_len, col_len)
adjacency_matrix = csr_matrix((data, (row_indexes, col_indexes)), shape = (row_len,col_len))  

944 1683


In [53]:
def m_u(df):
    return np.mean(df['rating'])
mu = m_u(df_final)
mu

3.52986

### Singular Value Decomposition & SGD for predicting ratings 

In [55]:
b = np.zeros(row_len,)
c = np.zeros(col_len,)
epochs = 50
hidden_dim = 50
alpha = 1e0
lr = 1e-2
U, Sigma, VT = randomized_svd(adjacency_matrix, n_components = hidden_dim, random_state = 0)
V = VT.T
num_users= row_len
num_movies = col_len
N = len(df_final)

for epo in range(epochs):
    #compute
    cx = adjacency_matrix.tocoo()    
    for i,j,v in zip(cx.row, cx.col, cx.data):
        y_hat = mu + b[i] + c[j] + np.dot(U[i], V[j])
        e = v - y_hat
        delta = 0
        dL_dbi = 2*(alpha*b[i] - (v-mu-b[i]-c[j]-np.dot(U[i],V[j])))
        dL_dcj = 2*(alpha*c[j] - (v-mu-b[i]-c[j]-np.dot(U[i],V[j])))

        b[i] = b[i] - lr*dL_dbi   #SGD
        c[j] = c[j] - lr*dL_dcj   #SGD
    mse = 0
    for i,j,v in zip(cx.row, cx.col, cx.data):
        #import pdb;pdb.set_trace()
        mse+=pow((v-(mu+b[i]+c[j]+np.dot(U[i],V[j]))), 2)
    mse = mse/N
    print('for epoch', epo, 'mean sq error is', mse)

for epoch 0 mean sq error is 0.983977688004091
for epoch 1 mean sq error is 0.965391538329906
for epoch 2 mean sq error is 0.9596015663491471
for epoch 3 mean sq error is 0.9567969279349979
for epoch 4 mean sq error is 0.9552005130390343
for epoch 5 mean sq error is 0.9541963370004276
for epoch 6 mean sq error is 0.9535201949918205
for epoch 7 mean sq error is 0.9530421863630782
for epoch 8 mean sq error is 0.952691870386233
for epoch 9 mean sq error is 0.9524280663143184
for epoch 10 mean sq error is 0.9522252172911231
for epoch 11 mean sq error is 0.9520666790855472
for epoch 12 mean sq error is 0.9519411772844554
for epoch 13 mean sq error is 0.9518408237201988
for epoch 14 mean sq error is 0.9517599488212954
for epoch 15 mean sq error is 0.9516943841668686
for epoch 16 mean sq error is 0.9516410051823873
for epoch 17 mean sq error is 0.9515974303217938
for epoch 18 mean sq error is 0.9515618177397867
for epoch 19 mean sq error is 0.9515327245843278
for epoch 20 mean sq error is 0.9

### Recommending to a user 

In [59]:
# We find 50 most similar users using centered cosine similarity and take a weighted average of their ratings to
# compute a predicted rating for the user, we recommend it if its greater than a threshold
df_final.groupby('movie id')['user id'].count().sort_values(ascending = False)

movie id
50      583
258     509
100     508
181     507
294     485
       ... 
1452      1
1593      1
1447      1
814       1
1682      1
Name: user id, Length: 1682, dtype: int64

In [66]:
df_final[(df_final['movie id']==583) & (df_final['user id'] == 1)]

Unnamed: 0,user id,movie id,rating,movie title


In [93]:
corr = np.corrcoef(U)[0]
df_corr = pd.DataFrame({'correlation':corr})
df_corr['user id'] = df_corr.index+1
df_corr.head()

Unnamed: 0,correlation,user id
0,1.0,1
1,-0.373865,2
2,0.06624,3
3,0.053262,4
4,-0.189145,5


In [103]:
top_50 = df_corr.loc[df_corr['correlation'].sort_values(ascending=False)[1:51].index]
top_50.head()

Unnamed: 0,correlation,user id
27,0.561704,28
9,0.378655,10
858,0.37728,859
696,0.371654,697
359,0.367861,360


In [138]:
#add predicted rating for movie id 583
ratings = []
for index in top_50.index:
    #check for rating
    if(df[(df['user id'] == index) & (df['movie id']==583)].shape[0]==1):
        print('rated')
        ratings.append(df['rating'][(df['user id'] == index) & (df['movie id']==583)].values[0])
    else:
        #predict rating
        y_hat = mu + b[index] + c[583] + np.dot(U[index], V[583])
        print('predicted rating', y_hat)
        ratings.append(y_hat)
top_50['ratings'] = ratings
top_50.head()

predicted rating 3.1033946471740816
predicted rating 3.541615825259249
predicted rating 3.208178673230342
predicted rating 3.3645316787054864
predicted rating 3.46376292055824
predicted rating 3.405835708885782
predicted rating 3.168045028027042
predicted rating 3.264506849631818
predicted rating 3.308900468694599
predicted rating 3.211159850310367
predicted rating 2.999511436935638
predicted rating 3.174787880364791
predicted rating 3.0768532958542343
predicted rating 3.1272563290124284
predicted rating 3.0653119658731174
predicted rating 3.539816422323756
predicted rating 3.4885785373590017
predicted rating 3.690407446258111
predicted rating 3.272495903948457
rated
predicted rating 3.164507524501587
predicted rating 3.223268588355102
predicted rating 3.435582667556944
predicted rating 2.461724702975367
predicted rating 3.3659821209426566
rated
predicted rating 3.519902388639959
predicted rating 3.3330800839720736
predicted rating 3.0446492298637726
predicted rating 3.2999212053958664

Unnamed: 0,correlation,user id,ratings
27,0.561704,28,3.103395
9,0.378655,10,3.541616
858,0.37728,859,3.208179
696,0.371654,697,3.364532
359,0.367861,360,3.463763


In [142]:
np.sum((top_50['correlation'] * top_50['ratings']))/np.sum(top_50['correlation'])

3.2157624711108332

In [140]:
np.sum(k)

46.87926386091353

In [150]:
df_join[df_join['user id'] == 1]

Unnamed: 0,user id,movie id,rating,timestamp,movie title,release date,video release date,IMDb URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
14,1,242,5,889751633,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
433,1,51,4,878543275,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%...,0,0,...,0,0,0,0,0,1,0,0,1,1
832,1,265,4,878542441,"Hunt for Red October, The (1990)",01-Jan-1990,,http://us.imdb.com/M/title-exact?Hunt+for+Red+...,0,1,...,0,0,0,0,0,0,0,1,0,0
1336,1,86,5,878543541,"Remains of the Day, The (1993)",01-Jan-1993,,http://us.imdb.com/M/title-exact?Remains%20of%...,0,0,...,0,0,0,0,0,0,0,0,0,0
1573,1,257,4,874965954,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97675,1,18,4,887432020,"White Balloon, The (1995)",01-Jan-1995,,http://us.imdb.com/M/title-exact?Badkonake%20S...,0,0,...,0,0,0,0,0,0,0,0,0,0
98210,1,247,1,875241619,Turbo: A Power Rangers Movie (1997),28-Mar-1997,,http://us.imdb.com/M/title-exact?Turbo%3A%20A%...,0,1,...,0,0,0,0,0,0,0,0,0,0
98261,1,35,1,878542420,Free Willy 2: The Adventure Home (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Free%20Willy%...,0,0,...,0,0,0,0,0,0,0,0,0,0
98510,1,130,3,875072002,Kansas City (1996),16-Aug-1996,,http://us.imdb.com/M/title-exact?Kansas%20City...,0,0,...,0,0,0,0,0,0,0,0,0,0


In [151]:
df_join[df_join['movie id']==583]

Unnamed: 0,user id,movie id,rating,timestamp,movie title,release date,video release date,IMDb URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
66889,268,583,4,876513830,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
66890,224,583,1,888103729,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
66891,59,583,5,888205921,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
66892,276,583,3,874791444,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
66893,375,583,2,886622131,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
66894,327,583,2,887820341,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
66895,293,583,3,888908001,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
66896,389,583,2,880088039,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
66897,405,583,1,885546112,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
66898,94,583,3,891722174,Romeo Is Bleeding (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Romeo%20Is%20...,0,0,...,0,0,0,0,0,0,0,1,0,0
