In [3]:
# find MovieLense 20M dataset on kaggle
# download rating.csv

In [4]:
import pandas as pd
from collections import Counter
from sklearn.utils import shuffle
import pickle
import numpy as np
import datetime as datetime
import matplotlib.pyplot as plt
from sortedcontainers import SortedList

<h2>Preprocessing<h2>

In [5]:
df = pd.read_csv('rating.csv')

In [6]:
df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39


In [7]:
# IDs from 1 , we need to get them to start from 0
# thankfully no missing number in between
df.userId -= 1 

In [8]:
# movie ids from 1 with missing number in between
# make user id go from 0 - N-1 ,
# not all movies have ids
# this takes some time
movie_ids = set(df.movieId.values)
movie2idx = {k:v for v,k in enumerate(movie_ids)}
df['movie_idx'] = df.apply(lambda row: movie2idx[row.movieId] , axis = 1)


In [9]:
# drop unecessary timestamp col
df = df.drop(columns=['timestamp'])

In [10]:
# since that took some time lets save
df.to_csv('rating_1.csv')

In [11]:
# the data is too large
# to provide as much processable data as possible
# select subset of users who rated most movies and movies rated by most users
df = pd.read_csv('rating_1.csv')

In [12]:
N = df.userId.max() + 1 # no of users
M = df.movie_idx.max() + 1 # no of moives
# the user-user algorithm is O(n^2 m) , so better keep n low
# the item-item algorithm is O(n m^2 ) , so its also better to keep m low
n = 1000 # number of users to take , 
m = 200 # number of movies to take


In [13]:
user_ids_count = Counter(df.userId) # counts occurunces of each element
movie_ids_count = Counter(df.movie_idx) 

top_user_ids = [i for i,_ in user_ids_count.most_common(n)] # id:count
top_movie_ids = [i for i,_ in movie_ids_count.most_common(m)]

In [14]:
# make a copy to overwrite data later
df_small = df[df.userId.isin(top_user_ids) & df.movie_idx.isin(top_movie_ids)].copy()

In [15]:
# new mapping of top IDs , to run from 0 - N-1 without missing ids in between
new_user_id_map = {k:v for v,k in enumerate(top_user_ids)}
# same for movies
new_movie_id_map = {k:v for v,k in enumerate(top_movie_ids)}


In [16]:
# apply changes to df_small
df_small['userId'] = df_small.apply(lambda row: new_user_id_map[row.userId] , axis = 1)
df_small['movie_idx'] = df_small.apply(lambda row: new_movie_id_map[row.movie_idx] , axis = 1)


In [17]:
print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())
print("small dataframe size:", len(df_small))
df_small.to_csv('small_rating.csv')

max user id: 999
max movie id: 199
small dataframe size: 167035


In [18]:
df = pd.read_csv('small_rating.csv')

In [19]:
N = df.userId.max() + 1 # no of users
M = df.movie_idx.max() + 1 # no of moives

In [20]:
# split train-test
df = shuffle(df)
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

In [21]:
user2movie = {}
movie2user = {}
user_movie2rating = {}
count = 0
def update_dicts(row): # takes a row of pandas data frame
    global count
    count+=1
    if count % 100000 == 0:
        print('processed %.3f %%' %(float(count)/cutoff))
    
    i = int(row.userId)
    j = int(row.movie_idx)
    
    if i not in user2movie:
        user2movie[i] = [j]
    else:
        user2movie[i].append(j)
    
    if j not in movie2user:
        movie2user[j] = [i]
    else:
        movie2user[j].append(i)
        
    user_movie2rating[(i,j)] = row.rating
 

In [22]:
df_train.apply(update_dicts,axis=1)

processed 0.748 %


12338     None
86650     None
152492    None
13193     None
141970    None
          ... 
23703     None
22584     None
150656    None
4501      None
4598      None
Length: 133628, dtype: object

In [23]:
user_movie2rating_test = {}
count = 0
def update_dict_test(row): # takes a row of pandas data frame
    global count
    count+=1
    if count % 100000 == 0:
        print('processed %.3f %%' %(float(count)/len(df_test)))
    
    i = int(row.userId)
    j = int(row.movie_idx)
    
    user_movie2rating_test[(i,j)] = row.rating
 

In [24]:
df_test.apply(update_dict_test,axis=1)

156303    None
134018    None
4882      None
4632      None
135393    None
          ... 
6021      None
88457     None
158417    None
155380    None
103799    None
Length: 33407, dtype: object

In [25]:
with open('user2movie.json','wb') as f:
    pickle.dump(user2movie,f)
    
with open('movie2user.json','wb') as f:
    pickle.dump(movie2user,f)
    
with open('user_movie2rating.json','wb') as f:
    pickle.dump(user_movie2rating,f)
    
with open('user_movie2rating_test.json','wb') as f:
    pickle.dump(user_movie2rating_test,f)

<h2>User-User Collaborative Filtering</h2>

In [26]:
with open('user2movie.json','rb') as f:
    user2movie = pickle.load(f)
    
with open('movie2user.json','rb') as f:
    movie2user = pickle.load(f)
    
with open('user_movie2rating.json','rb') as f:
    user_movie2rating =  pickle.load(f)
    
with open('user_movie2rating_test.json','rb') as f:
    user_movie2rating_test =  pickle.load(f)
    

In [27]:
N = np.max(list(user2movie.keys()))+1
# some movies may appear in test but not in train
# unlikely to occur in user
m1 = np.max(list(movie2user.keys()))
m2 = np.max([m for(u,m),r in user_movie2rating_test.items()])
M = max(m1,m2) + 1 


In [28]:
K = 25 # number of neighbours to consider
limit = 5 # number of movies users must have in common to consider
averages = [] # each user's average rating
deviations = [] # each user's deviation
neighbors = [] # each element is a list of neighbours for each user

In [29]:
for i in range(N): # now i is user
    movies_i = user2movie[i]
    movies_i_set = set(movies_i)
    
    # calculate average and deviation
    ratings_i = {movie:user_movie2rating[(i,movie)] for movie in movies_i} # rating for each movie
    avg_i = np.mean(list(ratings_i.values())) # average across all movies
    dev_i = {movie:(rating-avg_i) for movie,rating in ratings_i.items()} # deviation for each movie
    dev_i_values = np.array(list(dev_i.values()))
    sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))
    
    # calculate correlation coeffecient
    averages.append(avg_i)
    deviations.append(dev_i)
    
    sl = SortedList()
    for j in range(N):
        if j != i: # dont include this i th user
            movies_j = user2movie[j]
            movies_j_set = set(movies_j)
            common_movies = (movies_i_set & movies_j_set) # intersection
            if len(common_movies) > limit:
                # calculate  avg and ddeviation
                ratings_j = {movie:user_movie2rating[(j,movie)] for movie in movies_j} # rating for each movie
                avg_j = np.mean(list(ratings_j.values())) # average across all movies
                dev_j = {movie:(rating-avg_j) for movie,rating in ratings_j.items()} # deviation for each movie
                dev_j_values = np.array(list(dev_j.values()))
                sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))
                
                # calculate correlation coffecient
                numerator = sum(dev_i[m]*dev_j[m] for m in common_movies)
                w_ij = numerator / (sigma_i * sigma_j)
                sl.add((-w_ij,j)) # list sorts ascendingly , to keep highest first negate
                if len(sl) > K:
                    del sl[-1]
    neighbors.append(sl)
    print('finished processing for user id: ',i)
    
        
    

finished processing for user id:  0
finished processing for user id:  1
finished processing for user id:  2
finished processing for user id:  3
finished processing for user id:  4
finished processing for user id:  5
finished processing for user id:  6
finished processing for user id:  7
finished processing for user id:  8
finished processing for user id:  9
finished processing for user id:  10
finished processing for user id:  11
finished processing for user id:  12
finished processing for user id:  13
finished processing for user id:  14
finished processing for user id:  15
finished processing for user id:  16
finished processing for user id:  17
finished processing for user id:  18
finished processing for user id:  19
finished processing for user id:  20
finished processing for user id:  21
finished processing for user id:  22
finished processing for user id:  23
finished processing for user id:  24
finished processing for user id:  25
finished processing for user id:  26
finished pr

In [30]:
def predict(i,m):
    numerator = 0
    denominator = 0
    for neg_w,j in neighbors[i]:
        try: 
            numerator += -neg_w * deviations[j][m]
            denominator += abs(neg_w)
        except KeyError: # user j may not have rated movie m so it migh not exist in deviations , instead pf looking up twice throgh exception
            pass
        
    if denominator == 0:
        prediction = averages[i]
    else:
        prediction = numerator / denominator + averages[i]
    prediction = min(5,prediction)
    prediction = max(0.5,prediction)
    return prediction

        

In [31]:
train_predictions = []
train_targets = []

for (i,m),target in user_movie2rating.items():
    prediction = predict(i,m)
    train_predictions.append(prediction)
    train_targets.append(target)
    

In [32]:
test_predictions = []
test_targets = []

for (i,m),target in user_movie2rating_test.items():
    prediction = predict(i,m)
    test_predictions.append(prediction)
    test_targets.append(target)
    

In [33]:
def mse(p,t): # for accuracy
    p = np.array(p)
    t = np.array(t)
    return np.mean((p-t)**2)

In [34]:
print('train mse: ',mse(train_predictions,train_targets))
print('train mse: ',mse(test_predictions,test_targets))

train mse:  0.4613057521837811
train mse:  0.5960859832624089


<h2>Item-Item Collaborative Filtering</h2>

In [35]:
with open('user2movie.json','rb') as f:
    user2movie = pickle.load(f)
    
with open('movie2user.json','rb') as f:
    movie2user = pickle.load(f)
    
with open('user_movie2rating.json','rb') as f:
    user_movie2rating =  pickle.load(f)
    
with open('user_movie2rating_test.json','rb') as f:
    user_movie2rating_test =  pickle.load(f)
    

In [36]:
N = np.max(list(user2movie.keys()))+1
# some movies may appear in test but not in train
# unlikely to occur in user
m1 = np.max(list(movie2user.keys()))
m2 = np.max([m for(u,m),r in user_movie2rating_test.items()])
M = max(m1,m2) + 1 


In [37]:
K = 20 # number of neighbours to consider
limit = 5 # number of movies users must have in common to consider
averages = [] # each user's average rating
deviations = [] # each user's deviation
neighbors = [] # each element is a list of neighbours for each user

In [38]:
for i in range(M): # now i is movie , just assume movies became users and vice versa
    users_i = movie2user[i]
    users_i_set = set(users_i)
    
    # calculate average and deviation
    ratings_i = {user:user_movie2rating[(user,i)] for user in users_i} # rating for each movie
    avg_i = np.mean(list(ratings_i.values())) # average across all movies
    dev_i = {user:(rating-avg_i) for user,rating in ratings_i.items()} # deviation for each movie
    dev_i_values = np.array(list(dev_i.values()))
    sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))
    
    # calculate correlation coeffecient
    averages.append(avg_i)
    deviations.append(dev_i)
    
    sl = SortedList()
    for j in range(M):
        if j != i: # dont include this i th user
            users_j = movie2user[j]
            users_j_set = set(users_j)
            common_movies = (users_i_set & users_j_set) # intersection
            if len(common_movies) > limit:
                # calculate  avg and ddeviation
                ratings_j = {user:user_movie2rating[(user,j)] for user in users_j} # rating for each movie
                avg_j = np.mean(list(ratings_j.values())) # average across all movies
                dev_j = {user:(rating-avg_j) for user,rating in ratings_j.items()} # deviation for each movie
                dev_j_values = np.array(list(dev_j.values()))
                sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))
                
                # calculate correlation coffecient
                numerator = sum(dev_i[m]*dev_j[m] for m in common_movies)
                w_ij = numerator / (sigma_i * sigma_j)
                sl.add((-w_ij,j)) # list sorts ascendingly , to keep highest first negate
                if len(sl) > K:
                    del sl[-1]
    neighbors.append(sl)
    print('finished processing for movie id: ',i)
    
        
    

finished processing for movie id:  0
finished processing for movie id:  1
finished processing for movie id:  2
finished processing for movie id:  3
finished processing for movie id:  4
finished processing for movie id:  5
finished processing for movie id:  6
finished processing for movie id:  7
finished processing for movie id:  8
finished processing for movie id:  9
finished processing for movie id:  10
finished processing for movie id:  11
finished processing for movie id:  12
finished processing for movie id:  13
finished processing for movie id:  14
finished processing for movie id:  15
finished processing for movie id:  16
finished processing for movie id:  17
finished processing for movie id:  18
finished processing for movie id:  19
finished processing for movie id:  20
finished processing for movie id:  21
finished processing for movie id:  22
finished processing for movie id:  23
finished processing for movie id:  24
finished processing for movie id:  25
finished processing fo

In [39]:
def predict(i,u):
    numerator = 0
    denominator = 0
    for neg_w,j in neighbors[i]:
        try: 
            numerator += -neg_w * deviations[j][u]
            denominator += abs(neg_w)
        except KeyError: # user j may not have rated movie m so it migh not exist in deviations , instead pf looking up twice throgh exception
            pass
        
    if denominator == 0:
        prediction = averages[i]
    else:
        prediction = numerator / denominator + averages[i]
    prediction = min(5,prediction)
    prediction = max(0.5,prediction)
    return prediction

        

In [40]:
train_predictions = []
train_targets = []

for (i,m),target in user_movie2rating.items():
    prediction = predict(m,i) # same function just switch inputs
    train_predictions.append(prediction)
    train_targets.append(target)
    

In [41]:
test_predictions = []
test_targets = []

for (i,m),target in user_movie2rating_test.items():
    prediction = predict(m,i)
    test_predictions.append(prediction)
    test_targets.append(target)
    

In [42]:
def mse(p,t): # for accuracy
    p = np.array(p)
    t = np.array(t)
    return np.mean((p-t)**2)

In [43]:
print('train mse: ',mse(train_predictions,train_targets))
print('train mse: ',mse(test_predictions,test_targets))

train mse:  0.5083151016169266
train mse:  0.5500312185708748
