In [31]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import Counter

from sklearn.metrics import mean_squared_error

## Train

In [2]:
data = np.loadtxt('../data/ml-100k/u.data', delimiter='\t', dtype='int64')

In [3]:
u_movie_counts = Counter(data[:,0])
mov = Counter(data[:,1])
n_users = len(u_movie_counts)
n_movies = len(mov)

In [10]:
# find mean and variance for each user
u_mean = {}
u_sigma = {}
for u in u_movie_counts.keys():
    u_ratings = data[np.where(data[:,0]==u)[0],2]
    u_mean[u] = np.sum(u_ratings)/u_movie_counts[u]
    u_sigma[u] = (np.sum(np.square(u_ratings))/u_movie_counts[u]) - np.square(u_mean[u])

In [14]:
# normalize rating for each user, based on his mu and sigma
norm_rating = {}
for i, r in  enumerate(data[:,2]):
    norm_rating[i] = (r - u_mean[u]) / u_sigma[u]

In [15]:
# user-movie rating matrix (n_user, n_movies)
user_rating = np.zeros((n_users, n_movies))
for idx in range(data.shape[0]):
    u = data[idx,0]-1
    m = data[idx,1]-1
    user_rating[u][m] = norm_rating[idx]

In [21]:
# similarity (pearsons)
sim = np.zeros((n_users,n_users))
for u1 in tqdm(range(n_users)):
    for u2 in range(n_users):
        r_ = np.sum(user_rating[u1]) * np.sum(user_rating[u2])
        movie_count = np.count_nonzero(user_rating[u1] * user_rating[u2])
        if movie_count != 0:
            sim[u1][u2] = r_/movie_count

100%|██████████| 943/943 [00:19<00:00, 48.39it/s]


## Test

In [24]:
test_data = np.loadtxt('../data/ml-100k/u1.test', delimiter='\t', dtype='int64')
# sample 100 from test
sample_data = np.random.randint(0, test_data.shape[0], 100)

In [25]:
def predict():
    pred = {}
    for idx in sample_data:
        rating_ = []
        user = test_data[idx,0]
        movie = test_data[idx,1]
        neigh = sim[user].argsort()[::-1]
        index = [np.where((data[:,0] == u) & (data[:,1] == movie))[0] for u in neigh]
        for j in index:
            if j != None:
                i = j[0]
                rating_.append(norm_rating[i] * sim[user][data[i][0]])
        pred[idx] = u_mean[user] + ((np.sum(rating_) / np.sum(np.abs(sim[user][neigh]))) * u_sigma[user])
    return pred

In [26]:
y_true = test_data[sample_data,2].astype('float')

In [28]:
y_pred = predict()

  # Remove the CWD from sys.path while we load stuff.


In [34]:
error = mean_squared_error(y_true, list(y_pred.values()))
error

0.9628536371840544