In [2]:
import pickle
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList

In [3]:
import os
# The four dictionaries below are produced by the preprocessing step. If any
# one of them is missing, importing the preprocessing module runs it for its
# side effects (presumably writing these files -- confirm in preprocessing2dict).
_required_files = (
    'user2movie.json',
    'movie2user.json',
    'usermovie2rating.json',
    'usermovie2rating_test.json',
)
if not all(os.path.exists(path) for path in _required_files):
    import preprocessing2dict

In [4]:
def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: despite the .json extension these files are read with pickle, not json.
# pickle.load can execute arbitrary code, so only load files you produced yourself.
user2movie = _load_pickle('user2movie.json')
movie2user = _load_pickle('movie2user.json')
usermovie2rating = _load_pickle('usermovie2rating.json')
usermovie2rating_test = _load_pickle('usermovie2rating_test.json')

In [5]:
# N: one more than the largest key of user2movie (presumably the number of
# users, assuming ids are contiguous and start at 0 -- confirm in preprocessing).
N = np.max(list(user2movie)) + 1

# The test split may reference a movie id larger than any seen in training,
# so M is taken from the largest movie id across both splits.
m1 = np.max(list(movie2user))
m2 = np.max([movie for (_, movie) in usermovie2rating_test])
M = max(m1, m2) + 1
print("M: ", M, "N: ", N)

M:  2000 N:  10000


In [6]:
# Safety valve: the neighbour search below compares every pair of movies, so
# its cost grows with M**2. Stop here when M is larger than expected.
if M > 2000:
    print("M = ", M, "Are you sure you want to continue?")
    print("Comment out these lines if so...")
    # Raise SystemExit directly instead of calling the interactive `exit()`
    # helper: in a notebook `exit()` asks the kernel to shut down (losing all
    # state), whereas SystemExit just stops execution. As a plain script the
    # effect is the same as before (exit status 0).
    raise SystemExit

In [None]:
K = 20      # number of nearest neighbours kept per movie
limit = 5   # two movies are compared only if they share more than this many raters

# ---------------------------------------------------------------------------
# Pass 1: per-movie statistics, computed once per movie.
# (Previously the statistics of movie j were recomputed for every (i, j) pair.)
# ---------------------------------------------------------------------------
averages = []     # averages[i]   : mean training rating of movie i
deviations = []   # deviations[i] : {user: rating - averages[i]} for movie i
sigmas = []       # sigmas[i]     : Euclidean norm of movie i's deviations
user_sets = []    # user_sets[i]  : set of users who rated movie i
for i in range(M):
    users_i = movie2user[i]
    ratings_i = {user: usermovie2rating[(user, i)] for user in users_i}
    avg_i = np.mean(list(ratings_i.values()))
    dev_i = {user: (rating - avg_i) for user, rating in ratings_i.items()}
    dev_i_vals = np.array(list(dev_i.values()))

    averages.append(avg_i)
    deviations.append(dev_i)
    sigmas.append(np.sqrt(dev_i_vals.dot(dev_i_vals)))
    user_sets.append(set(users_i))

# ---------------------------------------------------------------------------
# Pass 2: for every movie keep the K most strongly correlated other movies.
# neighbours[i] is a SortedList of (-weight, j) tuples; weights are negated so
# that the largest weight sorts first and the weakest neighbour sits at the end.
# ---------------------------------------------------------------------------
neighbours = []
for i in range(M):
    users_i_set = user_sets[i]
    dev_i = deviations[i]
    sigma_i = sigmas[i]

    sl = SortedList()
    for j in range(M):
        if j == i:
            continue

        common_users = users_i_set & user_sets[j]
        if len(common_users) <= limit:
            continue

        norm_product = sigma_i * sigmas[j]
        if norm_product == 0:
            # A movie whose ratings are all identical has zero deviation norm;
            # the correlation is undefined (0/0) and a NaN weight would break
            # the ordering of the SortedList, so skip the pair.
            continue

        dev_j = deviations[j]
        numerator = sum(dev_i[user] * dev_j[user] for user in common_users)
        w_ij = numerator / norm_product

        sl.add((-w_ij, j))
        # Keep only the K best neighbours. (The list used to be truncated at
        # `limit`, leaving K unused and capping every movie at 5 neighbours.)
        if len(sl) > K:
            del sl[-1]
    neighbours.append(sl)

    # Progress indicator: one line per processed movie.
    print(i)

def predict(i, u):
    """Predict the rating user `u` would give movie `i`.

    The prediction is the average rating of movie `i` plus a weighted average
    of the user's deviations on the neighbouring movies stored in
    `neighbours[i]`, where each entry is a `(-weight, j)` tuple.

    Parameters
    ----------
    i : index into `neighbours` and `averages` (the movie).
    u : key looked up in each neighbour's `deviations[j]` dict (the user).

    Returns
    -------
    The predicted rating, clipped to the range [0.5, 5]. Falls back to
    `averages[i]` when the user rated none of the neighbouring movies.
    """
    numerator = 0
    denominator = 0
    for neg_wt, j in neighbours[i]:
        neighbour_devs = deviations[j]
        if u not in neighbour_devs:
            # The user has not rated neighbour j, so it contributes nothing.
            continue
        # Weights are stored negated (for sort order); undo that here.
        numerator += -neg_wt * neighbour_devs[u]
        denominator += abs(neg_wt)

    if denominator == 0:
        prediction = averages[i]
    else:
        prediction = numerator / denominator + averages[i]

    # Clip to the valid rating range.
    prediction = min(5, prediction)
    prediction = max(0.5, prediction)
    return prediction

# Rating dictionaries are keyed by (user, movie), while predict() takes the
# movie first and the user second (it indexes `neighbours` by movie), so the
# arguments are passed as predict(m, u).

# Predictions on the training ratings.
train_predictions = []
train_targets = []
for (u, m), target in usermovie2rating.items():
    train_predictions.append(predict(m, u))
    train_targets.append(target)

# Predictions on the held-out test ratings.
test_predictions = []
test_targets = []
for (u, m), target in usermovie2rating_test.items():
    test_predictions.append(predict(m, u))
    test_targets.append(target)

def mse(p, t):
    """Return the mean squared error between predictions `p` and targets `t`.

    Both arguments are converted with `np.array`, so equal-length sequences of
    numbers are accepted. The result is a single scalar (previously the
    element-wise squared errors were returned without being averaged).
    """
    p = np.array(p)
    t = np.array(t)
    return np.mean((p - t) ** 2)

# Report the error on both splits, training first.
for label, preds, targets in (
    ("Train MSE: ", train_predictions, train_targets),
    ("Test MSE: ", test_predictions, test_targets),
):
    print(label, mse(preds, targets))