In [1]:
import gzip
import csv
from collections import defaultdict
import scipy
import scipy.optimize
import numpy

In [2]:
# Relative path of the gzipped CSV holding the training interactions.
path="trainInteractions.csv.gz"

In [3]:
def readCSV(path):
    """Yield each data row of a gzipped CSV file as a dict keyed by the header.

    Parameters
    ----------
    path : str
        Location of a gzip-compressed CSV file whose first row is the header.

    Yields
    ------
    dict
        One mapping per data row, pairing header names with that row's
        string values (pairs are formed with ``zip``, so extra cells or
        extra header names are dropped silently).

    Notes
    -----
    The file is opened inside a ``with`` block so the handle is closed once
    the generator is exhausted or closed. An empty file yields nothing.
    """
    # newline='' leaves newline handling to the csv module, as its docs recommend.
    with gzip.open(path, 'rt', newline='') as f:
        c = csv.reader(f)
        header = next(c, None)
        if header is None:
            # No header row at all: nothing to yield.
            return
        for l in c:
            yield dict(zip(header, l))

In [4]:
# Materialise every row dict from the CSV into memory so it can be sliced and re-iterated.
dataset = list(readCSV(path))

In [5]:
# Train on every loaded interaction; validate on rows 400000..499999.
# NOTE(review): validate_set is a slice of the same list that train_set aliases,
# so the two overlap and the reported validation MSE is not a held-out
# estimate — confirm this is intended (e.g. a final fit on all data).
train_set=dataset
validate_set=dataset[400000:500000]

In [6]:
# Integer training targets, in the same order as train_set.
labels = [int(row['rating']) for row in train_set]

In [7]:
# Integer ground-truth ratings for the validation slice, in validate_set order.
validate_truth = [int(row['rating']) for row in validate_set]

In [8]:
# Lookup tables from user id / recipe id to a list of interaction records;
# defaultdict(list) creates the empty list on first access to a key.
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

In [9]:
# Index every training interaction under both its user and its recipe.
for row in train_set:
    uid, rid = row['user_id'], row['recipe_id']
    reviewsPerUser[uid].append(row)
    reviewsPerItem[rid].append(row)

In [10]:
# Global mean rating over the training interactions (displayed as the cell output).
ratingMean = sum(int(row['rating']) for row in train_set) / len(train_set)
ratingMean

4.580794

In [11]:
# Evaluation counter for the progress log; cost() prints and increments it.
# (A module-level `global` statement has no effect, so none is used here;
# cost() declares `global step` itself where it is actually needed.)
step = 1

In [12]:
# Problem dimensions, plus a fixed ordering of users and items that defines
# how bias terms are laid out inside the flat parameter vector theta.
N = len(train_set)
users = list(reviewsPerUser)
items = list(reviewsPerItem)
nUsers = len(users)
nItems = len(items)

In [13]:
# Global offset of the bias model, initialised to the mean training rating;
# unpack() overwrites it with theta[0].
alpha = ratingMean

In [14]:
# Per-user and per-item bias terms. As defaultdict(float) a missing key reads
# as 0.0 (and is inserted on lookup); unpack() later rebinds both names to
# plain dicts built from theta.
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

In [15]:
def MSE(predictions, labels):
    """Return the mean squared error between two paired sequences.

    Values are paired with ``zip``, so any surplus elements in the longer
    sequence are ignored. Raises ZeroDivisionError when no pairs exist.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [16]:
def prediction(user, item):
    """Raw model score: global offset plus the user's and the item's bias.

    Reads the module-level ``alpha``, ``userBiases`` and ``itemBiases``;
    both biases are looked up by direct indexing.
    """
    user_bias = userBiases[user]
    item_bias = itemBiases[item]
    return alpha + user_bias + item_bias

In [25]:
def prediction_(user, item):
    """Predict a rating while tolerating unseen users/items, then post-process.

    A bias is added only when its key is already present in ``userBiases`` /
    ``itemBiases``; an unknown user or item contributes nothing beyond
    ``alpha``. The score is then:

    * replaced by 0 if it is negative,
    * replaced by 5.0 if it exceeds 5.0,
    * snapped to the nearest integer if it lies within 0.001 of it,
    * otherwise returned unchanged.
    """
    score = alpha
    if user in userBiases:
        score = score + userBiases[user]
    if item in itemBiases:
        score = score + itemBiases[item]

    # Clamp to the [0, 5] range.
    if score < 0:
        return 0
    if score > 5.0:
        return 5.0

    nearest = round(score)
    return nearest if abs(nearest - score) < 0.001 else score

In [18]:
def unpack(theta):
    """Load a flat parameter vector into the module-level model state.

    Layout of ``theta``: index 0 is ``alpha``; the next ``nUsers`` entries
    are the user biases in ``users`` order; everything after that is paired
    with ``items`` in order. ``userBiases`` and ``itemBiases`` are rebound to
    fresh plain dicts.
    """
    global alpha, userBiases, itemBiases
    items_start = 1 + nUsers
    alpha = theta[0]
    userBiases = dict(zip(users, theta[1:items_start]))
    itemBiases = dict(zip(items, theta[items_start:]))

In [19]:
def cost(theta, labels, lamb):
    """Regularised training objective evaluated at ``theta``.

    Unpacks ``theta`` into the module-level model state, prints the current
    ``step`` together with the unregularised training MSE and the validation
    MSE, increments ``step``, and returns the training MSE plus ``lamb``
    times the sum of squared user and item biases.

    Parameters
    ----------
    theta : sequence of float
        Flat parameter vector in the layout expected by ``unpack``.
    labels : sequence
        Target ratings aligned with ``train_set``.
    lamb : float
        Weight of the squared-bias penalty.
    """
    global step
    unpack(theta)

    train_predictions = [prediction(row['user_id'], row['recipe_id']) for row in train_set]
    objective = MSE(train_predictions, labels)

    # Validation goes through prediction_ and is only reported; it never
    # contributes to the returned objective.
    held_out_predictions = [prediction_(row['user_id'], row['recipe_id']) for row in validate_set]
    validate_cost = MSE(held_out_predictions, validate_truth)

    print("step = " + str(step))
    print("MSE = " + str(objective))
    print("validate MSE = " + str(validate_cost))
    step += 1

    # Add the penalty one term at a time: users first, then items.
    for bias in userBiases.values():
        objective += lamb*bias**2
    for bias in itemBiases.values():
        objective += lamb*bias**2
    return objective

In [20]:
def derivative(theta, labels, lamb):
    """Gradient of the regularised objective with respect to ``theta``.

    Unpacks ``theta`` into the module-level model state, accumulates the
    MSE gradient over ``train_set`` and adds the penalty gradient
    ``2*lamb*bias`` for every user and item bias.

    Parameters
    ----------
    theta : sequence of float
        Flat parameter vector in the layout expected by ``unpack``.
    labels : sequence
        Not read here; the ratings are taken from ``train_set`` directly.
    lamb : float
        Weight of the squared-bias penalty.

    Returns
    -------
    numpy.ndarray
        ``[d_alpha] + user gradients in users order + item gradients in
        items order``.
    """
    unpack(theta)
    n_samples = len(train_set)
    grad_alpha = 0
    grad_user = defaultdict(float)
    grad_item = defaultdict(float)

    for row in train_set:
        u, i = row['user_id'], row['recipe_id']
        residual = prediction(u, i) - int(row['rating'])
        # Every sample contributes the same amount to alpha, its user and its item.
        contribution = 2/n_samples*residual
        grad_alpha += contribution
        grad_user[u] += contribution
        grad_item[i] += contribution

    for u, bias in userBiases.items():
        grad_user[u] += 2*lamb*bias
    for i, bias in itemBiases.items():
        grad_item[i] += 2*lamb*bias

    flat = [grad_alpha]
    flat += [grad_user[u] for u in users]
    flat += [grad_item[i] for i in items]
    return numpy.array(flat)

In [21]:
# Weight of the squared-bias penalty; handed to cost() and derivative() via `args`.
lamb=0.000019

In [22]:
# Fit alpha and all biases with L-BFGS-B, starting from the current alpha and
# zero biases; `derivative` supplies the gradient and maxfun=75 caps the
# number of function evaluations.
result = scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nItems),
                                      derivative, args = (labels, lamb), maxfun=75)

# cost()/derivative() leave the globals at whatever theta they evaluated last,
# which need not be the point the optimiser returns (e.g. a rejected
# line-search trial before the maxfun stop). Load the returned optimum
# explicitly so later predictions use it.
unpack(result[0])

# Keep the (x, f, info) tuple as the cell's displayed output.
result

step = 1
MSE = 0.9008923295603781
validate MSE = 0.9095362076759884
step = 2
MSE = 0.8880111212499088
validate MSE = 0.8968248602337343
step = 3
MSE = 1.0695097037303978
validate MSE = 1.067812439623728
step = 4
MSE = 0.8854387311293824
validate MSE = 0.89406940304688
step = 5
MSE = 0.8800505721084655
validate MSE = 0.8845302174422947
step = 6
MSE = 0.8792285921615436
validate MSE = 0.8838526749698199
step = 7
MSE = 0.8761141185768858
validate MSE = 0.8812733329921802
step = 8
MSE = 0.8552499355379078
validate MSE = 0.8621721543584585
step = 9
MSE = 0.8440696876899761
validate MSE = 0.8521418269279626
step = 10
MSE = 0.828718888541999
validate MSE = 0.8369502955278382
step = 11
MSE = 0.8180810485501481
validate MSE = 0.8264655819880364
step = 12
MSE = 0.7984272534037569
validate MSE = 0.8065312283022312
step = 13
MSE = 0.7847045166202754
validate MSE = 0.7919317812332473
step = 14
MSE = 0.772308740003515
validate MSE = 0.7798835053488007
step = 15
MSE = 0.7644540294786706
validate MSE 

(array([ 4.44997264,  0.03614097, -0.11206127, ...,  0.08054259,
         0.03577149,  0.01341844]),
 0.7395105887748105,
 {'grad': array([-2.48009399e-03, -5.10180106e-06, -2.89307173e-07, ...,
         -1.66538154e-07, -9.28819300e-08, -1.20097512e-08]),
  'task': b'STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT',
  'funcalls': 76,
  'nit': 65,
  'warnflag': 1})

In [23]:
# Score the fitted model on the validation slice with the clamped predictor
# and display the resulting MSE as the cell output.
validate_truth = [int(row['rating']) for row in validate_set]
predictions = [prediction_(row['user_id'], row['recipe_id']) for row in validate_set]

MSE(predictions, validate_truth)

0.6896136431060065

In [26]:
# Write one prediction per user-item pair listed in the stub file.
# Both files are managed by `with`, so they are closed even if a line fails
# to parse; the stub is opened first so a missing stub does not truncate an
# existing predictions file. The output handle keeps the name `predictions`.
with open("stub_Rated.txt") as stub, open("predictions_Rated.txt", 'w') as predictions:
    for l in stub:
        if l.startswith("user_id"):
            # Header row: copy through unchanged.
            predictions.write(l)
            continue
        if not l.strip():
            # Skip blank lines (e.g. a trailing newline) rather than failing to unpack.
            continue
        u,i = l.strip().split('-')
        predictions.write(u + '-' + i + ',' + str(prediction_(u, i)) + '\n')