# Homework 3

## Tasks (Rating prediction)

9. Fit a predictor of the form
$$
\text{rating}(\text{user}, \text{item}) \simeq \alpha + \beta_{\text{user}} + \beta_{\text{item}}
$$
by fitting the mean and the two bias terms as described in the lecture notes. Use a regularization
parameter of $\lambda = 1$. Report the MSE on the validation set.

In [1]:
import gzip
import csv
from collections import defaultdict
import scipy
import random
import scipy.optimize
import numpy

In [2]:
# Gzip-compressed CSV file that is loaded as the interaction dataset.
path = "trainInteractions.csv.gz"

In [3]:
def readCSV(path):
    """Yield each data row of a gzip-compressed CSV file as a dict.

    The first row is treated as the header; every following row is zipped
    with it, so values stay strings and a row longer or shorter than the
    header is truncated to the shorter of the two.

    Args:
        path: Location of the ``.csv.gz`` file.

    Yields:
        One ``{column_name: value}`` dict per data row.
    """
    # The context manager closes the file once the generator is exhausted,
    # closed, or garbage-collected, instead of leaking the handle.
    with gzip.open(path, 'rt') as f:
        reader = csv.reader(f)
        # A file with no header row yields nothing; a bare next() here would
        # raise StopIteration inside the generator, i.e. RuntimeError.
        header = next(reader, None)
        if header is None:
            return
        for row in reader:
            yield dict(zip(header, row))

In [4]:
# Materialise every row of the CSV as a list of dicts so it can be sliced and
# iterated more than once.
dataset = list(readCSV(path))

In [5]:
# train_set=dataset[:400000]
# NOTE(review): train_set is the whole dataset, so every row of validate_set
# (dataset[400000:500000]) is also trained on. Any MSE measured on
# validate_set is therefore not a held-out estimate. The commented-out line
# above gives a split that does not overlap validate_set.
train_set=dataset
validate_set=dataset[400000:500000]

In [6]:
# Integer rating of every training row, in train_set order.
labels = [int(row['rating']) for row in train_set]

In [7]:
# Integer rating of every validation row, in validate_set order.
validate_truth = [int(row['rating']) for row in validate_set]

In [8]:
# Containers for grouping records by user id and by recipe id; a missing key
# starts out as an empty list.
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
# Alternative set-based containers (disabled):
# reviewsPerUser = defaultdict(set)
# reviewsPerItem = defaultdict(set)

In [9]:
# File each training record under both its user and its recipe. Insertion
# order of the keys follows first appearance in train_set.
for record in train_set:
    user_id, recipe_id = record['user_id'], record['recipe_id']
    reviewsPerUser[user_id].append(record)
    reviewsPerItem[recipe_id].append(record)

In [10]:
# Mean integer rating over all of train_set.
ratingMean = sum(int(row['rating']) for row in train_set) / len(train_set)
ratingMean

4.580794

In [11]:
# Freeze an ordering of user and recipe ids; unpack() and derivative() use
# these lists to map positions in the flat parameter vector to ids.
users = list(reviewsPerUser)
items = list(reviewsPerItem)
nUsers = len(users)
nItems = len(items)
N = len(train_set)

In [12]:
# Global offset of the model, started at the mean training rating; unpack()
# overwrites it with the first entry of the parameter vector.
alpha = ratingMean

In [13]:
# Per-user and per-recipe bias terms. As defaultdicts, a missing key reads as
# 0.0; unpack() later rebinds both names to plain dicts.
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

In [14]:
# Latent-factor vectors keyed by user id and by recipe id.
userGamma = {}
itemGamma = {}

In [15]:
# Number of latent dimensions per user and per recipe.
K = 2
# Counter that cost() prints and increments on every call. A `global`
# statement is not needed at module scope (it has no effect there), so it has
# been removed.
step = 1

In [16]:
# Give every user K latent values drawn from [-0.05, 0.05).
userGamma.update(
    (user_id, [random.random() * 0.1 - 0.05 for _ in range(K)])
    for user_id in reviewsPerUser
)

In [17]:
# Give every recipe K latent values drawn from [-0.05, 0.05).
itemGamma.update(
    (recipe_id, [random.random() * 0.1 - 0.05 for _ in range(K)])
    for recipe_id in reviewsPerItem
)

In [18]:
def inner(x, y):
    """Return the dot product of ``x`` and ``y``.

    Elements are paired positionally; if the lengths differ, the extra
    elements of the longer sequence are ignored. Two empty inputs give 0.
    """
    total = 0
    for left, right in zip(x, y):
        total = total + left * right
    return total

In [19]:
def MSE(predictions, labels):
    """Return the mean squared difference between paired predictions and labels.

    Pairs are formed positionally up to the shorter input. With no pairs at
    all the division raises ZeroDivisionError.
    """
    squared_error_sum = 0
    pair_count = 0
    for predicted, actual in zip(predictions, labels):
        squared_error_sum = squared_error_sum + (predicted - actual) ** 2
        pair_count += 1
    return squared_error_sum / pair_count

In [20]:
def prediction(user, item):
    """Return the model's raw rating estimate for ``user`` and ``item``.

    The estimate is the global offset plus the user bias, the item bias and
    the inner product of the two latent vectors. No clipping is applied, and
    both ids must already have latent vectors.
    """
    bias_part = alpha + userBiases[user] + itemBiases[item]
    latent_part = inner(userGamma[user], itemGamma[item])
    return bias_part + latent_part

In [21]:
def prediction_(user, item):
    """Return a rating estimate that tolerates unseen ids, clipped to [0, 5].

    Only the terms whose id has a stored bias are added to the global offset:
    the user bias if the user is known, the item bias if the item is known,
    and the latent inner product only when both are known. Estimates below 0
    come back as 0 and estimates above 5.0 come back as 5.0.
    """
    user_known = user in userBiases
    item_known = item in itemBiases

    estimate = alpha
    if user_known:
        estimate = estimate + userBiases[user]
    if item_known:
        estimate = estimate + itemBiases[item]
    if user_known and item_known:
        estimate = estimate + inner(userGamma[user], itemGamma[item])

    if estimate < 0:
        return 0
    return 5.0 if estimate > 5.0 else estimate

In [22]:
def unpack(theta):
    """Load the flat parameter vector ``theta`` into the module-level model.

    Layout of ``theta``: one entry for alpha, then ``nUsers`` user biases,
    then ``nItems`` item biases, then ``K`` latent values per user, then
    ``K`` latent values per item. Users and items are taken in the order of
    the ``users`` and ``items`` lists. ``userBiases`` and ``itemBiases`` are
    rebound to new dicts; ``userGamma`` and ``itemGamma`` are updated in
    place.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma

    alpha = theta[0]

    user_bias_start = 1
    item_bias_start = user_bias_start + nUsers
    gamma_start = item_bias_start + nItems
    userBiases = dict(zip(users, theta[user_bias_start:item_bias_start]))
    itemBiases = dict(zip(items, theta[item_bias_start:gamma_start]))

    # Latent vectors follow the biases: all users first, then all items.
    cursor = gamma_start
    for gamma_table, keys in ((userGamma, users), (itemGamma, items)):
        for key in keys:
            gamma_table[key] = theta[cursor:cursor + K]
            cursor += K

In [23]:
def cost(theta, labels, lamb1, lamb2):
    """Return the regularized training objective at ``theta``.

    The objective is the training MSE plus ``lamb1`` times the squared biases
    plus ``lamb2`` times the squared latent values (alpha is not penalised).
    As a side effect, each call prints both regularization weights, the call
    counter ``step``, the training MSE and the MSE of ``prediction_`` on
    ``validate_set``, then increments ``step``.

    Args:
        theta: Flat parameter vector in the layout expected by ``unpack``.
        labels: True ratings, aligned with ``train_set``.
        lamb1: Regularization weight for the bias terms.
        lamb2: Regularization weight for the latent factors.
    """
    global step
    print(lamb1)
    print(lamb2)
    unpack(theta)

    train_predictions = [
        prediction(row['user_id'], row['recipe_id']) for row in train_set
    ]
    validate_predictions = [
        prediction_(row['user_id'], row['recipe_id']) for row in validate_set
    ]
    train_mse = MSE(train_predictions, labels)
    validate_mse = MSE(validate_predictions, validate_truth)

    print("step = " + str(step))
    print("MSE = " + str(train_mse))
    print("validate MSE = " + str(validate_mse))
    step += 1

    # Add the penalties term by term, users first and then items.
    objective = train_mse
    for user_id in users:
        objective += lamb1*userBiases[user_id]**2
        for k in range(K):
            objective += lamb2*userGamma[user_id][k]**2
    for recipe_id in items:
        objective += lamb1*itemBiases[recipe_id]**2
        for k in range(K):
            objective += lamb2*itemGamma[recipe_id][k]**2
    return objective

In [24]:
def derivative(theta, labels, lamb1,lamb2):
    """Return the gradient of the regularized objective at ``theta``.

    The result is a numpy array laid out like ``theta`` itself: alpha, user
    biases, item biases, user latent values, item latent values, with users
    and items in the order of the ``users`` and ``items`` lists.

    Args:
        theta: Flat parameter vector in the layout expected by ``unpack``.
        labels: Not read by this function; ratings are taken from
            ``train_set`` directly.
        lamb1: Regularization weight for the bias terms.
        lamb2: Regularization weight for the latent factors.
    """
    # Debug output: both weights are printed on every call.
    print(lamb1)
    print(lamb2)
    unpack(theta)
    N = len(train_set)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {}
    dItemGamma = {}
    for u in reviewsPerUser:
        dUserGamma[u] = [0.0 for k in range(K)]
    for i in reviewsPerItem:
        dItemGamma[i] = [0.0 for k in range(K)]
    # Data term: each training row contributes 2/N * (prediction - rating) to
    # every parameter that took part in its prediction.
    for d in train_set:
        u,i = d['user_id'], d['recipe_id']
        pred = prediction(u, i)
        diff = pred - int(d['rating'])
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
        for k in range(K):
            dUserGamma[u][k] += 2/N*itemGamma[i][k]*diff
            dItemGamma[i][k] += 2/N*userGamma[u][k]*diff
    # Penalty term: derivative of lamb * x**2 for every bias and latent value.
    # alpha has no penalty.
    for u in userBiases:
        dUserBiases[u] += 2*lamb1*userBiases[u]
        for k in range(K):
            dUserGamma[u][k] += 2*lamb2*userGamma[u][k]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb1*itemBiases[i]
        for k in range(K):
            dItemGamma[i][k] += 2*lamb2*itemGamma[i][k]
    # Flatten in the same order that unpack() reads theta.
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    for u in users:
        dtheta += dUserGamma[u]
    for i in items:
        dtheta += dItemGamma[i]
    return numpy.array(dtheta)

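Here we set the regularization strengths. Note that the code below uses $\lambda_1 = \lambda_2 = 1.91 \times 10^{-5}$ rather than $\lambda = 1$.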

In [25]:
# Regularization weight passed to the optimiser as lamb1.
# NOTE(review): the task statement asks for lambda = 1.
lamb1 = 1.91e-5

In [26]:
# Regularization weight passed to the optimiser as lamb2.
lamb2 = 1.91e-5

In [27]:
# Starting point: alpha, then a zero bias for every user and item, then K
# small random latent values for every user and item.
initial_theta = (
    [alpha]
    + [0.0] * (nUsers + nItems)
    + [random.random() * 0.1 - 0.05 for _ in range(K * (nUsers + nItems))]
)
# Minimise cost() with L-BFGS-B using derivative() as the gradient, for at
# most 100 iterations. The result tuple is left as the cell's value.
scipy.optimize.fmin_l_bfgs_b(
    cost,
    initial_theta,
    derivative,
    args=(labels, lamb1, lamb2),
    maxiter=100,
)

1.91e-05
1.91e-05
step = 1
MSE = 0.9008988917628877
validate MSE = 0.9095456429986768
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 2
MSE = 0.8880241571688571
validate MSE = 0.8968403638986748
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 3
MSE = 1.0694934720588878
validate MSE = 1.0677361047959757
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 4
MSE = 0.8854417748377286
validate MSE = 0.8940734098908445
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 5
MSE = 0.8800521335458377
validate MSE = 0.8845107113038106
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 6
MSE = 0.8792322749008901
validate MSE = 0.8838350007683817
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 7
MSE = 0.8761255441833137
validate MSE = 0.8812621964391661
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 8
MSE = 0.855250437823834
validate MSE = 0.8621415105410789
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 9
MSE = 0.8440545890830167
validate MSE = 0.8520987040053183
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 10
MSE = 0.8287256513872432
validate MSE

1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 80
MSE = 0.675223662636483
validate MSE = 0.6812067828500005
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 81
MSE = 0.679662939008372
validate MSE = 0.6821792083401188
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 82
MSE = 0.6749806826044052
validate MSE = 0.680932579280073
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 83
MSE = 0.6751003767149287
validate MSE = 0.681042757560023
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 84
MSE = 0.6754137097720575
validate MSE = 0.6813461679357234
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 85
MSE = 0.6887184821577377
validate MSE = 0.6932069233888334
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 86
MSE = 0.675005313137348
validate MSE = 0.6809458554352107
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 87
MSE = 0.6752436929581822
validate MSE = 0.6811723742816683
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 88
MSE = 0.6742901366436254
validate MSE = 0.6801483833568197
1.91e-05
1.91e-05
1.91e-05
1.91e-05
step = 89
MSE = 0.671646

(array([ 4.44444164e+00,  2.09376410e-02, -1.01672656e-01, ...,
         2.82368575e-04,  2.73902038e-03,  5.53244343e-03]),
 0.7269453454532787,
 {'grad': array([-1.48187101e-02, -3.45719986e-05,  5.88835743e-07, ...,
          1.31647737e-07,  1.02755544e-07,  1.99481610e-07]),
  'task': b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT',
  'funcalls': 119,
  'nit': 100,
  'warnflag': 1})

After training on the training set, we evaluate the result on the validation set.

In [28]:
# Containers for grouping rating values by user id and by recipe id.
userRatings = defaultdict(list)
itemRatings =defaultdict(list)

In [29]:
# Baseline averages per user and per recipe, plus the overall mean rating.
# NOTE(review): in the visible part of the file these are only read by
# commented-out fallback code.
userAverage = {}
itemAverage ={}
globalAverage=ratingMean

In [30]:
# Record each training rating under both its user and its recipe.
for row in train_set:
    rating, user_id, recipe_id = int(row['rating']), row['user_id'], row['recipe_id']
    userRatings[user_id].append(rating)
    itemRatings[recipe_id].append(rating)

In [31]:
# Mean rating given by each user.
for user_id, ratings in userRatings.items():
    userAverage[user_id] = sum(ratings) / len(ratings)

In [32]:
# Mean rating received by each recipe.
for recipe_id, ratings in itemRatings.items():
    itemAverage[recipe_id] = sum(ratings) / len(ratings)

In [33]:
# Overall mean rating. NOTE(review): the same assignment already appears in an
# earlier cell, so this one looks redundant.
globalAverage=ratingMean

In [34]:
# Score the fitted model on validate_set with the clipped predictor.
# An earlier variant (now removed) fell back to itemAverage, userAverage or
# globalAverage for unseen ids; prediction_ handles unseen ids itself.
validate_truth = [int(row['rating']) for row in validate_set]
predictions = [
    prediction_(row['user_id'], row['recipe_id']) for row in validate_set
]

MSE(predictions, validate_truth)

0.6624550547796129

In [35]:
# Write one "user-item,prediction" line for every pair in the stub file,
# copying the header line through unchanged.
# Both files are opened in a `with` block so they are closed even when a line
# fails to parse; previously the stub handle was never closed and the output
# handle was only closed on success. The output handle also no longer reuses
# the name `predictions`, which holds the validation predictions list.
with open("stub_Rated.txt") as stub_file, \
        open("predictions_Rated.txt", 'w') as predictions_file:
    for l in stub_file:
        if l.startswith("user_id"):
            # header
            predictions_file.write(l)
            continue
        u, i = l.strip().split('-')
        # prediction_ already falls back to the bias-only or global estimate
        # for users or items that were not seen in training.
        predictions_file.write(u + '-' + i + ',' + str(prediction_(u, i)) + '\n')