In [1]:
import gzip
import csv
from collections import defaultdict
import scipy
import random
import scipy.optimize
import numpy

In [2]:
def readCSV(path):
    """Lazily yield one dict per data row of the CSV file at ``path``.

    The first row is used as the header; every later row is zipped with it
    into a ``{column: value}`` dict. The ``'rating'`` value is converted to
    ``int``; all other values stay strings.

    The file is opened with ``newline=''`` (as the ``csv`` module asks for
    file objects), so line breaks inside quoted fields are passed through
    unchanged. The ``with`` block closes the file once the generator is
    exhausted or closed. A file with no header row yields nothing.

    Raises:
        KeyError: if a row has no ``'rating'`` column.
        ValueError: if a ``'rating'`` value cannot be parsed by ``int``.
    """
    with open(path, 'rt', newline='') as f:
        rows = csv.reader(f)
        # Default of None avoids a StopIteration escaping the generator
        # (which Python turns into RuntimeError) when the file is empty.
        header = next(rows, None)
        if header is None:
            return
        for row in rows:
            record = dict(zip(header, row))
            record['rating'] = int(record['rating'])
            yield record

In [3]:
# Read the three splits fully into memory, in train / validation / test order.
trainset, validset, testset = (
    list(readCSV(split_path))
    for split_path in (
        "data/trainSet.csv",
        "data/validSet.csv",
        "data/testSet.csv",
    )
)

In [4]:
# Ground-truth ratings of each split, in the same order as its records.
labels = [record['rating'] for record in trainset]
validate_truth = [record['rating'] for record in validset]
test_truth = [record['rating'] for record in testset]

In [5]:
# Training records grouped by user id and by item id (filled by the loop below).
reviewsPerUser, reviewsPerItem = defaultdict(list), defaultdict(list)

In [6]:
# Index every training record under both its user and its item.
for d in trainset:
    user = d['user_id']
    item = d['item_id']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)

In [7]:
# Average rating over the training split; displayed as the cell output.
ratingMean = sum(record['rating'] for record in trainset) / len(trainset)
ratingMean

4.545234115884549

In [8]:
# Fixed orderings of the training users and items; unpack() and derivative()
# lay out the parameter vector in exactly this order.
users = list(reviewsPerUser)
items = list(reviewsPerItem)
nUsers = len(reviewsPerUser)
nItems = len(reviewsPerItem)
N = len(trainset)

In [9]:
# Initial global offset: the mean training rating (unpack() overwrites it
# with the optimiser's current value on every evaluation).
alpha = ratingMean

In [10]:
# Per-user / per-item bias terms; unseen keys read as 0.0 until unpack()
# replaces these with plain dicts.
userBiases, itemBiases = defaultdict(float), defaultdict(float)
# Per-user / per-item latent factor vectors, keyed by id.
userGamma, itemGamma = {}, {}

In [11]:
# Number of latent factors kept per user and per item.
K = 1
# Counter of objective evaluations; cost() prints it and increments it.
# cost() declares `global step` itself, so no module-level `global`
# statement is needed (it has no effect outside a function body).
step = 1

In [12]:
# Give every training user and item K small random factors in [-0.05, 0.05).
for u in reviewsPerUser:
    userGamma[u] = [0.1 * random.random() - 0.05 for _ in range(K)]
for i in reviewsPerItem:
    itemGamma[i] = [0.1 * random.random() - 0.05 for _ in range(K)]

In [13]:
def inner(x, y):
    """Return the dot product of ``x`` and ``y``.

    Elements are paired with ``zip``, so any surplus elements of the longer
    sequence are ignored; two empty sequences give ``0``.
    """
    return sum(left * right for left, right in zip(x, y))

In [14]:
def MSE(predictions, labels):
    """Return the mean squared error between ``predictions`` and ``labels``.

    Values are paired with ``zip`` (extra elements of the longer sequence
    are ignored) and the mean is taken over the number of pairs. With no
    pairs the division raises ``ZeroDivisionError``.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [15]:
def prediction(user, item):
    """Return the unclipped model rating for a (user, item) pair.

    The value is the global offset ``alpha`` plus the user bias, the item
    bias and the inner product of the two latent factor vectors, all read
    from the module-level parameter tables. Both ids must have an entry in
    ``userGamma`` / ``itemGamma``.
    """
    bias_part = alpha + userBiases[user] + itemBiases[item]
    return bias_part + inner(userGamma[user], itemGamma[item])

In [16]:
def prediction_(user, item):
    """Return the model rating for (user, item), clipped to the range 1..5.

    Ids missing from ``userBiases`` / ``itemBiases`` simply contribute
    nothing: the estimate falls back to ``alpha`` plus whichever bias is
    known. The latent-factor term is only added when both ids are known.
    Values below 1 are returned as ``1`` and values above 5 as ``5``.
    """
    known_user = user in userBiases
    known_item = item in itemBiases

    # Build the estimate term by term, in the same left-to-right order as
    # alpha + user bias + item bias + factor product.
    pred = alpha
    if known_user:
        pred = pred + userBiases[user]
    if known_item:
        pred = pred + itemBiases[item]
    if known_user and known_item:
        pred = pred + inner(userGamma[user], itemGamma[item])

    if pred < 1:
        return 1
    if pred > 5:
        return 5
    return pred

In [17]:
def unpack(theta):
    """Load the flat parameter vector ``theta`` into the module-level model.

    Layout of ``theta``: ``[alpha, nUsers user biases, nItems item biases,
    K factors per user (in ``users`` order), K factors per item (in
    ``items`` order)]``. ``alpha``, ``userBiases`` and ``itemBiases`` are
    rebound; ``userGamma`` and ``itemGamma`` are updated in place with
    slices of ``theta``.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma

    alpha = theta[0]

    user_bias_end = 1 + nUsers
    item_bias_end = user_bias_end + nItems
    userBiases = dict(zip(users, theta[1:user_bias_end]))
    itemBiases = dict(zip(items, theta[user_bias_end:item_bias_end]))

    # The factor vectors follow the biases, K entries per id.
    cursor = item_bias_end
    for u in users:
        userGamma[u] = theta[cursor:cursor + K]
        cursor += K
    for i in items:
        itemGamma[i] = theta[cursor:cursor + K]
        cursor += K

In [18]:
def cost(theta, labels, lamb):
    """Return the regularised training objective for parameter vector ``theta``.

    ``theta`` is first loaded into the module-level model via ``unpack``.
    The result is the training MSE against ``labels`` plus ``lamb`` times
    the squared value of every user/item bias and latent factor.

    Side effects: prints the evaluation counter, the training MSE and the
    validation MSE (from the clipped ``prediction_``), then increments the
    global ``step``.
    """
    global step
    unpack(theta)

    train_predictions = [prediction(d['user_id'], d['item_id']) for d in trainset]
    valid_predictions = [prediction_(d['user_id'], d['item_id']) for d in validset]
    train_mse = MSE(train_predictions, labels)
    valid_mse = MSE(valid_predictions, validate_truth)

    print("step = " + str(step))
    print("MSE = " + str(train_mse))
    print("validate MSE = " + str(valid_mse))
    step += 1

    # Add the L2 penalty onto the (unregularised) training error, one
    # parameter at a time.
    objective = train_mse
    for u in users:
        objective += lamb*userBiases[u]**2
        for k in range(K):
            objective += lamb*userGamma[u][k]**2
    for i in items:
        objective += lamb*itemBiases[i]**2
        for k in range(K):
            objective += lamb*itemGamma[i][k]**2
    return objective

In [19]:
def derivative(theta, labels, lamb):
    """Return the gradient of the regularised objective at ``theta``.

    ``theta`` is loaded into the module-level model via ``unpack``; the
    gradient is returned as a numpy array in the same layout:
    ``[alpha, user biases, item biases, user factors, item factors]``.
    ``labels`` is accepted for signature parity with ``cost`` but is not
    read here; the targets come from each training record's ``'rating'``.
    """
    unpack(theta)
    N = len(trainset)

    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {u: [0.0] * K for u in reviewsPerUser}
    dItemGamma = {i: [0.0] * K for i in reviewsPerItem}

    # Data term: derivative of the mean squared error, one record at a time.
    for d in trainset:
        u, i = d['user_id'], d['item_id']
        diff = prediction(u, i) - d['rating']
        weighted = 2/N*diff
        dalpha += weighted
        dUserBiases[u] += weighted
        dItemBiases[i] += weighted
        for k in range(K):
            dUserGamma[u][k] += 2/N*itemGamma[i][k]*diff
            dItemGamma[i][k] += 2/N*userGamma[u][k]*diff

    # Regulariser term: derivative of lamb * parameter**2.
    for u, bias in userBiases.items():
        dUserBiases[u] += 2*lamb*bias
        for k in range(K):
            dUserGamma[u][k] += 2*lamb*userGamma[u][k]
    for i, bias in itemBiases.items():
        dItemBiases[i] += 2*lamb*bias
        for k in range(K):
            dItemGamma[i][k] += 2*lamb*itemGamma[i][k]

    # Flatten in the order unpack() reads the vector back.
    dtheta = [dalpha]
    dtheta.extend(dUserBiases[u] for u in users)
    dtheta.extend(dItemBiases[i] for i in items)
    for u in users:
        dtheta.extend(dUserGamma[u])
    for i in items:
        dtheta.extend(dItemGamma[i])
    return numpy.array(dtheta)

In [20]:
# Regularisation weight handed to cost() and derivative() by the optimiser call.
lamb=0.001

In [21]:
# Starting point, laid out the way unpack() reads it:
# [alpha, user biases, item biases, user gammas, item gammas].
theta0 = ([alpha] +  # Initialize alpha
          [0.0]*(nUsers+nItems) +  # Initialize beta
          [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))])  # Gamma

# Keep the optimiser's result instead of discarding it.
thetaOpt, costOpt, optInfo = scipy.optimize.fmin_l_bfgs_b(
    cost, theta0, derivative, args=(labels, lamb), maxfun=30)

# cost()/derivative() leave the model globals at whichever point was
# evaluated last, which is not guaranteed to be the point the optimiser
# returns. Load the returned parameters explicitly before predicting.
unpack(thetaOpt)

# Display the same (x, f, info) tuple the bare call used to show.
thetaOpt, costOpt, optInfo

step = 1
MSE = 0.5101077837963454
validate MSE = 0.5151365790304477
step = 2
MSE = 0.5039225737184666
validate MSE = 0.5103250290589882
step = 3
MSE = 2.5293446421185757
validate MSE = 2.546573720215951
step = 4
MSE = 0.5046952665500837
validate MSE = 0.5113856730683405
step = 5
MSE = 0.49797979745928944
validate MSE = 0.5049870140627108
step = 6
MSE = 0.48760636395331786
validate MSE = 0.5014754936204091
step = 7
MSE = 0.4802082070757493
validate MSE = 0.49755142810159236
step = 8
MSE = 0.47917718769139106
validate MSE = 0.4971522503645473
step = 9
MSE = 0.47922490901584114
validate MSE = 0.49777785167065675
step = 10
MSE = 0.48016675148552185
validate MSE = 0.49842493936545773
step = 11
MSE = 0.48006920457643265
validate MSE = 0.4982977284643863
step = 12
MSE = 0.4801983587192863
validate MSE = 0.49835403021671687
step = 13
MSE = 0.48041839765002026
validate MSE = 0.4984715660031348
step = 14
MSE = 0.4805555450044409
validate MSE = 0.49854745517922755
step = 15
MSE = 0.48060642636354

(array([ 4.53767711e+00, -4.06880288e-03,  3.91953983e-03, ...,
        -6.18005719e-06,  1.17592197e-06, -1.26700583e-06]),
 0.49261649678552755,
 {'grad': array([ 6.10695436e-06, -7.52157923e-09,  3.42678745e-09, ...,
         -1.24092483e-08,  2.64925550e-09, -2.52335181e-09]),
  'task': b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 21,
  'nit': 17,
  'warnflag': 0})

In [22]:
# Clipped model predictions for every test record, in testset order.
predictions = []
for d in testset:
    user, item = d['user_id'], d['item_id']
    predictions.append(prediction_(user, item))

In [23]:
# Mean squared error of the clipped predictions against the test-split ratings.
MSE(predictions, test_truth)

0.4960154659046572