In [45]:
import numpy  as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy
import urllib
import scipy.optimize
import random
from math import exp
from math import log
from collections import defaultdict
import gzip
import pandas as pd
import copy

In [46]:
def MSE(predictions, labels):
    """Return the mean squared error between ``predictions`` and ``labels``.

    Both inputs are converted with ``np.array`` and paired element by element
    with ``zip``, so a longer input is truncated to the length of the shorter
    one. Empty input raises ``ZeroDivisionError``.
    """
    predicted = np.array(predictions)
    actual = np.array(labels)
    squared_errors = []
    for p, a in zip(predicted, actual):
        squared_errors.append((p - a) ** 2)
    return sum(squared_errors) / len(squared_errors)

def MAE(predictions, labels):
    """Return the mean absolute error between ``predictions`` and ``labels``.

    The inputs are paired with ``zip`` (extra elements of the longer one are
    ignored). Empty input raises ``ZeroDivisionError``.
    """
    absolute_errors = []
    for predicted, actual in zip(predictions, labels):
        absolute_errors.append(abs(predicted - actual))
    return sum(absolute_errors) / len(absolute_errors)


def RMSE(predictions, labels):
    """Return the root-mean-squared error between ``predictions`` and ``labels``.

    The inputs are paired with ``zip`` (extra elements of the longer one are
    ignored). Empty input raises ``ZeroDivisionError``.
    """
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    # Take the root of the *mean* squared error. Dividing the root of the sum
    # by n (as the previous version did) under-reports the error by sqrt(n).
    return (sum(differences) / len(differences)) ** 0.5
  

In [4]:
# Directory holding the pickled splits; later cells reuse `dataset_dir`.
dataset_dir = "../Datasets/final/"
# Read the two training pickles in the same order as before (X first, then y).
X_train, y_train = (
    pd.read_pickle(dataset_dir + pickle_name)
    for pickle_name in ("X_train.pkl", "y_train.pkl")
)

In [6]:
# Compact training frame with the three columns the model code reads:
# "user" and "id" come from X_train, "review" is the rating from y_train
# (aligned on the index, exactly like a column assignment).
X = pd.DataFrame(
    {"user": X_train["gPlusUserId"], "id": X_train["gPlusPlaceId"]}
).assign(review=y_train["rating"])

In [20]:
# Index the training ratings by user and by item, collect the distinct users
# and items, and compute the mean rating (printed, and kept in `summ`).
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
users = set()
items = set()
labels = []
summ = 0
# Walk the three columns positionally. Looking rows up with X.user[i] is a
# *label* lookup: it only works when the index is exactly 0..n-1, and with any
# other index it either raises or visits rows in a different order than the
# iterrows() pass that produces the predictions `labels` is compared against.
for u, b, r in zip(X["user"], X["id"], X["review"]):
    ratingsPerUser[u].append((b, r))
    ratingsPerItem[b].append((u, r))
    users.add(u)
    items.add(b)
    labels.append(r)
    summ = summ + r
summ /= len(X)
print(summ)

3.8368615036650597


In [21]:
# Start the global offset `alpha` at the mean training rating held in `summ`.
alpha = summ


In [22]:
# Initial model state: bias tables default to 0.0, and every user and item
# gets a K-dimensional vector drawn uniformly from [-0.05, 0.05).
userBiases = defaultdict(float)
itemBiases = defaultdict(float)
# Freeze the id sets into lists so they have a fixed iteration order.
users = list(users)
items = list(items)
K = 5
userGamma = {u: [random.random() * 0.1 - 0.05 for _ in range(K)] for u in users}
itemGamma = {i: [random.random() * 0.1 - 0.05 for _ in range(K)] for i in items}

In [35]:
def unpack(theta):
    """Scatter the flat parameter vector ``theta`` into the model globals.

    Layout of ``theta``: ``[alpha, user biases (nUsers), item biases (nItems),
    user gammas (nUsers * K), item gammas (nItems * K)]``, with users and items
    taken in the order of the global ``users`` / ``items`` lists.

    ``alpha``, ``userBiases`` and ``itemBiases`` are rebound; ``userGamma`` and
    ``itemGamma`` are updated in place with K-long slices of ``theta``.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma
    alpha = theta[0]
    user_end = 1 + nUsers
    item_end = user_end + nItems
    userBiases = dict(zip(users, theta[1:user_end]))
    itemBiases = dict(zip(items, theta[user_end:item_end]))
    cursor = item_end
    # User vectors come first, then item vectors, K entries each.
    for table, keys in ((userGamma, users), (itemGamma, items)):
        for key in keys:
            table[key] = theta[cursor:cursor + K]
            cursor += K



def inner(x, y):
    """Return the sum of element-wise products of ``x`` and ``y``.

    Elements are paired with ``zip``, so the longer input is truncated;
    two empty inputs give ``0``.
    """
    total = 0
    for a, b in zip(x, y):
        total = total + a * b
    return total

def dotprod(x,y):
    """Return ``x.dot(y.transpose())``.

    Both arguments must provide the methods used here (``dot`` on ``x``,
    ``transpose`` on ``y``), e.g. NumPy arrays.
    """
    y_transposed = y.transpose()
    return x.dot(y_transposed)

def prediction(user, item):
    """Predict the rating ``user`` would give ``item`` from the model globals.

    When both ids are known the result is
    ``alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])``.
    If either id is missing from its bias table, the global offset ``alpha``
    alone is returned.
    """
    if user not in userBiases or item not in itemBiases:
        return alpha
    interaction = inner(userGamma[user], itemGamma[item])
    return alpha + userBiases[user] + itemBiases[item] + interaction
    
def cost(theta, labels, lamb):
    """Regularised objective evaluated at ``theta``.

    Unpacks ``theta`` into the model globals, predicts every row of the global
    frame ``X``, and returns ``MSE(predictions, labels)`` plus ``lamb`` times
    the sum of squares of all biases and gamma entries (``alpha`` is not
    penalised). The unregularised MSE is printed on every call.
    """
    unpack(theta)
    predictions = []
    for _, row in X.iterrows():
        predictions.append(prediction(row['user'], row['id']))
    total = MSE(predictions, labels)
    print("MSE = " + str(total))
    # Penalise user parameters first, then item parameters.
    for keys, biases, gammas in ((users, userBiases, userGamma),
                                 (items, itemBiases, itemGamma)):
        for key in keys:
            total += lamb*biases[key]**2
            for k in range(K):
                total += lamb*gammas[key][k]**2
    return total


def derivative(theta, labels, lamb):
    """Gradient of the regularised objective with respect to ``theta``.

    Unpacks ``theta`` into the model globals, accumulates the MSE gradient over
    the rows of the global frame ``X`` (targets are read from ``X['review']``;
    the ``labels`` argument is not used in this body), adds the ``2*lamb*param``
    regularisation term for biases and gammas, and returns a ``numpy`` array
    laid out like ``theta``: alpha, user biases, item biases, user gammas,
    item gammas.
    """
    unpack(theta)
    N = len(X)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {u: [0.0] * K for u in ratingsPerUser}
    dItemGamma = {i: [0.0] * K for i in ratingsPerItem}

    # Data term: each observed rating contributes to alpha, both biases and
    # both gamma vectors.
    for _, row in X.iterrows():
        u, i = row['user'], row['id']
        diff = prediction(u, i) - row['review']
        step = 2/N*diff
        dalpha += step
        dUserBiases[u] += step
        dItemBiases[i] += step
        user_vec, item_vec = userGamma[u], itemGamma[i]
        user_grad, item_grad = dUserGamma[u], dItemGamma[i]
        for k in range(K):
            user_grad[k] += 2/N*item_vec[k]*diff
            item_grad[k] += 2/N*user_vec[k]*diff

    # Regularisation term for user parameters, then item parameters.
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
        user_vec, user_grad = userGamma[u], dUserGamma[u]
        for k in range(K):
            user_grad[k] += 2*lamb*user_vec[k]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
        item_vec, item_grad = itemGamma[i], dItemGamma[i]
        for k in range(K):
            item_grad[k] += 2*lamb*item_vec[k]

    # Flatten in the same order unpack() reads theta.
    dtheta = [dalpha]
    dtheta.extend(dUserBiases[u] for u in users)
    dtheta.extend(dItemBiases[i] for i in items)
    for u in users:
        dtheta.extend(dUserGamma[u])
    for i in items:
        dtheta.extend(dItemGamma[i])
    return numpy.array(dtheta)

In [37]:
# Fit the model with L-BFGS-B.
nUsers = len(users)
nItems = len(items)
K = 5
# Regularisation strength handed to cost()/derivative(). This is the value the
# optimiser was already being called with; the old `lamb = 1` was never used.
lamb = 0.001

# Starting point, laid out the way unpack() reads it.
theta0 = ([alpha] + # Initialize alpha
          [0.0]*(nUsers+nItems) + # Initialize beta
          [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))]) # Gamma

theta_opt, final_cost, opt_info = scipy.optimize.fmin_l_bfgs_b(
    cost, theta0, derivative, args = (labels, lamb))

# prediction() reads the model globals, which hold whatever the optimiser's
# last cost()/derivative() call unpacked. That is not guaranteed to be the
# returned optimum, so load the optimum explicitly.
unpack(theta_opt)

# Keep the optimiser result as the cell's displayed value.
(theta_opt, final_cost, opt_info)

MSE = 1.6868873776359776
MSE = 1.6896548948183898
MSE = 1.847592157340711
MSE = 1.6791182094756796
MSE = 1.6612154900076728
MSE = 1.6610153288442242
MSE = 1.6610001211621417
MSE = 1.6610001711681912
MSE = 1.661000760111357
MSE = 1.6610585381111622
MSE = 1.6613364360903904
MSE = 1.6616737821039156
MSE = 1.6618885349034975
MSE = 1.661911643871896
MSE = 1.6619053041585015
MSE = 1.6619007148210756
MSE = 1.6619022946021127
MSE = 1.6619285484104216
MSE = 1.6619863301502475
MSE = 1.6619565512271386
MSE = 1.662041578847292
MSE = 1.662021310611376
MSE = 1.6620213168915918
MSE = 1.662022495026603
MSE = 1.662025547968272
MSE = 1.6620266381631956
MSE = 1.66203508554562
MSE = 1.6620302811715169


(array([ 3.83419764e+00,  2.40723501e-03,  2.39816696e-03, ...,
        -1.43721583e-06, -6.09261433e-07, -1.08904873e-06]),
 1.674180429567798,
 {'grad': array([-5.12793065e-06,  7.12111530e-11,  1.62371082e-10, ...,
         -2.88053276e-09, -1.22209630e-09, -2.18248764e-09]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 28,
  'nit': 24,
  'warnflag': 0})

In [40]:
# Score the fitted model on the validation split.
X_val = pd.read_pickle(dataset_dir+"X_val.pkl")
y_val = pd.read_pickle(dataset_dir+"y_val.pkl")
X_v = pd.DataFrame()
X_v["user"] = X_val["gPlusUserId"]
X_v["id"] = X_val["gPlusPlaceId"]
X_v["review"] = y_val["rating"]
# Iterate the two id columns directly instead of materialising every row
# with iterrows().
validPreds = [prediction(user, item) for user, item in zip(X_v["user"], X_v["id"])]
# Keep the targets under their own name: the optimiser cell passes the global
# `labels` (training targets) to cost(), so overwriting it here would make a
# re-run of the fit silently train against validation ratings.
validLabels = X_v["review"].tolist()
print(MSE(validPreds,validLabels))
print(MAE(validPreds,validLabels))

1.6951475763890387
1.0745110848472417


In [44]:
# Score the fitted model on the test split.
X_tes = pd.read_pickle(dataset_dir+"X_test.pkl")
y_tes = pd.read_pickle(dataset_dir+"y_test.pkl")
X_t = pd.DataFrame()
X_t["user"] = X_tes["gPlusUserId"]
X_t["id"] = X_tes["gPlusPlaceId"]
X_t["review"] = y_tes["rating"]
# Iterate the two id columns directly instead of materialising every row
# with iterrows().
testPreds = [prediction(user, item) for user, item in zip(X_t["user"], X_t["id"])]
# Keep the targets under their own name: the optimiser cell passes the global
# `labels` (training targets) to cost(), so overwriting it here would make a
# re-run of the fit silently train against test ratings.
testLabels = X_t["review"].tolist()
print(MSE(testPreds,testLabels))
print(MAE(testPreds,testLabels))

1.673983892405506
1.0695566110287056


In [47]:
## Repeat the same pipeline on the temporal dataset

In [48]:
# Load the six pickles of the temporal split, in the same order as before.
(X_train, X_val, X_test, y_train, y_val, y_test) = map(
    pd.read_pickle,
    [
        "../Datasets/temporal/final/X_train.pkl",
        "../Datasets/temporal/final/X_val.pkl",
        "../Datasets/temporal/final/X_test.pkl",
        "../Datasets/temporal/final/y_train.pkl",
        "../Datasets/temporal/final/y_val.pkl",
        "../Datasets/temporal/final/y_test.pkl",
    ],
)

In [49]:
# Compact training frame with the three columns the model code reads:
# "user" and "id" come from X_train, "review" is the rating from y_train
# (aligned on the index, exactly like a column assignment).
X = pd.DataFrame(
    {"user": X_train["gPlusUserId"], "id": X_train["gPlusPlaceId"]}
).assign(review=y_train["rating"])

In [50]:
# Index the training ratings by user and by item, collect the distinct users
# and items, and compute the mean rating (printed, and kept in `summ`).
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
users = set()
items = set()
labels = []
summ = 0
# Walk the three columns positionally. Looking rows up with X.user[i] is a
# *label* lookup: it only works when the index is exactly 0..n-1, and with any
# other index it either raises or visits rows in a different order than the
# iterrows() pass that produces the predictions `labels` is compared against.
for u, b, r in zip(X["user"], X["id"], X["review"]):
    ratingsPerUser[u].append((b, r))
    ratingsPerItem[b].append((u, r))
    users.add(u)
    items.add(b)
    labels.append(r)
    summ = summ + r
summ /= len(X)
print(summ)

3.907114673446477


In [51]:
# Start the global offset `alpha` at the mean training rating held in `summ`.
alpha = summ

In [52]:
# Initial model state: bias tables default to 0.0, and every user and item
# gets a K-dimensional vector drawn uniformly from [-0.05, 0.05).
userBiases = defaultdict(float)
itemBiases = defaultdict(float)
# Freeze the id sets into lists so they have a fixed iteration order.
users = list(users)
items = list(items)
K = 5
userGamma = {u: [random.random() * 0.1 - 0.05 for _ in range(K)] for u in users}
itemGamma = {i: [random.random() * 0.1 - 0.05 for _ in range(K)] for i in items}

In [53]:
def unpack(theta):
    """Scatter the flat parameter vector ``theta`` into the model globals.

    Layout of ``theta``: ``[alpha, user biases (nUsers), item biases (nItems),
    user gammas (nUsers * K), item gammas (nItems * K)]``, with users and items
    taken in the order of the global ``users`` / ``items`` lists.

    ``alpha``, ``userBiases`` and ``itemBiases`` are rebound; ``userGamma`` and
    ``itemGamma`` are updated in place with K-long slices of ``theta``.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma
    alpha = theta[0]
    user_end = 1 + nUsers
    item_end = user_end + nItems
    userBiases = dict(zip(users, theta[1:user_end]))
    itemBiases = dict(zip(items, theta[user_end:item_end]))
    cursor = item_end
    # User vectors come first, then item vectors, K entries each.
    for table, keys in ((userGamma, users), (itemGamma, items)):
        for key in keys:
            table[key] = theta[cursor:cursor + K]
            cursor += K



def inner(x, y):
    """Return the sum of element-wise products of ``x`` and ``y``.

    Elements are paired with ``zip``, so the longer input is truncated;
    two empty inputs give ``0``.
    """
    total = 0
    for a, b in zip(x, y):
        total = total + a * b
    return total

def dotprod(x,y):
    """Return ``x.dot(y.transpose())``.

    Both arguments must provide the methods used here (``dot`` on ``x``,
    ``transpose`` on ``y``), e.g. NumPy arrays.
    """
    y_transposed = y.transpose()
    return x.dot(y_transposed)

def prediction(user, item):
    """Predict the rating ``user`` would give ``item`` from the model globals.

    When both ids are known the result is
    ``alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])``.
    If either id is missing from its bias table, the global offset ``alpha``
    alone is returned.
    """
    if user not in userBiases or item not in itemBiases:
        return alpha
    interaction = inner(userGamma[user], itemGamma[item])
    return alpha + userBiases[user] + itemBiases[item] + interaction
    
def cost(theta, labels, lamb):
    """Regularised objective evaluated at ``theta``.

    Unpacks ``theta`` into the model globals, predicts every row of the global
    frame ``X``, and returns ``MSE(predictions, labels)`` plus ``lamb`` times
    the sum of squares of all biases and gamma entries (``alpha`` is not
    penalised). The unregularised MSE is printed on every call.
    """
    unpack(theta)
    predictions = []
    for _, row in X.iterrows():
        predictions.append(prediction(row['user'], row['id']))
    total = MSE(predictions, labels)
    print("MSE = " + str(total))
    # Penalise user parameters first, then item parameters.
    for keys, biases, gammas in ((users, userBiases, userGamma),
                                 (items, itemBiases, itemGamma)):
        for key in keys:
            total += lamb*biases[key]**2
            for k in range(K):
                total += lamb*gammas[key][k]**2
    return total


def derivative(theta, labels, lamb):
    """Gradient of the regularised objective with respect to ``theta``.

    Unpacks ``theta`` into the model globals, accumulates the MSE gradient over
    the rows of the global frame ``X`` (targets are read from ``X['review']``;
    the ``labels`` argument is not used in this body), adds the ``2*lamb*param``
    regularisation term for biases and gammas, and returns a ``numpy`` array
    laid out like ``theta``: alpha, user biases, item biases, user gammas,
    item gammas.
    """
    unpack(theta)
    N = len(X)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {u: [0.0] * K for u in ratingsPerUser}
    dItemGamma = {i: [0.0] * K for i in ratingsPerItem}

    # Data term: each observed rating contributes to alpha, both biases and
    # both gamma vectors.
    for _, row in X.iterrows():
        u, i = row['user'], row['id']
        diff = prediction(u, i) - row['review']
        step = 2/N*diff
        dalpha += step
        dUserBiases[u] += step
        dItemBiases[i] += step
        user_vec, item_vec = userGamma[u], itemGamma[i]
        user_grad, item_grad = dUserGamma[u], dItemGamma[i]
        for k in range(K):
            user_grad[k] += 2/N*item_vec[k]*diff
            item_grad[k] += 2/N*user_vec[k]*diff

    # Regularisation term for user parameters, then item parameters.
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
        user_vec, user_grad = userGamma[u], dUserGamma[u]
        for k in range(K):
            user_grad[k] += 2*lamb*user_vec[k]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
        item_vec, item_grad = itemGamma[i], dItemGamma[i]
        for k in range(K):
            item_grad[k] += 2*lamb*item_vec[k]

    # Flatten in the same order unpack() reads theta.
    dtheta = [dalpha]
    dtheta.extend(dUserBiases[u] for u in users)
    dtheta.extend(dItemBiases[i] for i in items)
    for u in users:
        dtheta.extend(dUserGamma[u])
    for i in items:
        dtheta.extend(dItemGamma[i])
    return numpy.array(dtheta)

In [54]:
# Fit the model on the temporal training split with L-BFGS-B.
nUsers = len(users)
nItems = len(items)
K = 5
# Regularisation strength handed to cost()/derivative(). This is the value the
# optimiser was already being called with; the old `lamb = 1` was never used.
lamb = 0.001

# Starting point, laid out the way unpack() reads it.
theta0 = ([alpha] + # Initialize alpha
          [0.0]*(nUsers+nItems) + # Initialize beta
          [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))]) # Gamma

theta_opt, final_cost, opt_info = scipy.optimize.fmin_l_bfgs_b(
    cost, theta0, derivative, args = (labels, lamb))

# prediction() reads the model globals, which hold whatever the optimiser's
# last cost()/derivative() call unpacked. That is not guaranteed to be the
# returned optimum, so load the optimum explicitly.
unpack(theta_opt)

# Keep the optimiser result as the cell's displayed value.
(theta_opt, final_cost, opt_info)

MSE = 1.4005824025880729
MSE = 1.3999605991493735
MSE = 1.3974933746706046
MSE = 8.773672059222733
MSE = 1.5620177250675524
MSE = 1.3765055274139466
MSE = 1.3768479486831624
MSE = 1.3768572390591471
MSE = 1.3768949901246514
MSE = 1.377055427377236
MSE = 1.377459064607603
MSE = 1.3775471133707986
MSE = 1.377632950629717
MSE = 1.3776820777154652
MSE = 1.3777605662819754
MSE = 1.3777663803286992
MSE = 1.377765971402898
MSE = 1.3777659992827904


(array([ 3.90613825e+00,  2.24775267e-03,  2.24741147e-03, ...,
         5.04163121e-07, -9.84724907e-07,  1.67563619e-07]),
 1.3888215054041557,
 {'grad': array([ 7.98215138e-06,  1.29500386e-10,  2.04166442e-10, ...,
          1.00930641e-09, -1.95697329e-09,  3.14954788e-10]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 18,
  'nit': 13,
  'warnflag': 0})

In [61]:
from metrics import MAE,MSE

In [77]:
# Score the temporal model on the temporal test split.
# (The former `X_t = 0` was a dead store: X_t is rebuilt right here.)
X_t = pd.DataFrame()
X_t["user"] = X_test["gPlusUserId"]
X_t["id"] = X_test["gPlusPlaceId"]
X_t["review"] = y_test["rating"]
# Iterate the two id columns directly instead of materialising every row
# with iterrows().
testPreds = [prediction(user, item) for user, item in zip(X_t["user"], X_t["id"])]
# The metrics are computed against y_test["rating"], so the separate `labels`
# list that used to be built here was never read; building it only cost a
# second full pass and overwrote the training targets the optimiser cell uses.
print(MSE(testPreds,y_test["rating"]))
print(MAE(testPreds,y_test["rating"]))

2.535442192195376
1.3139664711321033
