In [1]:
import numpy as np

%load_ext autoreload
%autoreload 2
import my_baselines

# Purchase prediction

In [2]:
with open('train.json') as f:
    data = f.readlines()

import ast
data = [ast.literal_eval(x) for x in data]
# data[0]

In [3]:
##########################
# 1
##########################

trainX, valX = data[:100000], data[100000:]

reviewer_item_pair = {}
reviewer_list = []
item_list = []

for d in data:
    reviewer = d['reviewerID']
    item = d['itemID']
    pair = reviewer_item_pair.get(reviewer, [])
    pair.append(item)
    reviewer_item_pair[reviewer] = pair
    reviewer_list.append(reviewer)
    item_list.append(item)
    
import random
cnt = 0
neg_pair = []

while(cnt < 100000):
    reviewer = random.choice(reviewer_list)
    item = random.choice(item_list)
    if item not in reviewer_item_pair[reviewer]:
        neg_pair.append((reviewer, item))
        cnt += 1    
        
new_valX = []
new_valY = []

for d in valX:
    new_valX.append((d['reviewerID'], d['itemID']))
    new_valY.append(1)

new_valX += neg_pair
new_valY += len(neg_pair) * [0]

In [4]:
def EvaluationPurchase(purchasePrediction, X, Y):
    acc = 0
    for (u, i), y in zip(X, Y):
        purchase = purchasePrediction(u, i)
        acc += int(y == purchase)
    return acc / len(X)

acc = EvaluationPurchase(my_baselines.ProfPurchasePrediction, new_valX, new_valY)
print("Accuracy of Prof's baseline model: %f" % acc)

Accuracy of Prof's baseline model: 0.499225


In [5]:
##########################
# 2
##########################
item_popularity = {}

for d in data:
    item = d['itemID']
    p = item_popularity.get(item, 0)
    item_popularity[item] = p + 1

item_popularity_rank = sorted(item_popularity.items(), key=lambda x: x[1], reverse=True)
item_rank = {x[0]:rank for rank, x in enumerate(item_popularity_rank)}

def p2(u, i):
    try:
        rank = item_rank[i]
    except KeyError:
        return 0
    if rank < len(item_rank) * threshold:
        return 1
    else:
        return 0

# my_baselines.purchaseBaseline(p2)  
threshold = 0.5
acc = EvaluationPurchase(p2, new_valX, new_valY)
print("Accuracy of top 50%% popularity model: %f" % acc)

threshold = 0.3
acc = EvaluationPurchase(p2, new_valX, new_valY)
print("Accuracy of top 30%% popularity model: %f" % acc)

# Using top 30th percentile of popularity is better.
# Large majority of purchases are within top 30% popularity items.

Accuracy of top 50% popularity model: 0.499645
Accuracy of top 30% popularity model: 0.498575


In [8]:
##########################
# 3
##########################
reviewer_cat_pair = {}
item_cat_pair = {}

for d in data:
    reviewer = d['reviewerID']
    item = d['itemID']
    cat = d['categories']
    
    reviewer_pair = reviewer_cat_pair.get(reviewer, [])
    item_pair = item_cat_pair.get(item, [])
    for c in cat:
        if c not in reviewer_pair:
            reviewer_pair.append(c)
        if c not in item_pair:
            item_pair.append(c)
        
    reviewer_cat_pair[reviewer] = reviewer_pair
    item_cat_pair[item] = item_pair
    
def p3(u, i):
    try:
        reviewer_pair = reviewer_cat_pair[u]
        item_pair = item_cat_pair[i]
    except KeyError:
        return 0
    
    for i in item_pair:
        if i in reviewer_pair: return 1
    return 0

# my_baselines.purchaseBaseline(p3)
acc = EvaluationPurchase(p3, new_valX, new_valY)
print("Accuracy of repeat purchase model: %f" % acc)

Accuracy of repeat purchase model: 0.889675


In [7]:
def p3(u, i):
    try:
        reviewer_pair = reviewer_cat_pair[u]
        item_pair = item_cat_pair[i]
    except KeyError:
        return 0
    
    for i in item_pair:
        if i in reviewer_pair: return 1
    return 0

In [7]:
##########################
# 4
##########################
kaggle_username = 'Fanjin'
kaggle_username

'Fanjin'

# Rating prediction

In [8]:
new_valX = [(d['reviewerID'], d['itemID']) for d in valX]
new_valY = [d['rating'] for d in valX]

In [9]:
##########################
# 5
##########################
alpha = 0.
for d in trainX:
    alpha += d['rating']
alpha /= len(trainX)


def p5(u, i):
    return alpha

# my_baselines.ratingBaseline(p5)
print('The value of Alpha: %f' % alpha)

The value of Alpha: 4.232000


In [10]:
##########################
# 6
##########################


def p6_init(trainX):
    alpha = 0.
    beta_user = {}
    beta_item = {}

    for d in trainX:
        reviewer = d['reviewerID']
        item = d['itemID']
        rating = d['rating']

        bu = beta_user.get(reviewer, (0., 0))
        bi = beta_item.get(item, (0., 0))

        bu = (bu[0]+rating, bu[1]+1)
        bi = (bi[0]+rating, bi[1]+1)

        alpha += rating
        beta_user[reviewer] = bu
        beta_item[item] = bi

    alpha /= len(trainX)
    beta_user = {r:(b[0]/b[1] - alpha) for r,b in beta_user.items()}
    beta_item = {r:(b[0]/b[1] - alpha) for r,b in beta_item.items()}
    return alpha, beta_user, beta_item

def p6_iterative(alpha, beta_user, beta_item, lam, trainX, n_iter):
    for _ in range(n_iter):
        alpha, beta_user, beta_item = p6_update(alpha, beta_user, beta_item, lam, trainX)
        
    return alpha, beta_user, beta_item

def p6_update(alpha, beta_user, beta_item, lam, trainX):
    beta_user_update = {}
    beta_item_update = {}
    alpha_t = 0.
    cnt = 0

    for d in trainX:
        u = d['reviewerID']
        i = d['itemID']
        rating = d['rating']

        bu = beta_user.get(u, 0.)
        bi = beta_item.get(i, 0.)

        bu_t = beta_user_update.get(u, (0., 0))
        bi_t = beta_item_update.get(i, (0., 0))

        alpha_t += rating - (bu + bi)
        cnt += 1
        bu_t = (bu_t[0]+(rating - (alpha + bi)), bu_t[1]+1)
        bi_t = (bi_t[0]+(rating - (alpha + bu)), bi_t[1]+1)

        beta_user_update[u] = bu_t
        beta_item_update[i] = bi_t

    alpha_t /= len(trainX)
    beta_user_update = {r:(b[0]/(b[1]+lam)) for r,b in beta_user_update.items()}
    beta_item_update = {r:(b[0]/(b[1]+lam)) for r,b in beta_item_update.items()}
    
    return alpha_t, beta_user_update, beta_item_update

In [11]:
def p6_train(lam, n_iter, trainX):
    alpha, beta_user, beta_item = p6_init(trainX)
    alpha, beta_user, beta_item = p6_iterative(alpha, beta_user, beta_item, lam, trainX, n_iter)
    return alpha, beta_user, beta_item
    
def p6(u, i):
    bu = beta_user.get(u, 0.)
    bi = beta_item.get(i, 0.)
    return alpha + bu + bi

def EvaluationRating(ratingPrediction, X, Y):
    mse = 0.
    for (u, i), y in zip(X, Y):
        rating = ratingPrediction(u, i)
        mse += (y - rating) ** 2
    return mse / len(X)

lam = 1
alpha, beta_user, beta_item = p6_train(lam, 1, trainX)


# my_baselines.ratingBaseline(p6)
mse = EvaluationRating(p6, new_valX, new_valY)
print("MSE Problem 6 model: %f" % mse)

MSE Problem 6 model: 1.247006


In [12]:
##########################
# 7
##########################

beta_user_rank = sorted(beta_user.items(), key=lambda x: x[1])
beta_item_rank = sorted(beta_item.items(), key=lambda x: x[1])

print('Largest beta')
print('User# ' + beta_user_rank[-1][0])
print('Item# ' + beta_item_rank[-1][0])

print('Smallest beta')
print('User# ' + beta_user_rank[0][0])
print('Item# ' + beta_item_rank[0][0])

Largest beta
User# U605818049
Item# I262560051
Smallest beta
User# U204516481
Item# I444377179


In [13]:
##########################
# 8
##########################

lmbda = [0.01, 0.1, 1, 2, 5, 10]
best_mse = float('Inf')
best_lam = None

for lam in lmbda:
    alpha, beta_user, beta_item = p6_train(lam, 5,trainX)
    mse = EvaluationRating(p6, new_valX, new_valY)
    
    if mse <= best_mse:
        best_mse = mse
        best_lam = lam

print('Best lambda is: %f, Its MSE: %f' % (best_lam, best_mse))

Best lambda is: 5.000000, Its MSE: 1.139957
