In [95]:
import gzip
import implicit
import random
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [96]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        yield l.strip().split(',')

In [97]:
example = readCSV("train_Interactions.csv.gz")
print(next(example))
del example

['u79354815', 'b14275065', '4']


In [98]:
### Ratings Prediction
train_size = 199000
data       = [line for line in readCSV("train_Interactions.csv.gz")]
# random.shuffle(data)
train      = data[:train_size]
val        = data[train_size:]

allRatings = []
userBookRatings = defaultdict(lambda: defaultdict(float))
userRatings = defaultdict(list)
userBooks   = defaultdict(set)
bookUsers   = defaultdict(set)
all_users    = set()
all_books    = set()

for user, book, rating in train:
    all_users.add(user)
    all_books.add(book)
    rating = int(rating)
    allRatings.append(rating)
    userRatings[user].append(rating)
    userBookRatings[user][book] = rating
    userBooks[user].add(book)
    bookUsers[book].add(user)

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for user in userRatings:
    userAverage[user] = sum(userRatings[user]) / len(userRatings[user])

In [99]:
# Coordinate Descent
def coordinate_descent(lambda_opt = 1, iterations = 100):

    alpha_sum, bu_sum, bb_sum = 0, 0, 0

    train_len = len(train)
    bu = defaultdict(lambda: 1)
    bb = defaultdict(lambda: 1)

    for descent in range(iterations):
        alpha_sum = 0
        for user, book, _ in train:
            alpha_sum += userBookRatings[user][book] - (bu[user] + bb[book])
        alpha = alpha_sum / train_len

        for user in userRatings:
            bu_sum = 0
            for book in userBooks[user]:
                bu_sum += userBookRatings[user][book] - (alpha + bb[book])
            bu[user] = bu_sum / (lambda_opt + len(userBooks[user]))

        for book in bookUsers:
            bb_sum = 0
            for user in bookUsers[book]:
                bb_sum += userBookRatings[user][book] - (alpha + bu[user])
            bb[book] = bb_sum / (lambda_opt + len(bookUsers[book]))
            
    return alpha, bu, bb

# alpha, bu, bb = coordinate_descent(lambda_opt = 1, iterations = 100)

In [100]:
# lambda_values = np.logspace(0, 1, num = 20)

# MSEs = []
# loop_count = 0
# for lambda_opt in lambda_values:
#     loop_count += 1; print(loop_count, end = ', ')
#     alpha, bu, bb = coordinate_descent(lambda_opt, iterations = 20)
#     rating_labels = []
#     diff = 0
#     for user, book, rating in val:
#         user_rating = alpha + bu[user] + bb[book]
#         diff += (user_rating - int(rating)) ** 2
#     MSE = diff / len(val)
#     MSEs.append(MSE)

# plt.plot(lambda_values, MSEs, label='Validation')
# plt.ylabel('MSE')
# plt.xlabel('lambda value'), plt.xscale('log')
# plt.title('MSE vs lambda values')
# plt.legend()
# plt.show()

# indx = MSEs.index(min(MSEs))
# print('\nLambda for lowest MSE is:', lambda_values[indx])
# print('\nBest MSE is:', MSEs[indx])

In [101]:
alpha_normal, bu_normal, bb_normal = coordinate_descent(2.8, iterations = 100)

diff = 0
for user, book, rating in val:
    user_rating = alpha_normal + bu_normal[user] + bb_normal[book]
    diff += (user_rating - int(rating)) ** 2
    MSE_normal = diff / len(val)
print(MSE_normal)

1.1031931568630036


In [102]:
# Best k = 4, step_size = 400, lambda_opt1 = 2.8 
# k = 3 or 4 is the best, try different lambda_opt1
train_len = len(train)
k = 6
all_users_lst = list(all_users)
all_books_lst = list(all_books)
all_users_lst.sort()
all_books_lst.sort()

book_indices = defaultdict(int)
for book in userBooks[user]:
    book_ind = all_books_lst.index(book)
    book_indices[book] = book_ind

user_indices = defaultdict(int)
for user in bookUsers[book]:
    user_ind = all_users_lst.index(user)
    user_indices[user] = user_ind

gamma_u = ((np.random.rand(len(all_users_lst), k)) - 0.5) * 0.1
gamma_b = ((np.random.rand(len(all_books_lst), k)) - 0.5) * 0.1

In [104]:
step_size = 1 * 50; alpha = 0
lambda_opt1 = 2.8; lambda_opt2 = 1
print(step_size)
print(lambda_opt2)
print('Starting Descent...')
# print('gamma u', gamma_u[:6])
# print('gamma b', gamma_b[:6])

alpha = alpha_normal
bu = bu_normal
bb = bb_normal
count = 0
overfit_count = 0
MSE = MSE_normal
prev_MSE = 10
for i in range(5000000):  
    count += 1
    if MSE_normal - prev_MSE == 10 ** -4 : step_size = step_size / 10
    rating_labels = []
    diff = 0
    for user, book, rating in val:
        if user in all_users_lst and book in all_books_lst: 
            user_ind = user_indices[user]
            book_ind = book_indices[book]
            mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
            user_rating = alpha + bu[user] + bb[book] + mult
        else:
            user_rating = alpha + bu[user] + bb[book];
        diff += (user_rating - int(rating)) ** 2
    print('Step size is: ', step_size, 'MSE is ', MSE)
    MSE = diff / len(val)
    cur_MSE = MSE
    
    if cur_MSE > prev_MSE and count > 10:
        overfit_count += 1
        if overfit_count >= 5:
            print('Probably overfitting now, breaking...')
            print('Loop count was:', i)
            break
        
    prev_MSE = cur_MSE
    for j in range(1):
        # Alpha
        alpha_sum = 0
        for user, book, _ in train:
            user_ind = user_indices[user]
            book_ind = book_indices[book]
            gamma_mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
            alpha_sum += userBookRatings[user][book] - (bu[user] + bb[book] + gamma_mult)
        alpha = alpha_sum / train_len

        # Beta_u
        for user in userRatings:
            bu_sum = 0
            for book in userBooks[user]:
                user_ind = user_indices[user]
                book_ind = book_indices[book]
                gamma_mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
                bu_sum += userBookRatings[user][book] - (alpha + bb[book] + gamma_mult)
            bu[user] = bu_sum / (lambda_opt1 + len(userBooks[user]))

        # Beta_b
        for book in bookUsers:
            bb_sum = 0
            for user in bookUsers[book]:
                user_ind = user_indices[user]
                book_ind = book_indices[book]
                gamma_mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
                bb_sum += userBookRatings[user][book] - (alpha + bu[user] + gamma_mult)
            bb[book] = bb_sum / (lambda_opt1 + len(bookUsers[book]))
        
    print('gamma u ave', sum([abs(i) for i in gamma_u[:,0]]) / len(gamma_u))
    for j in range(1):
        # Gamma_u
        for user_ind, user in enumerate(all_users_lst):
            gamma_u_sum = 0
            for book in userBooks[user]:
                book_ind = book_indices[book]
                pred1 = alpha + bu[user] + bb[book]
                pred2 = np.dot(gamma_u[user_ind], gamma_b[book_ind])
                rating = userBookRatings[user][book]
                err = rating - (pred1 + pred2)
                gamma_u_sum += err
            gradient = - 2 * gamma_u_sum * gamma_b[book_ind] / train_len
            gamma_u[user_ind] = gamma_u[user_ind] - step_size * gradient * lambda_opt2
        # Gamma_b
        for book_ind, book in enumerate(all_books_lst):
            gamma_b_sum = 0
            for user in bookUsers[book]:
                user_ind = user_indices[user]
                pred1 = alpha + bu[user] + bb[book]
                pred2 = np.dot(gamma_u[user_ind], gamma_b[book_ind])
                rating = userBookRatings[user][book]
                err = rating - (pred1 + pred2)
                gamma_b_sum += err
            gradient = - 2 * gamma_b_sum * gamma_u[user_ind] / train_len
            gamma_b[book_ind] = gamma_b[book_ind] - step_size * gradient * lambda_opt2

50
1
Starting Descent...
Step size is:  50 MSE is  1.1031931568630036
gamma u ave 0.06263158422725493
Step size is:  50 MSE is  1.1026895523459608
gamma u ave 0.06263214004837306
Step size is:  50 MSE is  1.103170853203941
gamma u ave 0.06263269491288147
Step size is:  50 MSE is  1.1031999905743612
gamma u ave 0.0626332493788651
Step size is:  50 MSE is  1.1032021860310715
gamma u ave 0.06263380348204378
Step size is:  50 MSE is  1.1032020453443412
gamma u ave 0.06263435722663788
Step size is:  50 MSE is  1.1032017333247575
gamma u ave 0.06263491061377091
Step size is:  50 MSE is  1.1032014408041075
gamma u ave 0.06263546364409965
Step size is:  50 MSE is  1.1032011807047872
gamma u ave 0.06263601631815247
Step size is:  50 MSE is  1.1032009503816105
gamma u ave 0.06263656863638965
Step size is:  50 MSE is  1.1032007465418194
gamma u ave 0.06263712059922513
Step size is:  50 MSE is  1.1032005663583622
gamma u ave 0.06263767220702815
Step size is:  50 MSE is  1.103200407400743
gamma u a

In [110]:
# Best k = 4, step_size = 400, lambda_opt1 = 2.8 
# k = 3 or 4 is the best, try different lambda_opt1
train_len = len(train)
k = 4
all_users_lst = list(all_users)
all_books_lst = list(all_books)
all_users_lst.sort()
all_books_lst.sort()

book_indices = defaultdict(int)
for book in userBooks[user]:
    book_ind = all_books_lst.index(book)
    book_indices[book] = book_ind

user_indices = defaultdict(int)
for user in bookUsers[book]:
    user_ind = all_users_lst.index(user)
    user_indices[user] = user_ind

gamma_u = ((np.random.rand(len(all_users_lst), k)) - 0.5) * 0.25
gamma_b = ((np.random.rand(len(all_books_lst), k)) - 0.5) * 0.25

step_size = 1 * 700; alpha = 0
lambda_opt1 = 4; lambda_opt2 = 1
print(step_size)
print(lambda_opt2)
print('Starting Descent...')
# print('gamma u', gamma_u[:6])
# print('gamma b', gamma_b[:6])

700
1
Starting Descent...


In [112]:
step_size = 1 * 400; alpha = 0
lambda_opt1 = 3.5; lambda_opt2 = 1
bu = defaultdict(lambda: 1)
bb = defaultdict(lambda: 1)
count = 0
overfit_count = 0
MSE = MSE_normal
prev_MSE = 10
for i in range(5000000):  
    count += 1
    # if prev_MSE < MSE_normal: step_size = step_size / 10
    rating_labels = []
    diff = 0
    for user, book, rating in val:
        if user in all_users_lst and book in all_books_lst: 
            user_ind = user_indices[user]
            book_ind = book_indices[book]
            mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
            user_rating = alpha + bu[user] + bb[book] + mult
        else:
            user_rating = alpha + bu[user] + bb[book];
        diff += (user_rating - int(rating)) ** 2
    print('Step size is: ', step_size, 'MSE is ', MSE)
    MSE = diff / len(val)
    cur_MSE = MSE
    
    if cur_MSE > prev_MSE and count > 10:
        overfit_count += 1
        if overfit_count >= 3:
            print('Probably overfitting now, breaking...')
            print('Loop count was:', i)
            break
        
    prev_MSE = cur_MSE
    for j in range(1):
        # Alpha
        alpha_sum = 0
        for user, book, _ in train:
            user_ind = user_indices[user]
            book_ind = book_indices[book]
            gamma_mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
            alpha_sum += userBookRatings[user][book] - (bu[user] + bb[book] + gamma_mult)
        alpha = alpha_sum / train_len

        # Beta_u
        for user in userRatings:
            bu_sum = 0
            for book in userBooks[user]:
                user_ind = user_indices[user]
                book_ind = book_indices[book]
                gamma_mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
                bu_sum += userBookRatings[user][book] - (alpha + bb[book] + gamma_mult)
            bu[user] = bu_sum / (lambda_opt1 + len(userBooks[user]))

        # Beta_b
        for book in bookUsers:
            bb_sum = 0
            for user in bookUsers[book]:
                user_ind = user_indices[user]
                book_ind = book_indices[book]
                gamma_mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
                bb_sum += userBookRatings[user][book] - (alpha + bu[user] + gamma_mult)
            bb[book] = bb_sum / (lambda_opt1 + len(bookUsers[book]))
        
    print('gamma u ave', sum([abs(i) for i in gamma_u[:,0]]) / len(gamma_u))
    for j in range(1):
        # Gamma_u
        for user_ind, user in enumerate(all_users_lst):
            gamma_u_sum = 0
            for book in userBooks[user]:
                book_ind = book_indices[book]
                pred1 = alpha + bu[user] + bb[book]
                pred2 = np.dot(gamma_u[user_ind], gamma_b[book_ind])
                rating = userBookRatings[user][book]
                err = rating - (pred1 + pred2)
                gamma_u_sum += err
            gradient = - 2 * gamma_u_sum * gamma_b[book_ind] / train_len
            gamma_u[user_ind] = gamma_u[user_ind] - step_size * gradient * lambda_opt2
        # Gamma_b
        for book_ind, book in enumerate(all_books_lst):
            gamma_b_sum = 0
            for user in bookUsers[book]:
                user_ind = user_indices[user]
                pred1 = alpha + bu[user] + bb[book]
                pred2 = np.dot(gamma_u[user_ind], gamma_b[book_ind])
                rating = userBookRatings[user][book]
                err = rating - (pred1 + pred2)
                gamma_b_sum += err
            gradient = - 2 * gamma_b_sum * gamma_u[user_ind] / train_len
            gamma_b[book_ind] = gamma_b[book_ind] - step_size * gradient * lambda_opt2

Step size is:  400 MSE is  1.1031931568630036
gamma u ave 0.07552353181257922
Step size is:  400 MSE is  5.0242209124888495
gamma u ave 0.0759823345926357
Step size is:  400 MSE is  1.1483633637362056
gamma u ave 0.07642689234507094
Step size is:  400 MSE is  1.1504274716992333
gamma u ave 0.07687747287845173
Step size is:  400 MSE is  1.1487905127397986
gamma u ave 0.07733289534959818
Step size is:  400 MSE is  1.1460901938387538
gamma u ave 0.07779096320398828
Step size is:  400 MSE is  1.1430456251298353
gamma u ave 0.07824717420607469
Step size is:  400 MSE is  1.139850838208792
gamma u ave 0.07869932134201846
Step size is:  400 MSE is  1.1366281157638665
gamma u ave 0.0791466208067447
Step size is:  400 MSE is  1.1334708756030212
gamma u ave 0.07958802147715097
Step size is:  400 MSE is  1.130447451546688
gamma u ave 0.0800220043504089
Step size is:  400 MSE is  1.1276037943067787
gamma u ave 0.08044713852654292
Step size is:  400 MSE is  1.1249672570449043
gamma u ave 0.080864026

Step size is:  400 MSE is  1.1024722833168816
gamma u ave 0.10620520286083569
Step size is:  400 MSE is  1.102466136774993
gamma u ave 0.10645280352900528
Step size is:  400 MSE is  1.102460044163784
gamma u ave 0.10670071707682398
Step size is:  400 MSE is  1.1024540073075377
gamma u ave 0.10694861988520933
Step size is:  400 MSE is  1.1024480281606905
gamma u ave 0.10719657408692021
Step size is:  400 MSE is  1.1024421088014615
gamma u ave 0.10744472160319955
Step size is:  400 MSE is  1.1024362514262955
gamma u ave 0.10769320800533684
Step size is:  400 MSE is  1.1024304583450142
gamma u ave 0.10794196523296243
Step size is:  400 MSE is  1.1024247319765856
gamma u ave 0.10819106253190248
Step size is:  400 MSE is  1.102419074845452
gamma u ave 0.10844021472795465
Step size is:  400 MSE is  1.1024134895784066
gamma u ave 0.10868941886044274
Step size is:  400 MSE is  1.1024079789018797
gamma u ave 0.10893895726495642
Step size is:  400 MSE is  1.1024025456396733
gamma u ave 0.1091886

In [113]:
rating_labels = []
diff = 0
for user, book, rating in val:
    if user in all_users_lst and book in all_books_lst: 
        user_ind = user_indices[user]
        book_ind = book_indices[book]
        mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
        user_rating = alpha + bu[user] + bb[book] + mult
        if user_rating > 5 or user_rating < 0:
            user_rating = alpha_normal + bu_normal[user] + bb_normal[book]
    else:
        user_rating = alpha + bu[user] + bb[book];
    diff += (user_rating - int(rating)) ** 2

MSE = diff / len(val)
print('MSE on the validation set', MSE)

MSE on the validation set 1.1024602750123518


In [114]:
with open("predictions_Rating.txt", 'w') as predictions:
    for l in open("pairs_Rating.txt"):
        if l.startswith("userID"):
            #header
            predictions.write(l)
            continue
        user, book = l.strip().split('-')
        user_ind = user_indices[user]
        book_ind = book_indices[book]
        mult = np.dot(gamma_u[user_ind], gamma_b[book_ind])
        user_rating = alpha + bu[user] + bb[book] + np.dot(gamma_u[user_ind], gamma_b[book_ind])
        if user_rating > 5 or user_rating < 0:
            user_rating = alpha_normal + bu_normal[user] + bb_normal[book]
        predictions.write(user + '-' + book + ',' + str(user_rating) + '\n')