In [1]:
import gzip
import random
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [2]:
def readGz(path):
    """Lazily yield one evaluated record per line of a gzip text file.

    Parameters
    ----------
    path : str
        Path of the gzip-compressed file; it is opened in text mode ('rt').

    Yields
    ------
    object
        The value obtained by evaluating each line as a Python expression.

    Notes
    -----
    The file is opened in a ``with`` block so the handle is released when
    the generator finishes or is closed, instead of being left open.
    """
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY NOTE(review): eval() executes whatever the line contains.
            # Only use this on trusted files; ast.literal_eval is the safer
            # choice if every line is a plain Python literal.
            yield eval(l)

def readCSV(path):
    """Lazily yield the comma-separated fields of each data row of a gzip CSV.

    Parameters
    ----------
    path : str
        Path of the gzip-compressed CSV; it is opened in text mode ('rt').

    Yields
    ------
    list of str
        The stripped line split on ','. No quoting or escaping is handled.

    Notes
    -----
    The first line is treated as a header and discarded. The file is opened
    in a ``with`` block so the handle is released when the generator
    finishes or is closed.
    """
    with gzip.open(path, 'rt') as f:
        f.readline()  # skip the header row
        for l in f:
            yield l.strip().split(',')

In [3]:
# Load every interaction row and split it positionally: the first
# `train_size` rows form the training split and all remaining rows form
# the validation split.
train_size = 190000
val_size   = 10000

data = list(readCSV("train_Interactions.csv.gz"))
train, val = data[:train_size], data[train_size:]
print(len(data), len(train), len(val), sep='\n')

200000
190000
10000


In [4]:
# Interaction lookups. The un-prefixed tables cover all of `data`; the
# `train_` / `val_` tables cover only the respective split.
booksReadBy = defaultdict(set)          # user -> books (all data)
train_booksReadBy = defaultdict(set)    # user -> books (train split)
val_booksReadBy = defaultdict(set)      # user -> books (val split)
usersReadBook = defaultdict(set)        # book -> users (all data)
val_usersReadBook = defaultdict(set)    # book -> users (val split)
train_usersReadBook = defaultdict(set)  # book -> users (train split)
val_all_books = set()
train_all_books = set()
all_books    = set()
all_users    = set()

for user, book, _ in data:
    all_books.add(book)
    all_users.add(user)
    usersReadBook[book].add(user)
    booksReadBy[user].add(book)


for user, book, _ in train:
    train_all_books.add(book)
    train_usersReadBook[book].add(user)
    train_booksReadBy[user].add(book)

for user, book, _ in val:
    val_all_books.add(book)
    val_usersReadBook[book].add(user)
    val_booksReadBy[user].add(book)

# Negative sampling: pair every validation user with one random book that
# the user has NOT read anywhere in `data`, labelled '-1'.
#
# The previous code drew with random.sample(all_books, 1), which returns a
# one-element *list*; testing that list for membership among book-id strings
# was always False, so already-read books were never rejected. Sampling
# directly from a set is also an error on Python 3.11+. Drawing a single id
# from a sorted list fixes both problems and makes the draw depend only on
# the random seed, not on set iteration order.
all_books_list = sorted(all_books)
val_unread = []

for user, book, _ in val:
    unread_book = random.choice(all_books_list)
    while unread_book in booksReadBy[user]:
        unread_book = random.choice(all_books_list)
    val_unread.append([user, unread_book, '-1'])

# NOTE: this rebinding is not idempotent; re-running the cell without
# re-running the split appends a further batch of negatives.
val = val + val_unread
print(len(val))
print(val[0:3])

20000
[['u35176258', 'b30592470', '3'], ['u30851063', 'b81941226', '3'], ['u31368414', 'b40097012', '5']]


In [5]:
# Rating lookups built from the training split (`train`) only.
allRatings = []  # every training rating, converted to int
# user -> book -> rating. Both levels are defaultdicts, so reading a missing
# (user, book) pair yields 0.0 AND inserts that entry as a side effect.
userBookRatings = defaultdict(lambda: defaultdict(float))
userRatings = defaultdict(list)  # user -> list of that user's ratings
userBooks   = defaultdict(set)   # user -> books that user rated
bookUsers   = defaultdict(set)   # book -> users who rated it


for user, book, rating in train:
    rating = int(rating)
    allRatings.append(rating)
    userRatings[user].append(rating)
    userBookRatings[user][book] = rating
    userBooks[user].add(book)
    bookUsers[book].add(user)
    
all_books_count = len(all_books)

# Mean rating over the training split, overall and per user.
globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for user in userRatings:
    userAverage[user] = sum(userRatings[user]) / len(userRatings[user])

In [6]:
# Interaction counts over the training split.
bookCount = defaultdict(int)  # book -> number of training interactions
userCount = defaultdict(int)  # user -> number of training interactions

for user, book, _ in train:
    userCount[user]  += 1
    bookCount[book]  += 1

# Both totals count one per training interaction.
total_books_read = total_users = len(train)

# (count, id) pairs ordered from the largest pair to the smallest.
mostPopular = sorted(
    ((n_reads, book_id) for book_id, n_reads in bookCount.items()),
    reverse=True,
)
mostPopularUsers = sorted(
    ((n_reads, user_id) for user_id, n_reads in userCount.items()),
    reverse=True,
)

# id -> count lookups that read as 0 for ids absent from the training split.
book_popularities = defaultdict(int)
for count, book in mostPopular:
    book_popularities[book] += count

user_popularities = defaultdict(int)
for count, user in mostPopularUsers:
    user_popularities[user] += count

def popular_books_set(mostPopular, threshold_ratio, total_reads=None):
    """Collect the leading books until they cover a share of all reads.

    Parameters
    ----------
    mostPopular : iterable of (count, book)
        Pairs in the order they should be consumed (the notebook passes
        them sorted from most to least read).
    threshold_ratio : float
        Fraction of ``total_reads`` that the cumulative count must exceed
        before the scan stops.
    total_reads : int, optional
        Total number of reads the ratio refers to. When omitted, the
        module-level ``total_books_read`` is used, which is what the
        function always relied on before this parameter existed.

    Returns
    -------
    set
        Every book consumed up to and including the one whose count pushed
        the running total strictly above ``total_reads * threshold_ratio``.
        If the threshold is never exceeded, all books are returned.
    """
    if total_reads is None:
        total_reads = total_books_read
    cutoff = total_reads * threshold_ratio

    popular = set()
    running_total = 0
    for book_count, book in mostPopular:
        running_total += book_count
        popular.add(book)
        if running_total > cutoff:
            break
    return popular

In [7]:
def sims_max_value(sims):
    """Return the largest similarity in ``sims``.

    Parameters
    ----------
    sims : sequence of numbers
        Similarity scores; may be empty.

    Returns
    -------
    number or bool
        ``max(sims)`` when there is at least one score, otherwise ``False``
        (which compares equal to 0 when used as a numeric feature).
    """
    # len() rather than a comparison with [] so that an empty tuple or other
    # empty sequence is handled the same way as an empty list.
    if len(sims) == 0:
        return False
    return max(sims)

# COSINE FUNCTIONS

In [8]:
# Re-read the interactions file and rebuild the rating lists and averages.
# This rebinds allRatings, userRatings, globalAverage and userAverage, which
# an earlier cell built from `train` only.
# NOTE(review): the whole file is used here, and the validation split is a
# slice of the same file (`val = data[train_size:]`), so these averages
# include validation rows — confirm that this is intended.
allRatings = []
userRatings = defaultdict(list)  # user -> list of ratings (whole file)
bookRatings = defaultdict(list)  # book -> list of ratings (whole file)

for user, book, rating in readCSV("train_Interactions.csv.gz"):
    rating = int(rating)
    allRatings.append(rating)
    userRatings[user].append(rating)
    bookRatings[book].append(rating)

globalAverage = sum(allRatings) / len(allRatings)
# Mean rating given by each user.
userAverage = {}
for user in userRatings:
    userAverage[user] = sum(userRatings[user]) / len(userRatings[user])
    
# Mean rating received by each book.
bookAverage = {}
for book in bookRatings:
    bookAverage[book] = sum(bookRatings[book]) / len(bookRatings[book])

def cos_sim_books(book1, book2):
    """Cosine similarity of two books over sign-encoded ratings.

    The comparison runs over every training user who read *either* book
    (the union of both reader sets). Each user contributes +1 for a book
    when the stored rating is above that user's ``userAverage`` and -1
    otherwise. Ratings are read from ``userBookRatings``; when that is the
    nested defaultdict built in this notebook, a user who has no rating for
    one of the books reads as 0.0 and the lookup inserts that entry.

    Returns 1 for identical ids, 0 when no user read either book, and
    otherwise the dot product of the two sign vectors divided by the
    product of their norms.
    """
    if book1 == book2:
        return 1

    readers = set(list(train_usersReadBook[book1]) + list(train_usersReadBook[book2]))
    if not readers:
        return 0

    dot_product = 0
    squared_norm1 = 0
    squared_norm2 = 0
    for reader in readers:
        sign1 = 1 if userBookRatings[reader][book1] > userAverage[reader] else -1
        sign2 = 1 if userBookRatings[reader][book2] > userAverage[reader] else -1
        dot_product += sign1 * sign2
        squared_norm1 += sign1 * sign1
        squared_norm2 += sign2 * sign2

    norm_product_sq = squared_norm1 * squared_norm2
    if norm_product_sq == 0:
        return 0
    return dot_product / (norm_product_sq ** 0.5)

def cos_sim_users(user1, user2):
    """Cosine similarity of two users over sign-encoded ratings.

    Candidate books are those either user read in the training split. A
    candidate is dropped when either user's stored rating for it equals 0;
    ratings are read from ``userBookRatings`` for both users for every
    candidate. Each remaining book contributes +1 for a user whose rating
    is above ``bookAverage[book]`` and -1 otherwise.

    Returns 1 for identical ids, 0 when no book remains, and otherwise the
    dot product of the two sign vectors divided by the product of their
    norms.
    """
    if user1 == user2:
        return 1

    candidates = set(list(train_booksReadBy[user1]) + list(train_booksReadBy[user2]))

    # Look up both ratings for every candidate, then discard the books that
    # have a zero rating on either side.
    zero_rated = set()
    for title in candidates:
        first_rating = userBookRatings[user1][title]
        second_rating = userBookRatings[user2][title]
        if first_rating == 0 or second_rating == 0:
            zero_rated.add(title)
    shared = candidates - zero_rated

    if not shared:
        return 0

    dot_product = 0
    squared_norm1 = 0
    squared_norm2 = 0
    for title in shared:
        sign1 = 1 if userBookRatings[user1][title] > bookAverage[title] else -1
        sign2 = 1 if userBookRatings[user2][title] > bookAverage[title] else -1
        dot_product += sign1 * sign2
        squared_norm1 += sign1 * sign1
        squared_norm2 += sign2 * sign2

    norm_product_sq = squared_norm1 * squared_norm2
    if norm_product_sq == 0:
        return 0
    return dot_product / (norm_product_sq ** 0.5)

def calc_cosine_books(user, book_predict, mode = 'train'):
    """Best book-to-book cosine similarity between ``book_predict`` and the
    books ``user`` read in the training split.

    When ``mode`` is 'train' the target book itself is left out of the
    comparison; for any other mode it is compared like every other book.
    The list of similarities is reduced with ``sims_max_value``.
    """
    exclude_target = (mode == 'train')
    cosine_sims = [
        cos_sim_books(other_book, book_predict)
        for other_book in train_booksReadBy[user]
        if not (exclude_target and other_book == book_predict)
    ]
    return sims_max_value(cosine_sims)

def calc_cosine_users(user_predict, book, mode = 'train'):
    """Best user-to-user cosine similarity between ``user_predict`` and the
    users who read ``book`` in the training split.

    When ``mode`` is 'train' the target user is left out of the comparison;
    for any other mode they are compared like every other user. The list of
    similarities is reduced with ``sims_max_value``.
    """
    exclude_target = (mode == 'train')
    cosine_sims = [
        cos_sim_users(other_user, user_predict)
        for other_user in train_usersReadBook[book]
        if not (exclude_target and other_user == user_predict)
    ]
    return sims_max_value(cosine_sims)

# JACCARD FUNCTIONS

In [9]:
def jaccard(set1, set2):
    """
    Jaccard similarity of set1 and set2: the size of their
    intersection divided by the size of their union, or 0
    when the union is empty.
    """
    shared_count = len(set1.intersection(set2))
    combined_count = len(set1.union(set2))
    return shared_count / combined_count if combined_count != 0 else 0

def calc_jac_books(user, book_predict, mode = 'train'):
    """Best Jaccard similarity between ``book_predict`` and the books
    ``user`` read in the training split.

    Two books are compared through the sets of training users who read
    them. When ``mode`` is 'train' the target book itself is skipped; for
    any other mode it is compared like every other book.

    Returns
    -------
    number or bool
        The result of ``sims_max_value`` over the collected similarities.
        A single value is returned, as ``calc_jac_users`` does; the earlier
        ``(..., len(sims))`` return referenced an undefined name and raised
        NameError.
    """
    # The readers of the target book do not change inside the loop.
    users_read_book_predict = train_usersReadBook[book_predict]
    jac_sims = []
    for users_book in train_booksReadBy[user]:
        if users_book == book_predict and mode == 'train':
            continue
        users_read_users_book = train_usersReadBook[users_book]
        jac_sims.append(jaccard(users_read_book_predict, users_read_users_book))
    return sims_max_value(jac_sims)

def calc_jac_users(user_predict, book, mode = 'train'):
    """Best Jaccard similarity between ``user_predict`` and the users who
    read ``book`` in the training split.

    Two users are compared through the sets of books they read in the
    training split. When ``mode`` is 'train' the target user is skipped;
    for any other mode they are compared like every other user. The list
    of similarities is reduced with ``sims_max_value``.
    """
    exclude_target = (mode == 'train')
    jac_sims = [
        jaccard(train_booksReadBy[user_predict], train_booksReadBy[other_user])
        for other_user in train_usersReadBook[book]
        if not (exclude_target and other_user == user_predict)
    ]
    return sims_max_value(jac_sims)

def calc_jac_users_alt(user_predict, book, mode = 'train'):
    """Return 1 if one of the three users most similar to ``user_predict``
    (by Jaccard similarity of training reading sets) read ``book``, else 0.

    Candidates are the training readers of ``book``; when ``mode`` is
    'train' the target user is skipped. Returns 0 when there are no
    candidates.

    The earlier loop iterated over ``(similarity, user)`` tuples but used
    the whole tuple as the key into ``train_booksReadBy``, so the lookup
    always produced an empty set (and inserted a junk key), and the function
    always returned 0. The tuple is now unpacked.

    NOTE(review): every candidate comes from ``train_usersReadBook[book]``,
    so with the lookup fixed the result is 1 whenever at least one candidate
    exists — confirm that this is the intended signal.
    """
    books_of_target = train_booksReadBy[user_predict]
    jac_sims = []
    for user in train_usersReadBook[book]:
        if user == user_predict and mode == 'train':
            continue
        jac_sim = jaccard(books_of_target, train_booksReadBy[user])
        jac_sims.append((jac_sim, user))
    if len(jac_sims) == 0: return 0
    jac_sims.sort(reverse = True)
    for _sim, neighbour in jac_sims[:3]:
        if book in train_booksReadBy[neighbour]: return 1
    return 0

# TRAINING

In [10]:
train_LR = train
train_unread = []

# Negative sampling for training: pair every training user with one random
# book the user has not read anywhere in `data`, labelled '-1'.
#
# random.sample(all_books, 1) returned a one-element list, so the membership
# test against the user's book ids was always False and already-read books
# were never rejected; sampling from a set is also an error on Python 3.11+.
# A single id is now drawn from a sorted list and compared directly.
all_books_list = sorted(all_books)
for user, book, _ in train:
    unread_book = random.choice(all_books_list)
    while unread_book in booksReadBy[user]:
        unread_book = random.choice(all_books_list)
    train_unread.append([user, unread_book, '-1'])

In [14]:
# Largest interaction counts, used to scale popularity into [0, 1].
max_popularity_book = mostPopular[0][0]
max_popularity_user = mostPopularUsers[0][0]
def feature(datum, mode = 'train'):
    """Build the feature vector for one (user, book, label) triple.

    Parameters
    ----------
    datum : sequence of three items
        ``(user, book, label)``; the label is ignored here.
    mode : str
        Forwarded to the similarity helpers ('train' makes them skip the
        target itself).

    Returns
    -------
    list
        ``[1, in_popular_set, scaled_book_popularity, jac_sim_book,
        jac_sim_user]``.

    Notes
    -----
    Reads the module-level ``return1`` (the popular-book set), which must
    be defined before this function is called.

    The third entry previously repeated ``pop_val`` instead of the scaled
    popularity that had just been computed. Any regularisation value tuned
    on the old features should be re-validated.
    """
    user, book, _ = datum
    feat = [1]  # constant offset term

    pop_val = (book in return1) * 1
    feat.append(pop_val)

    book_pop = book_popularities[book] / max_popularity_book
    feat.append(book_pop)

    jac_sim_book = calc_jac_books(user, book, mode)
    feat.append(jac_sim_book)

    jac_sim_user = calc_jac_users(user, book, mode)
    feat.append(jac_sim_user)

    # Currently disabled features: book/user cosine similarity, Pearson
    # similarity and scaled user popularity.
    return feat

In [None]:
# Popular-book set read by feature(): the leading books that cover just
# over 64% of all training reads.
return1 = popular_books_set(mostPopular, 0.64)

# Positives followed by the sampled negatives.
train_LR = train + train_unread

LR_feat_train = []
print('here')
loop_count = 0
for loop_count, d in enumerate(train_LR, start=1):
    # Progress marker every 10000 rows, printed before the row is processed.
    if loop_count % 10000 == 0:
        print(loop_count, end = ', ')
    LR_feat_train.append(feature(d, mode = 'train'))

# A row is a positive example when its rating is non-negative
# (sampled negatives carry the rating '-1').
LR_labels_train = [int(row_rating) >= 0 for _user, _book, row_rating in train_LR]

here
10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000, 170000, 

In [None]:
# Feature matrix and boolean labels for the validation rows. A row is a
# positive example when its rating is non-negative.
LR_feat_val   = np.array([feature(row, mode = 'test') for row in val])
LR_labels_val = [int(row_rating) >= 0 for _user, _book, row_rating in val]

In [None]:
# Read the user-book pairs to predict. Each data line is split on '-' into
# exactly a user and a book, so there is no label to read.
test_set = []
with open("pairs_Read.txt") as pairs_file:
    for l in pairs_file:
        if l.startswith("userID"): # it's just the header
            continue
        user, book = l.strip().split('-') # it is a datapoint
        # The third slot used to be filled from the global `_`, i.e. whatever
        # an earlier loop or cell output happened to leave behind. feature()
        # ignores that slot, so an explicit None placeholder is stored.
        test_set.append((user, book, None))

X_test   = np.array([feature(d, 'test') for d in test_set])
# The pairs carry no labels; the earlier y_test was derived from the stale
# `_` value and was meaningless. The name is kept defined for later cells.
y_test   = None

# LOGISTIC REGRESSION

In [None]:
# Balanced Error Rate function
def balanced_error_rate(pred, labels):
    """Return ``(accuracy, balanced error rate)`` for binary predictions.

    ``pred`` and ``labels`` are equal-length array-likes interpreted by
    truthiness. The balanced error rate is one minus the mean of the
    true-positive rate and the true-negative rate.
    """
    not_pred = np.logical_not(pred)
    not_labels = np.logical_not(labels)

    # Confusion-matrix counts.
    TP = sum(np.logical_and(pred, labels))
    FP = sum(np.logical_and(pred, not_labels))
    TN = sum(np.logical_and(not_pred, not_labels))
    FN = sum(np.logical_and(not_pred, labels))

    true_positive_rate = TP / (TP + FN)
    true_negative_rate = TN / (TN + FP)

    acc = (TP + TN) / (TP + FP + TN + FN)
    BER = 1 - 0.5 * (true_positive_rate + true_negative_rate)
    return acc, BER

In [None]:
# Display the first ten training feature vectors as a quick sanity check.
LR_feat_train[:10]

In [None]:
# Sweep the LogisticRegression parameter C over 40 log-spaced values from
# 1e-5 to 1e5, recording validation accuracy and balanced error rate.
C_values = np.logspace(-5, 5, num = 40)
accs = []
BERs = []
loop_count = 0
for C_value in C_values:
    loop_count += 1
    if loop_count % 10 == 0: print(loop_count, end = ', ')
    clf = LogisticRegression(C = C_value).fit(LR_feat_train, LR_labels_train)
    val_predictions = clf.predict(LR_feat_val)
    acc, BER = balanced_error_rate(val_predictions, LR_labels_val)
    accs.append(acc)
    BERs.append(BER)

# The x-axis shows C itself (it was previously labelled 'lambda value'),
# and both plotted curves are named on the y-axis.
plt.plot(C_values, accs, label='Validation accuracy')
plt.plot(C_values, BERs, label='Validation BER')
plt.ylabel('Validation accuracy / BER')
plt.xlabel('C value')
plt.xscale('log')
plt.title('Validation accuracy and BER vs C values')
plt.legend()
plt.show()

# First C that reaches the highest validation accuracy.
indx = accs.index(max(accs))
print('\nC value for highest accuracy is:', C_values[indx])
print('Best validation accuracy is:', accs[indx])

In [None]:
# Scratch notes on C values from earlier feature combinations. The bare
# literal below only echoes a value; the same literal is passed as C in the
# final LogisticRegression fit.
0.006614740641230145
# C = 0.013433993325989001 # First 5 with 2 cosines, no book_pop, no aves
# C = 0.007227271320676181 # First 5 with only books_cosine, no aves (BEST, 0.68850 acc, 500? slack)
# C = 0.017012542798525893 # First 6 with cosine no aves (2nd BEST, 0.68716 acc, 1250 slack)

# TRY the below, then work on the BEST 3
# C = 0.004923882631706742 # First 6 with aves, no cosines
# C = ?                    # First 8 cosine aves & pearson excluded
# C = ?                    # First 10 only pearson excluded
# C = ?                    # Everything


In [None]:
# C = 0.013433993325989001 First 5

In [None]:
# Final classifier, fitted with a fixed C value.
clf = LogisticRegression(C = 0.006614740641230145).fit(LR_feat_train, LR_labels_train)
# clf = LogisticRegression(C = C_values[indx]).fit(LR_feat_train, LR_labels_train)
def predict_datapoint_new(user, book_predict):
    """Predict with the fitted ``clf`` whether ``user`` read ``book_predict``.

    Returns the array produced by ``clf.predict`` for the single feature
    row (one element).

    The label slot of the datum used to be filled from the global ``_``,
    which only exists as a leftover of earlier loops or cell outputs.
    ``feature`` ignores that slot, so ``None`` is passed explicitly.
    """
    feat = np.array(feature((user, book_predict, None), mode = 'test'))
    feat = feat.reshape(1, -1)  # one sample, all features
    prediction = clf.predict(feat)
    return prediction

# WRITING PREDICTIONS TO FILE

In [None]:
# Signed decision-function score of the fitted classifier for every test
# row; used afterwards to rank the test pairs.
confidence_scores = clf.decision_function(X_test)

In [None]:
# Number of test rows the classifier labels positive.
pos_count = sum(clf.predict(X_test))
print(pos_count)

# Extra pairs to promote to positive beyond the classifier's own count.
slack = 0

# (score, (user, book)) entries ordered from the largest to the smallest.
conf_with_labels = sorted(
    zip(confidence_scores, ((row[0], row[1]) for row in test_set)),
    reverse = True,
)

# The leading pos_count + slack pairs are positives, the rest negatives.
positives = [pair for _score, pair in conf_with_labels[:pos_count + slack]]
negatives = [pair for _score, pair in conf_with_labels[pos_count + slack:]]
print(len(positives), len(negatives), sep='\n')

In [None]:
# Write one prediction line per requested pair: 1 if the pair was ranked
# among the positives, 0 otherwise. The header line is copied through.
positive_pairs = set(positives)  # set membership instead of a list scan per line
count = 0  # number of pairs written as positive
# The input is opened first so that a missing pairs file does not leave an
# empty, truncated predictions file behind; both handles are closed on exit.
with open("pairs_Read.txt") as pairs_file, \
        open("predictions_Read.txt", 'w') as predictions:
    for l in pairs_file:
        if l.startswith("userID"): # it's just the header
            predictions.write(l)
            continue
        user, book = l.strip().split('-') # it is a datapoint
        if (user, book) in positive_pairs:
            count += 1
            predictions.write(user + '-' + book + ",1\n")
        else:
            predictions.write(user + '-' + book + ",0\n")

In [None]:
# with open('predictions_Read.txt', 'w') as predictions:
#     for l in open("pairs_Read.txt"):
#         if l.startswith("userID"): # it's just the header
#             predictions.write(l)
#             continue
#         user, book = l.strip().split('-') # it is a datapoint
#         prediction = predict_datapoint_new(user, book)
#         if prediction:
#             predictions.write(user + '-' + book + ",1\n")
#         else:
#             predictions.write(user + '-' + book + ",0\n")

# PEARSON FUNCTIONS

In [13]:
# Rebuild the rating lookups from the training split (`train`). This
# rebinds names that earlier cells also define (allRatings, userRatings,
# userBookRatings, userBooks, bookUsers); the averages are not recomputed.
allRatings = []
# user -> book -> rating. Reading a missing (user, book) pair yields 0.0 and
# inserts that entry as a side effect, because both levels are defaultdicts.
userBookRatings = defaultdict(lambda: defaultdict(float))
userRatings = defaultdict(list)  # user -> list of that user's ratings
userBooks   = defaultdict(set)   # user -> books that user rated
bookUsers   = defaultdict(set)   # book -> users who rated it

for user, book, rating in train:
    rating = int(rating)
    allRatings.append(rating)
    userRatings[user].append(rating)
    userBookRatings[user][book] = rating
    userBooks[user].add(book)
    bookUsers[book].add(user)

def pearson_sim(book1, book2):
    """Pearson-style correlation between the ratings of two books.

    The correlation runs over every training user who read *either* book
    (the union of both reader sets, despite the variable name). Ratings
    come from ``userBookRatings``; with the nested defaultdict built in
    this notebook, a user without a rating for one of the books contributes
    0.0 and the lookup inserts that entry.

    Returns 1 for identical ids, 0 when the union is empty or either
    variance term is zero. When the correlation is exactly 1 the value
    ``min(1, 0.2 * number_of_users)`` is returned instead, which shrinks
    the score when fewer than five users are involved.
    """
    if book1 == book2: return 1
    # Union of the two reader sets.
    users_read_both = list(train_usersReadBook[book1]) + list(train_usersReadBook[book2])
    users_read_both = set(users_read_both)
    # Accumulators for the rating sums; turned into means below.
    book1_ratings = 0
    book2_ratings = 0
    # Disabled filter that would have dropped users with a zero rating for
    # either book (i.e. restricted the set to users who rated both):
#     zero_users = set()
#     for user in users_read_both:
#         if userBookRatings[user][book1] == 0: zero_users.add(user)
#         if userBookRatings[user][book2] == 0: zero_users.add(user)
#     for zero_user in zero_users:
#         users_read_both.remove(zero_user)

    if len(users_read_both) == 0: return 0

    for user in users_read_both:
        book1_ratings += userBookRatings[user][book1]
        book2_ratings += userBookRatings[user][book2]
    
    # Mean rating of each book over the union of readers.
    book1_ratings = book1_ratings / len(users_read_both)
    book2_ratings = book2_ratings / len(users_read_both)
    numerator_sum = 0; denom_sum1 = 0; denom_sum2 = 0
    for user in users_read_both:
        # Deviation of each rating from the corresponding mean.
        arg1 = (userBookRatings[user][book1] - book1_ratings)
        arg2 = (userBookRatings[user][book2] - book2_ratings)
        # print(arg1, arg2)
        numerator_sum += arg1 * arg2
        denom_sum1 += arg1 ** 2
        denom_sum2 += arg2 ** 2
    denom = denom_sum1 * denom_sum2
    # print(numerator_sum)
    # print(denom_sum1, denom_sum2)
    if denom == 0: return 0
    # This local name shadows the function name inside the body only.
    pearson_sim = numerator_sum / (denom ** 0.5)
    
    if pearson_sim == 1: # Probably very low number of books in common
        return min(1, 0.2 * len(users_read_both))
    return pearson_sim

def calc_pearson(user, book_predict, mode):
    """Best Pearson similarity between ``book_predict`` and the books
    ``user`` read in the training split.

    If the target book is met while scanning the user's books, it is
    skipped when ``mode`` is 'train'; for any other mode the function
    returns 1 immediately. Otherwise the collected similarities are
    reduced with ``sims_max_value``.
    """
    scores = []
    for candidate in train_booksReadBy[user]:
        if candidate != book_predict:
            scores.append(pearson_sim(candidate, book_predict))
        elif mode != 'train':
            return 1
    return sims_max_value(scores)