In [333]:
import os
import sys
import numpy as np
from numpy import genfromtxt
import pandas as pd
import scipy
import math
import random
from scipy import sparse
from collections import defaultdict

# Custom libraries
sys.path.append('../Util')
from loader import get_books, get_book_dataframe, get_book_features
from joiner import get_ratings, get_joint, load_amazon, load_goodreads
from reduction import reduce_matrix, get_sparse

In [334]:
def map_user_to_features(p, features):
    p_sparse = scipy.sparse.csr_matrix(p)
    # map new user to concept space by p*features
    user_to_concept = p_sparse.dot(features)
    # map user back to itme space with user_to_concept * featuresT
    result = user_to_concept.dot(features.T).todense()
    return result.T

In [335]:
def get_predictions(p, q, user_bias, item_bias, global_bias):
    pred_ratings = np.zeros(len(q))
    for i in range(len(q)):
        pred = global_bias + user_bias + item_bias[i] + np.dot(p, q[i])
        # pred = global_bias + user_bias + np.dot(p, q[i])
        pred_ratings[i] = pred
    return pred_ratings

In [336]:
def get_top_n_recs(result, books, n, q):
    recs = []
    for i in range(len(result)):
        if q[i] == 0: # book user hasn't already rated
            recs.append((i, result[i]))
        else:
            recs.append((i, float('-inf'))) 
            # recs.append((i, result[i])) #leave this to verify things actually working
    recs = sorted(recs, key=lambda tup: tup[1], reverse=True)

    top_titles = []
    for i in range(n):
        book_id = recs[i][0]
        title = books.iloc[book_id]['title']
        top_titles.append(title)
    return top_titles

In [337]:
# Set this to where you save and load all data
data_path = '../../goodbooks-10k/'

In [338]:
# Get dataframe from books
books = get_book_dataframe(data_path)

found books_dataframe in file...


In [339]:
books.iloc[0]['title']

'The Hunger Games (The Hunger Games, #1)'

In [813]:
# cu2rec components
filename = '../.tmp/goodbooks_sorted_f300'
q = genfromtxt('{}_q.csv'.format(filename), delimiter=',')
item_bias = genfromtxt('{}_item_bias.csv'.format(filename), delimiter=',')


# surprise components
# filename = '../.tmp/svd_100_300.npy'
# q = np.load(filename)
# filename = '../.tmp/Q_300.npy'
# q = np.load(filename)
# filename = '../.tmp/item_bias_300.npy'
# item_bias = np.load(filename)

In [814]:
# convert global bias to float - get from whatever dataset you used
global_bias = 3.919866

In [815]:
# user from goodreads
# sparse_new_user_scaled = scipy.sparse.load_npz('../.tmp/cached_users/user_likes_mystery_scifi_hates_fantasy.npz')
# sparse_new_user_scaled = scipy.sparse.load_npz('../.tmp/cached_users/user_likes_fantasy.npz')
sparse_new_user_scaled = scipy.sparse.load_npz('../.tmp/cached_users/user_nickgreenquist.npz')
new_user_ratings_scaled = sparse_new_user_scaled.toarray()
new_user_ratings_scaled = np.array(new_user_ratings_scaled[0].tolist())
new_user_ratings = np.copy(new_user_ratings_scaled)

In [816]:
# undo the rating mapping we usually do

# Turn 1-5 rating scale into negative - positive scale
# original mapper: ratings_mapper = {0:0, 1:-2, 2:-1, 3:1, 4:2, 5:3}
ratings_mapper = {0:0, -2:-1, -1:-2, 1:3, 2:4, 3:5}
for i in range(len(q)):
    new_user_ratings[i] = ratings_mapper[new_user_ratings_scaled[i]]
new_user_ratings

array([5, 5, 0, ..., 0, 0, 0])

In [817]:
# create array of indices of books this user has actually rated
indices = []
for i in range(len(new_user_ratings)):
    if new_user_ratings[i] != 0:
        indices.append(i)
len(indices)

202

In [818]:
# Hyperparams
learning_rate = 0.07
user_bias_reg = 0.0
P_reg = 0.0

# updates per rated book
iterations = 5

 # how many iterations to see the total loss at this step - remove in webapp!
calculate_total_loss = 1

n_factors = q.shape[1]
cols = q.shape[0]

In [819]:
# 1. set the user_bias for this user
new_user_bias = 0

In [820]:
# 2. set up new random P
mu, sigma = 0, 0.1
p = np.random.normal(mu, (sigma / n_factors), n_factors)

In [821]:
# 3. computer small number of iterations of SGD
for iteration in range(iterations):
    
    #= periodically calculate total loss and output
    if iteration == 0 or iteration == iterations - 1 or iteration % calculate_total_loss == 0:
        total_loss = 0.0
        for i in indices:
            rating = new_user_ratings[i]
            pred = global_bias + new_user_bias + item_bias[i] + np.dot(p, q[i])
            # pred = global_bias + new_user_bias + np.dot(p, q[i])
            error = rating - pred
            total_loss += pow(error, 2)
            
        last_rmse = rmse
        rmse = math.sqrt(total_loss / len(indices))
        print("RMSE at Iteration {}: {}".format(iteration, rmse))
        
        # Update learning rate if needed
        if last_rmse < rmse:
            current_patience -=1
        if current_patience <= 0:
            current_patience = max_patience
            learning_rate *= learning_rate_decay

            print("New Learning Rate: {}", learning_rate)

    # Gradient Descent using every book - ucomment below to go back to SGD
    # i = random.choice(indices)
    for i in indices:
    
        # calculate loss on random item
        rating = new_user_ratings[i]
        pred = global_bias + new_user_bias + item_bias[i] + np.dot(p, q[i])
        # pred = global_bias + new_user_bias + np.dot(p, q[i])
        error = rating - pred

        # update P
        for f in range(n_factors):
            p_update = learning_rate * (error * q[i][f] - P_reg * p[f])
            p[f] += p_update

        # update user bias
        ub_update = learning_rate * (error - user_bias_reg * new_user_bias)
        new_user_bias += ub_update


RMSE at Iteration 0: 1.770656185694843
RMSE at Iteration 1: 1.521928105467638
RMSE at Iteration 2: 1.4149172616138068
RMSE at Iteration 3: 1.3436697781979676
RMSE at Iteration 4: 1.288625839338802


In [822]:
# get predictions using partial fit
predictions_partial_fit = get_predictions(p, q, new_user_bias, item_bias, global_bias)

In [823]:
recs_partial_fit = get_top_n_recs(predictions_partial_fit, books, 25, new_user_ratings)
for rec in recs_partial_fit:
    print(rec)

The Road to Serfdom
Feeling Good: The New Mood Therapy
The Final Empire (Mistborn, #1)
The Heretic Queen
The Luxe (Luxe, #1)
Hope: A Memoir of Survival in Cleveland
The Blade Itself (The First Law, #1)
No One Here Gets Out Alive
The Raw Shark Texts
The Troop
Laskar Pelangi (Tetralogi Laskar Pelangi, #1)
Not Without My Daughter
The Mortal Instruments Boxed Set: City of Bones; City of Ashes; City of Glass (The Mortal Instruments, #1-3)
The Meaning of Marriage: Facing the Complexities of Commitment with the Wisdom of God
The Scent of Rain and Lightning
Falling into Place
Night World, No. 2 (Night World, #4-6)
Any Human Heart
The Sculptor
Lord of Scoundrels (Scoundrels, #3)
The King of Attolia (The Queen's Thief, #3)
The Queen of the Damned (The Vampire Chronicles, #3)
Beautiful Creatures (Caster Chronicles, #1)
Gone Girl
Trinity


In [824]:
'''

Combine recs from partial fit with recs from mapping to feature matrix using log_rank

'''

'\n\nCombine recs from partial fit with recs from mapping to feature matrix using log_rank\n\n'

In [825]:
# produce feature matrix
feature_matrix = get_book_features(books)
feature_matrix.shape

feature_matrix exists in file...


(10000, 82203)

In [826]:
# get predictions using feature matrix
predictions_features = map_user_to_features(new_user_ratings, feature_matrix)

In [827]:
'''
Log Ranking
'''

'\nLog Ranking\n'

In [828]:
# create tuple of book_id and rating for each method, then sort
partial_fit_ratings = []
feature_ratings = []
for i in range(len(books)):
    partial_fit_ratings.append((i, predictions_partial_fit[i]))
    feature_ratings.append((i, predictions_features[i]))

partial_fit_ratings = sorted(partial_fit_ratings, key=lambda x: x[1], reverse=True)
feature_ratings = sorted(feature_ratings, key=lambda x: x[1], reverse=True)

In [829]:
# map book_id to the rank for each method
id_to_rank_partial_fit = {}
id_to_rank_features = {}
for i in range(len(books)):
    book_id = partial_fit_ratings[i][0]
    id_to_rank_partial_fit[book_id] = math.log(i+1)

    book_id = feature_ratings[i][0]
    id_to_rank_features[book_id] = math.log(i+1)

In [830]:
weight_feature = 0.5

rankings = []
for i in range(len(books)):
    if new_user_ratings[i] == 0:
        rank = weight_feature*id_to_rank_features[i] + (1.0-weight_feature)*id_to_rank_partial_fit[i]
        rankings.append((rank, i))
rankings = sorted(rankings, key=lambda x: x[0])
print(len(rankings))

9798


In [831]:
top_books = []
for i in range(25):
    book_id = rankings[i][1]
    book = books.iloc[book_id] # index is book_id - 1
    book['rank'] = i + 1
    top_books.append(book)

In [832]:
for book in top_books:
    print(book['title'])

The Raw Shark Texts
The Final Empire (Mistborn, #1)
The Book of Three (The Chronicles of Prydain, #1)
A Wrinkle in Time (A Wrinkle in Time Quintet, #1)
Un Lun Dun
The King of Attolia (The Queen's Thief, #3)
J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings
The Road to Serfdom
The Blade Itself (The First Law, #1)
The Call of the Wild
Library of Souls (Miss Peregrine's Peculiar Children, #3)
The Lost World (Professor Challenger, #1)
Dandelion Wine (Green Town, #1)
Feeling Good: The New Mood Therapy
My Ántonia
Rebecca
The Prestige
The Luxe (Luxe, #1)
Incarceron (Incarceron, #1)
Hope: A Memoir of Survival in Cleveland
His Dark Materials (His Dark Materials #1-3)
Palace of Stone (Princess Academy, #2)
Tarzan of the Apes (Tarzan, #1)
I Capture the Castle
The Heretic Queen
