# Building Amazon Models

## Author: Andrew Lim

In [166]:
# Importing
import feather
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from keras import backend as k

In [2]:
# Reading data
df = feather.read_dataframe("../results/tidy_data.feather")
df.head()

Unnamed: 0,reviewer_id,asin,reviewer_name,review_text,overall,summary,unixReviewTime,num_found_helpful,num_found_unhelpful
0,A2HD75EMZR8QLN,700099867,123,Installing the game was a struggle (because of...,1.0,Pay to unlock content? I don't think so.,1341792000,8,12
1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""",If you like rally cars get this game you will ...,4.0,Good rally game,1372550400,0,0
2,A1INA0F5CWW3J4,700099867,"Amazon Shopper ""Mr.Repsol""",1st shipment received a book instead of the ga...,1.0,Wrong key,1403913600,0,0
3,A1DLMTOTHQ4AST,700099867,ampgreen,"I got this version instead of the PS3 version,...",3.0,"awesome game, if it did not crash frequently !!",1315958400,7,10
4,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""",I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,DIRT 3,1308009600,2,2


In [59]:
rating_df = df[['reviewer_id', 'asin', 'overall']]
small_rating_df = rating_df.head()
small_rating_df

Unnamed: 0,reviewer_id,asin,overall
0,A2HD75EMZR8QLN,700099867,1.0
1,A3UR8NLLY1ZHCX,700099867,4.0
2,A1INA0F5CWW3J4,700099867,1.0
3,A1DLMTOTHQ4AST,700099867,3.0
4,A361M14PU2GUEG,700099867,4.0


In [40]:
# Obtaining number of reviewers and games reviewed:
n_reviewers = len(rating_df.reviewer_id.unique())
n_games = len(rating_df.asin.unique())

print("There are", n_reviewers, "reviewers")
print("There are", n_games, "games reviewed")

There are 24303 reviewers
There are 10672 games reviewed


In [41]:
# Creating map to map from reviewer id/asin(product id) to a number:
reviewer_map = dict(zip(np.unique(rating_df.reviewer_id), list(range(n_reviewers))))
game_map = dict(zip(np.unique(rating_df.asin), list(range(n_games))))

In [45]:
# Obtain numbers(index in sparse matrix) associated with each id:
reviewer_index = np.array([reviewer_map[reviewer_id] for reviewer_id in rating_df.reviewer_id])
game_index = np.array([game_map[asin] for asin in rating_df.asin])

In [49]:
# Obtain ratings to be put into sparse matrix:
ratings = np.array(rating_df.overall)

In [55]:
# Create sparse X matrix:
X = coo_matrix((ratings, (reviewer_index, game_index)), shape=(n_reviewers, n_games))

In [104]:
# Performing sanity check:
person_of_interest = reviewer_map['A361M14PU2GUEG']
game_of_interest = game_map['0700099867']

X.toarray()[person_of_interest, game_of_interest]

4.0

Now, the rating data is in a sparse matrix where the rows are users and the columns are movies. Note that all values of 0 for ratings are actually missing reviews. Reviewers can not give something 0 stars. This is because `coo_matrix` by default fills in missing values with 0.

In [106]:
def get_score(X1, X2):
    return np.nanmean((X1 - X2)**2)

# Making models:

In [95]:
num_latent_features = 10

svd = TruncatedSVD(n_components=num_latent_features)

U = svd.fit_transform(X) # User values of latent features
V = svd.components_ # Game values of latent features

In [99]:
X_pred = U@V

In [105]:
# Sanity Check:
X_pred[person_of_interest, game_of_interest]

0.030958221262283765

In [109]:
X_array = X.toarray()
X_array[X_array==0] = np.nan

In [113]:
get_score(X_array, X_pred)

16.958328448879598

The above is pretty bad because we did not properly handle the 0 values. We've included them within our model which is a bad thing! We should not train on these zero values. 

In [131]:
X_centered = X_array - np.nanmean(X_array)

In [187]:
n_iters = 5
alpha = 0.001

U = np.random.randn(n_reviewers, num_latent_features) * 1e-5
V = np.random.randn(num_latent_features, n_games) * 1e-5

k_U = k.variable(U)
k_V = k.variable(V)
k_X_centered = k.variable(X_centered)

for i in range(n_iters):
    # Obtain predictions:
    k_Xhat = k.dot(keras_U, keras_V)   
        
    # Obtain residual
    resid = k_X_centered - k_Xhat
#     resid[np.isnan(resid)] = 0
#     print(np.sum(k.eval(resid)**2))
    
    dU = k.dot(resid, k.transpose(k_V)) # result [n_reviewers x 10]
    dV = k.dot(k.transpose(k_U), resid)
    
    k_U = k_U - dU*alpha
    k_V = k_V - dV*alpha
    
X_pred = U@V
    

In [192]:
n_iters = 5
alpha = 0.001

U = np.random.randn(n_reviewers, num_latent_features) * 1e-5
V = np.random.randn(num_latent_features, n_games) * 1e-5

k_U = k.variable(U)
k_V = k.variable(V)
k_X_centered = k.variable(X_centered)

for i in range(n_iters):
    # Obtain predictions:
    k_Xhat = k.dot(keras_U, keras_V)   
        
    # Obtain residual
    resid = k_X_centered - k_Xhat
#     resid[np.isnan(resid)] = 0
    print(np.sum(k.eval(resid)**2))
    
    dU = k.dot(resid, k.transpose(k_V)) # result [n_reviewers x 10]
    dV = k.dot(k.transpose(k_U), resid)
    
    k_U = k_U - dU*alpha
    k_V = k_V - dV*alpha

ValueError: GraphDef cannot be larger than 2GB.

In [201]:
k.eval(k.variable(X_array))

ValueError: GraphDef cannot be larger than 2GB.