In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from surprise import KNNBaseline, SVDpp
from surprise.model_selection import GridSearchCV, cross_validate
from surprise import Dataset, Reader
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, RobustScaler, StandardScaler, MinMaxScaler

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
import math
import tensorflow.compat.v1 as tf
from sklearn.decomposition import TruncatedSVD

In [2]:
df = pd.read_csv('Data/Cleaned_Data/book_user_explicit_rating_cleaned.csv',encoding='UTF-8')
df = df.drop(columns=['index'])
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop(columns=['Image_URL','ISBN'])
df = df.sample(n=8000, replace=False, random_state=1)

In [3]:
labelenc = LabelEncoder()

In [4]:
def label_encoding(df,param):
    df[param] = labelenc.fit_transform(df[param].values)
    df[param] = df[param].astype('category')

In [5]:
label_encoding(df, 'User_ID')
label_encoding(df, 'Unique_ISBN')

In [6]:
enc_data = df[['User_ID','Unique_ISBN','Book_Rating']]

In [25]:
print ('num of users:',df['User_ID'].nunique())
print ('num of books:',df['Unique_ISBN'].nunique())

num of users: 4424
num of books: 6809


In [26]:
user_ids = np.array(enc_data['User_ID'].tolist())
book_ids = np.array(enc_data['Unique_ISBN'].tolist())
user_ratings = np.array(enc_data["Book_Rating"].tolist())

In [27]:
graph = tf.Graph()
n_book = 6809
n_user = 4424
embedding_size = 30

lr = 0.0001
reg = 0.001

with graph.as_default():
    user = tf.placeholder(tf.int32, name="User_ID") 
    book = tf.placeholder(tf.int32, name="Unique_ISBN") 
    rating = tf.placeholder(tf.float32, name="Book_Rating") 

    book_embedding = tf.Variable(tf.truncated_normal([n_book, embedding_size], stddev=0.02, mean=0.) ,name="Book_Embedding")
    user_embedding = tf.Variable(tf.truncated_normal([n_user, embedding_size], stddev=0.02, mean=0.) ,name="User_Embedding")
    
    book_bias_embedding = tf.Variable(tf.truncated_normal([n_book], stddev=0.02, mean=0.) ,name="book_bias_embedding")
    user_bias_embedding = tf.Variable(tf.truncated_normal([n_user], stddev=0.02, mean=0.) ,name="user_bias_embedding")
    
    
    global_bias = tf.Variable(tf.truncated_normal([], stddev=0.02, mean=0.) ,name="global_bias")
    
    u = tf.nn.embedding_lookup(user_embedding, user)
    m = tf.nn.embedding_lookup(book_embedding, book)
    
    u_bias = tf.nn.embedding_lookup(user_bias_embedding, user)
    m_bias = tf.nn.embedding_lookup(book_bias_embedding, book)
    

    predicted_rating = tf.reduce_sum(tf.multiply(u, m), 1) + u_bias + m_bias + global_bias

    rmse = tf.sqrt(tf.reduce_mean(tf.square(predicted_rating - rating))) # RMSE
    cost = tf.nn.l2_loss(predicted_rating - rating)
    regularization = reg * (tf.nn.l2_loss(book_embedding) + tf.nn.l2_loss(user_embedding)
                            + tf.nn.l2_loss(book_bias_embedding) + tf.nn.l2_loss(user_bias_embedding))
    
    loss = cost + regularization
    
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

In [28]:
batch_size = 2
n_epoch = 50


with tf.Session(graph=graph) as sess:
    tf.initialize_all_variables().run()
    for _ in range(n_epoch):
        for start in range(0, user_ratings.shape[0] - batch_size, batch_size):
            end = start + batch_size
            _, cost_value = sess.run([optimizer, rmse], feed_dict={user: user_ids[start:end],
                                                  book: book_ids[start: end],
                                                  rating: user_ratings[start: end]})

        print ("RMSE", cost_value)
    embeddings = book_embedding.eval()

RMSE 8.101506
RMSE 7.703277
RMSE 7.306424
RMSE 6.9110403
RMSE 6.517368
RMSE 6.125718
RMSE 5.7364936
RMSE 5.3501844
RMSE 4.967443
RMSE 4.589114
RMSE 4.2163067
RMSE 3.8505027
RMSE 3.4936934
RMSE 3.1485133
RMSE 2.8185418
RMSE 2.5084975
RMSE 2.2236516
RMSE 1.9699754
RMSE 1.752543
RMSE 1.5734547
RMSE 1.4313893
RMSE 1.3215111
RMSE 1.2387948
RMSE 1.1773349
RMSE 1.1319624
RMSE 1.0984858
RMSE 1.0737143
RMSE 1.055301
RMSE 1.0415324
RMSE 1.0311364
RMSE 1.0232251
RMSE 1.0171317
RMSE 1.0123851
RMSE 1.0086204
RMSE 1.0055829
RMSE 1.0031289
RMSE 1.0010797
RMSE 0.9993301
RMSE 0.99783385
RMSE 0.996536
RMSE 0.9953724
RMSE 0.9943309
RMSE 0.9933933
RMSE 0.99252576
RMSE 0.9917288
RMSE 0.99099046
RMSE 0.99030924
RMSE 0.98968124
RMSE 0.989106
RMSE 0.9885533


In [37]:
df_rating = df.sample(n=8000, replace=False, random_state=1)
R = pd.pivot_table(df, values='Book_Rating', index=['User_ID'],columns=['Unique_ISBN'], fill_value=0).to_numpy()
print ('{0}x{1} user by user matrix'.format(*R.shape))

4424x6809 user by user matrix


In [38]:
df_rating2 = df.sample(n=8000, replace=False, random_state=1)
df_feature_frame = pd.pivot_table(df_rating2, values='Book_Rating', index=['Unique_ISBN'],columns=['User_ID'], fill_value=0)
df_feature = pd.pivot_table(df_rating2, values='Book_Rating', index=['Unique_ISBN'],columns=['User_ID'], fill_value=0).to_numpy()

print ('{0}x{1} user by book matrix'.format(*df_feature.shape))

6809x4424 user by book matrix


In [39]:

# train, test split
train_user = R[:3539, :885]
test_user = R[3539:, 885:]
train_book = df_feature[:5448, :1361]
test_book = df_feature[5448:, 1361:]

book_svd = TruncatedSVD(n_components=10)
book_features = book_svd.fit_transform(train_book)

print ("book_features.shape = {0}".format(book_features.shape))


user_svd = TruncatedSVD(n_components=10)
user_features = user_svd.fit_transform(train_user)

print ("user_features.shape = {0}".format(user_features.shape))

book_features.shape = (5448, 10)
user_features.shape = (3539, 10)


In [57]:
columns = ["uf{0}".format(i+1) for i in range(10)]
ufs = pd.DataFrame(user_features, columns = columns)
ufs["User_ID"] = ufs.index
print ("len(ufs) = {0}".format(len(ufs)))

columns = ["mf{0}".format(i+1) for i in range(10)]
mfs = pd.DataFrame(book_features, columns = columns)
mfs["Unique_ISBN"] = mfs.index
print ("len(ufs) = {0}".format(len(ufs)))

train_data = df.merge(ufs, on="User_ID") \
    .merge(mfs, on="Unique_ISBN") \
    .drop(["Unique_ISBN", "Unique_ISBN","Book_Title","Book_Author","Publication_year","Publisher","Age","Country","Age_Range"], axis = 1)

print ("len(train_data) = {0}".format(len(train_data)))

len(ufs) = 3539
len(ufs) = 3539
len(train_data) = 5288


In [58]:
train_data

Unnamed: 0,User_ID,Book_Rating,uf1,uf2,uf3,uf4,uf5,uf6,uf7,uf8,...,mf1,mf2,mf3,mf4,mf5,mf6,mf7,mf8,mf9,mf10
0,1319,9,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,-4.901206e-10,-6.578185e-06,-1.205858e-05,1.768841e-05,-1.813331e-05,-1.526908e-04,-2.429389e-05,-3.314454e-06,-0.000584,7.951911e-04
1,1319,10,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,-5.445784e-10,-7.309095e-06,-1.339842e-05,1.965378e-05,-2.014813e-05,-1.696564e-04,-2.699321e-05,-3.682726e-06,-0.000649,8.835456e-04
2,1331,10,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,8.070233e-04,-9.133549e-03,-1.245234e-03,-1.642208e-02,1.330366e-02,3.276841e-02,1.664350e-01,7.353357e-02,0.417826,-4.080528e-01
3,1247,10,1.524639e-07,2.857789e-04,-3.545103e-04,6.559561e-03,0.006188,-3.147451e-03,-0.006032,-0.012284,...,8.070233e-04,-9.133549e-03,-1.245234e-03,-1.642208e-02,1.330366e-02,3.276841e-02,1.664350e-01,7.353357e-02,0.417826,-4.080528e-01
4,306,10,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,8.070233e-04,-9.133549e-03,-1.245234e-03,-1.642208e-02,1.330366e-02,3.276841e-02,1.664350e-01,7.353357e-02,0.417826,-4.080528e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5283,2785,7,-1.022603e-12,-5.398232e-08,2.190244e-07,8.143598e-08,-0.000004,9.306414e-07,-0.000001,-0.000001,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00
5284,33,8,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000,...,2.333344e-09,-2.126520e-08,-4.640530e-09,-7.077944e-08,9.555926e-08,-1.182666e-07,4.932792e-07,-1.102889e-07,0.000001,-1.239381e-06
5285,120,10,2.095547e-08,-6.481280e-06,4.216986e-06,5.941555e-06,-0.000097,-4.194014e-05,0.000295,0.000086,...,-1.508156e-08,2.450514e-07,2.781879e-07,7.619366e-07,-1.945821e-07,2.583962e-06,-1.083153e-05,-6.219042e-06,-0.000028,2.201146e-05
5286,2001,8,1.236874e-09,5.704191e-07,1.961981e-06,2.082267e-05,0.000006,3.560379e-07,-0.000026,-0.000015,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00


In [59]:
targets = np.array(train_data.Book_Rating)
data = np.array(train_data.drop("Book_Rating", axis = 1))

print ("targets.shape = {0}".format(targets.shape))
print ("data.shape = {0}".format(data.shape))

targets.shape = (5288,)
data.shape = (5288, 21)


In [66]:
regressor = GradientBoostingRegressor(learning_rate=0.5, n_estimators=1000, verbose=1)
regressor.fit(data, targets)
print ("Final MSE", math.sqrt(mean_squared_error( regressor.predict(data), targets )))

      Iter       Train Loss   Remaining Time 
         1           3.1814           10.18s
         2           3.0627           10.82s
         3           3.0124           10.79s
         4           2.9681           10.95s
         5           2.9088           10.85s
         6           2.8804           10.81s
         7           2.8547           10.76s
         8           2.8457           10.68s
         9           2.8314           10.61s
        10           2.8166           10.60s
        20           2.6580           12.00s
        30           2.5111           11.80s
        40           2.4003           11.28s
        50           2.3422           10.89s
        60           2.2816           10.53s
        70           2.2202           10.32s
        80           2.1688           10.11s
        90           2.1252            9.90s
       100           2.0785            9.73s
       200           1.7033            8.26s
       300           1.4764            7.22s
       40