In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from surprise import KNNBaseline, SVDpp
from surprise.model_selection import GridSearchCV, cross_validate
from surprise import Dataset, Reader
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, RobustScaler, StandardScaler, MinMaxScaler

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
import math
import tensorflow.compat.v1 as tf
from sklearn.decomposition import TruncatedSVD

In [2]:
df = pd.read_csv('Data/Cleaned_Data/book_user_explicit_rating_cleaned.csv',encoding='UTF-8')
df = df.drop(columns=['index'])
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop(columns=['Image_URL','ISBN'])
df = df.sample(n=8000, replace=False, random_state=1)

In [3]:
labelenc = LabelEncoder()

In [4]:
def label_encoding(df,param):
    df[param] = labelenc.fit_transform(df[param].values)
    df[param] = df[param].astype('category')

In [5]:
label_encoding(df, 'User_ID')
label_encoding(df, 'Unique_ISBN')

In [6]:
enc_data = df[['User_ID','Unique_ISBN','Book_Rating']]

In [7]:
print ('num of users:',df['User_ID'].nunique())
print ('num of books:',df['Unique_ISBN'].nunique())

num of users: 4613
num of books: 4858


In [8]:
user_ids = np.array(enc_data['User_ID'].tolist())
book_ids = np.array(enc_data['Unique_ISBN'].tolist())
user_ratings = np.array(enc_data["Book_Rating"].tolist())

In [9]:
graph = tf.Graph()
n_book = 4858
n_user = 4613
embedding_size = 30

lr = 0.0001
reg = 0.01

with graph.as_default():
    user = tf.placeholder(tf.int32, name="User_ID") 
    book = tf.placeholder(tf.int32, name="Unique_ISBN") 
    rating = tf.placeholder(tf.float32, name="Book_Rating") 

    book_embedding = tf.Variable(tf.truncated_normal([n_book, embedding_size], stddev=0.02, mean=0.) ,name="Book_Embedding")
    user_embedding = tf.Variable(tf.truncated_normal([n_user, embedding_size], stddev=0.02, mean=0.) ,name="User_Embedding")
    
    book_bias_embedding = tf.Variable(tf.truncated_normal([n_book], stddev=0.02, mean=0.) ,name="book_bias_embedding")
    user_bias_embedding = tf.Variable(tf.truncated_normal([n_user], stddev=0.02, mean=0.) ,name="user_bias_embedding")
    
    
    global_bias = tf.Variable(tf.truncated_normal([], stddev=0.02, mean=0.) ,name="global_bias")
    
    u = tf.nn.embedding_lookup(user_embedding, user)
    m = tf.nn.embedding_lookup(book_embedding, book)
    
    u_bias = tf.nn.embedding_lookup(user_bias_embedding, user)
    m_bias = tf.nn.embedding_lookup(book_bias_embedding, book)
    

    predicted_rating = tf.reduce_sum(tf.multiply(u, m), 1) + u_bias + m_bias + global_bias

    rmse = tf.sqrt(tf.reduce_mean(tf.square(predicted_rating - rating))) # RMSE
    cost = tf.nn.l2_loss(predicted_rating - rating)
    regularization = reg * (tf.nn.l2_loss(book_embedding) + tf.nn.l2_loss(user_embedding)
                            + tf.nn.l2_loss(book_bias_embedding) + tf.nn.l2_loss(user_bias_embedding))
    
    loss = cost + regularization
    
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

In [10]:
batch_size = 3
n_epoch = 50


with tf.Session(graph=graph) as sess:
    tf.initialize_all_variables().run()
    for _ in range(n_epoch):
        for start in range(0, user_ratings.shape[0] - batch_size, batch_size):
            end = start + batch_size
            _, cost_value = sess.run([optimizer, rmse], feed_dict={user: user_ids[start:end],
                                                  book: book_ids[start: end],
                                                  rating: user_ratings[start: end]})

        print ("RMSE", cost_value)
    embeddings = book_embedding.eval()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


2022-05-13 05:20:22.961359: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


RMSE 7.777958
RMSE 7.505955
RMSE 7.234772
RMSE 6.964234
RMSE 6.6943717
RMSE 6.425237
RMSE 6.1569133
RMSE 5.889475
RMSE 5.623029
RMSE 5.3576813
RMSE 5.0935755
RMSE 4.8308554
RMSE 4.5697093
RMSE 4.3103685
RMSE 4.0530896
RMSE 3.7982023
RMSE 3.5460896
RMSE 3.2972357
RMSE 3.0522618
RMSE 2.8118665
RMSE 2.577039
RMSE 2.3489945
RMSE 2.1292312
RMSE 1.9196124
RMSE 1.7224097
RMSE 1.540222
RMSE 1.375818
RMSE 1.2316439
RMSE 1.1091907
RMSE 1.0085825
RMSE 0.92989117
RMSE 0.87099427
RMSE 0.8283429
RMSE 0.79824835
RMSE 0.77744234
RMSE 0.7633318
RMSE 0.7539663
RMSE 0.7479346
RMSE 0.7442599
RMSE 0.74222785
RMSE 0.7413635
RMSE 0.74132293
RMSE 0.74184746
RMSE 0.7427727
RMSE 0.74397373
RMSE 0.7453457
RMSE 0.74683213
RMSE 0.7483754
RMSE 0.7499508
RMSE 0.75152516


In [11]:
df_rating = df.sample(n=8000, replace=False, random_state=1)
R = pd.pivot_table(df, values='Book_Rating', index=['User_ID'],columns=['Unique_ISBN'], fill_value=0).to_numpy()
print ('{0}x{1} user by user matrix'.format(*R.shape))

4613x4858 user by user matrix


In [12]:
df_rating2 = df.sample(n=8000, replace=False, random_state=1)
df_feature_frame = pd.pivot_table(df_rating2, values='Book_Rating', index=['Unique_ISBN'],columns=['User_ID'], fill_value=0)
df_feature = pd.pivot_table(df_rating2, values='Book_Rating', index=['Unique_ISBN'],columns=['User_ID'], fill_value=0).to_numpy()

print ('{0}x{1} user by book matrix'.format(*df_feature.shape))

4858x4613 user by book matrix


In [13]:

# train, test split
train_user = R[:3539, :885]
test_user = R[3539:, 885:]
train_book = df_feature[:5448, :1361]
test_book = df_feature[5448:, 1361:]

book_svd = TruncatedSVD(n_components=10)
book_features = book_svd.fit_transform(train_book)

print ("book_features.shape = {0}".format(book_features.shape))


user_svd = TruncatedSVD(n_components=10)
user_features = user_svd.fit_transform(train_user)

print ("user_features.shape = {0}".format(user_features.shape))

book_features.shape = (4858, 10)
user_features.shape = (3539, 10)


In [14]:
columns = ["uf{0}".format(i+1) for i in range(10)]
ufs = pd.DataFrame(user_features, columns = columns)
ufs["User_ID"] = ufs.index
print ("len(ufs) = {0}".format(len(ufs)))

columns = ["mf{0}".format(i+1) for i in range(10)]
bfs = pd.DataFrame(book_features, columns = columns)
bfs["Unique_ISBN"] = bfs.index
print ("len(ufs) = {0}".format(len(ufs)))

train_data = df.merge(ufs, on="User_ID") \
    .merge(bfs, on="Unique_ISBN") \
    .drop(["Unique_ISBN", "Book_Title","Book_Author","Publication_year","Publisher","Age","Country","Age_Range"], axis = 1)

print ("len(train_data) = {0}".format(len(train_data)))

len(ufs) = 3539
len(ufs) = 3539
len(train_data) = 6077


In [15]:
train_data

Unnamed: 0,User_ID,Book_Rating,uf1,uf2,uf3,uf4,uf5,uf6,uf7,uf8,...,mf1,mf2,mf3,mf4,mf5,mf6,mf7,mf8,mf9,mf10
0,1876,9,1.262811e-08,-1.584695e-06,-5.146988e-07,2.367609e-06,-0.000008,0.000008,0.000001,0.000016,...,2.217247e-09,-5.139575e-08,2.814097e-07,-7.244991e-07,-1.892237e-06,1.111194e-06,-1.259206e-06,-7.111684e-06,8.213754e-06,-4.239322e-06
1,3278,9,-1.255322e-08,9.320144e-07,6.116781e-07,-8.151667e-07,0.000005,-0.000006,-0.000002,-0.000004,...,2.217247e-09,-5.139575e-08,2.814097e-07,-7.244991e-07,-1.892237e-06,1.111194e-06,-1.259206e-06,-7.111684e-06,8.213754e-06,-4.239322e-06
2,2310,7,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,2.217247e-09,-5.139575e-08,2.814097e-07,-7.244991e-07,-1.892237e-06,1.111194e-06,-1.259206e-06,-7.111684e-06,8.213754e-06,-4.239322e-06
3,310,9,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,2.217247e-09,-5.139575e-08,2.814097e-07,-7.244991e-07,-1.892237e-06,1.111194e-06,-1.259206e-06,-7.111684e-06,8.213754e-06,-4.239322e-06
4,1876,10,1.262811e-08,-1.584695e-06,-5.146988e-07,2.367609e-06,-0.000008,0.000008,0.000001,0.000016,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6072,3449,7,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
6073,143,8,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,-1.012908e-08,5.555968e-09,3.289475e-09,7.476632e-08,-1.963558e-07,8.919809e-08,-5.123917e-07,-1.901309e-07,1.214308e-06,-1.379525e-06
6074,173,8,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,-5.801265e-09,-7.994722e-08,-3.814622e-07,9.453282e-09,3.793940e-09,2.207450e-08,5.001083e-07,9.187706e-07,-7.233215e-07,1.282461e-06
6075,317,5,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,9.151263e-11,2.407034e-10,6.884360e-10,1.409765e-10,4.455540e-10,-1.792917e-10,-1.830821e-09,-3.309987e-10,-1.361204e-10,-9.834230e-10


In [16]:
targets = np.array(train_data.Book_Rating)
data = np.array(train_data.drop("Book_Rating", axis = 1))

print ("targets.shape = {0}".format(targets.shape))
print ("data.shape = {0}".format(data.shape))

targets.shape = (6077,)
data.shape = (6077, 21)


In [18]:
regressor = GradientBoostingRegressor(learning_rate=0.1, n_estimators=500, verbose=1)
regressor.fit(data, targets)
print ("Final RMSE", math.sqrt(mean_squared_error( regressor.predict(data), targets )))

      Iter       Train Loss   Remaining Time 
         1           3.0245            7.26s
         2           3.0047            7.43s
         3           2.9883            7.40s
         4           2.9735            7.41s
         5           2.9559            7.33s
         6           2.9410            7.19s
         7           2.9267            7.13s
         8           2.9129            7.07s
         9           2.9033            6.99s
        10           2.8907            6.95s
        20           2.7841            6.66s
        30           2.7341            6.42s
        40           2.6985            6.29s
        50           2.6739            6.21s
        60           2.6395            6.09s
        70           2.6087            5.90s
        80           2.5857            5.80s
        90           2.5615            5.70s
       100           2.5372            5.56s
       200           2.3387            4.08s
       300           2.1969            2.70s
       40