In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from surprise import KNNBaseline, SVDpp
from surprise.model_selection import GridSearchCV, cross_validate
from surprise import Dataset, Reader
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, RobustScaler, StandardScaler, MinMaxScaler

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
import math
import tensorflow.compat.v1 as tf
from sklearn.decomposition import TruncatedSVD

In [2]:
df = pd.read_csv('Data/Cleaned_Data/book_user_explicit_rating_cleaned.csv',encoding='UTF-8')
df = df.drop(columns=['index'])
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.drop(columns=['Image_URL','ISBN'])
df = df.sample(n=8000, replace=False, random_state=1)

In [3]:
labelenc = LabelEncoder()

In [4]:
def label_encoding(df,param):
    df[param] = labelenc.fit_transform(df[param].values)
    df[param] = df[param].astype('category')

In [5]:
label_encoding(df, 'User_ID')
label_encoding(df, 'Unique_ISBN')

In [6]:
enc_data = df[['User_ID','Unique_ISBN','Book_Rating']]

In [25]:
print ('num of users:',df['User_ID'].nunique())
print ('num of books:',df['Unique_ISBN'].nunique())

num of users: 4424
num of books: 6809


In [26]:
user_ids = np.array(enc_data['User_ID'].tolist())
book_ids = np.array(enc_data['Unique_ISBN'].tolist())
user_ratings = np.array(enc_data["Book_Rating"].tolist())

In [27]:
graph = tf.Graph()
n_book = 6809
n_user = 4424
embedding_size = 30

lr = 0.0001
reg = 0.001

with graph.as_default():
    user = tf.placeholder(tf.int32, name="User_ID") 
    book = tf.placeholder(tf.int32, name="Unique_ISBN") 
    rating = tf.placeholder(tf.float32, name="Book_Rating") 

    book_embedding = tf.Variable(tf.truncated_normal([n_book, embedding_size], stddev=0.02, mean=0.) ,name="Book_Embedding")
    user_embedding = tf.Variable(tf.truncated_normal([n_user, embedding_size], stddev=0.02, mean=0.) ,name="User_Embedding")
    
    book_bias_embedding = tf.Variable(tf.truncated_normal([n_book], stddev=0.02, mean=0.) ,name="book_bias_embedding")
    user_bias_embedding = tf.Variable(tf.truncated_normal([n_user], stddev=0.02, mean=0.) ,name="user_bias_embedding")
    
    
    global_bias = tf.Variable(tf.truncated_normal([], stddev=0.02, mean=0.) ,name="global_bias")
    
    u = tf.nn.embedding_lookup(user_embedding, user)
    m = tf.nn.embedding_lookup(book_embedding, book)
    
    u_bias = tf.nn.embedding_lookup(user_bias_embedding, user)
    m_bias = tf.nn.embedding_lookup(book_bias_embedding, book)
    

    predicted_rating = tf.reduce_sum(tf.multiply(u, m), 1) + u_bias + m_bias + global_bias

    rmse = tf.sqrt(tf.reduce_mean(tf.square(predicted_rating - rating))) # RMSE
    cost = tf.nn.l2_loss(predicted_rating - rating)
    regularization = reg * (tf.nn.l2_loss(book_embedding) + tf.nn.l2_loss(user_embedding)
                            + tf.nn.l2_loss(book_bias_embedding) + tf.nn.l2_loss(user_bias_embedding))
    
    loss = cost + regularization
    
    optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

In [28]:
batch_size = 2
n_epoch = 50


with tf.Session(graph=graph) as sess:
    tf.initialize_all_variables().run()
    for _ in range(n_epoch):
        for start in range(0, user_ratings.shape[0] - batch_size, batch_size):
            end = start + batch_size
            _, cost_value = sess.run([optimizer, rmse], feed_dict={user: user_ids[start:end],
                                                  book: book_ids[start: end],
                                                  rating: user_ratings[start: end]})

        print ("RMSE", cost_value)
    embeddings = book_embedding.eval()

RMSE 8.101506
RMSE 7.703277
RMSE 7.306424
RMSE 6.9110403
RMSE 6.517368
RMSE 6.125718
RMSE 5.7364936
RMSE 5.3501844
RMSE 4.967443
RMSE 4.589114
RMSE 4.2163067
RMSE 3.8505027
RMSE 3.4936934
RMSE 3.1485133
RMSE 2.8185418
RMSE 2.5084975
RMSE 2.2236516
RMSE 1.9699754
RMSE 1.752543
RMSE 1.5734547
RMSE 1.4313893
RMSE 1.3215111
RMSE 1.2387948
RMSE 1.1773349
RMSE 1.1319624
RMSE 1.0984858
RMSE 1.0737143
RMSE 1.055301
RMSE 1.0415324
RMSE 1.0311364


In [None]:
df_rating = enc_data.sample(n=8000, replace=False, random_state=1)
R = pd.pivot_table(df_rating, values='Book_Rating', index=['User_ID'],columns=['Unique_ISBN'], fill_value=0).to_numpy()
print ('{0}x{1} user by user matrix'.format(*R.shape))

In [None]:
df_rating2 = enc_data.sample(n=8000, replace=False, random_state=1)
df_feature_frame = pd.pivot_table(df_rating2, values='Book_Rating', index=['Unique_ISBN'],columns=['User_ID'], fill_value=0)
df_feature = pd.pivot_table(df_rating2, values='Book_Rating', index=['Unique_ISBN'],columns=['User_ID'], fill_value=0).to_numpy()

print ('{0}x{1} user by book matrix'.format(*df_feature.shape))

In [None]:

# train, test split
train_user = R[:3539, :885]
test_user = R[3539:, 885:]
train_book = df_feature[:5448, :1361]
test_book = df_feature[5448:, 1361:]

book_svd = TruncatedSVD(n_components=10)
book_features = book_svd.fit_transform(train_book)

print ("book_features.shape = {0}".format(book_features.shape))


user_svd = TruncatedSVD(n_components=10)
user_features = user_svd.fit_transform(train_user)

print ("user_features.shape = {0}".format(user_features.shape))

In [None]:
df_rating_avg = df.groupby(['Unique_ISBN'])['Book_Rating'].mean().reset_index()
df_rating_avg['Book_Rating'] = df_rating_avg['Book_Rating'].round(1)
df_rating_avg['Unique_ISBN'] = df_rating_avg['Unique_ISBN'].astype(str)
df_rating_ = df_rating_avg.set_index('Unique_ISBN')
df_train = pd.merge(df_feature_frame, df_rating_, left_index=True, right_index=True, how='right')
df_train.dropna(inplace=True)
targets = np.array(df_train.Book_Rating)
data = np.array(df_train.drop(['rating'], axis = 1))

print ("targets.shape = {0}".format(targets.shape))
print ("data.shape = {0}".format(data.shape))