In [1]:
# Tensorflow imports
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Misc
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re

### Import Data

First download the data from kaggle.
Then load that data into a dataframe.

In [2]:
#path = kagglehub.dataset_download("mohamedbakhet/amazon-books-reviews")

In [2]:
book_data = pd.read_csv('data/books_data.csv')
books_rating_data = pd.read_csv('data/Books_rating.csv')

### Explore the dataset

In [3]:
len(book_data), len(books_rating_data)

(212404, 3000000)

In [4]:
book_data.columns, books_rating_data.columns

(Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
        'publishedDate', 'infoLink', 'categories', 'ratingsCount'],
       dtype='object'),
 Index(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/helpfulness',
        'review/score', 'review/time', 'review/summary', 'review/text'],
       dtype='object'))

#### A look of the Book data

In [5]:
book_data[:3]

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],


In [6]:
book_data.info(), book_data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   image          160329 non-null  object 
 4   previewLink    188568 non-null  object 
 5   publisher      136518 non-null  object 
 6   publishedDate  187099 non-null  object 
 7   infoLink       188568 non-null  object 
 8   categories     171205 non-null  object 
 9   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(9)
memory usage: 16.2+ MB


(None,
 Title                 1
 description       68442
 authors           31413
 image             52075
 previewLink       23836
 publisher         75886
 publishedDate     25305
 infoLink          23836
 categories        41199
 ratingsCount     162652
 dtype: int64)

In [7]:
book_data.nunique()

Title            212403
description      133226
authors          127278
image            149387
previewLink      188099
publisher         16016
publishedDate     11582
infoLink         184506
categories        10883
ratingsCount        478
dtype: int64

Looking into Ratings count

In [8]:
book_data['ratingsCount'].aggregate(['max', 'min'])

max    4895.0
min       1.0
Name: ratingsCount, dtype: float64

Looking into publisher

In [9]:
book_data['publisher'].value_counts()

publisher
Simon and Schuster                                                     3454
Penguin                                                                2825
Routledge                                                              2394
John Wiley & Sons                                                      2031
Harper Collins                                                         1911
                                                                       ... 
Georges Boka Editeur                                                      1
Van Riebeeck Society, The                                                 1
Post Madison Pub                                                          1
Honolulu : Department of Anthropology, Bernice Pauahi Bishop Museum       1
New York : Lothrop, Lee & Shepard                                         1
Name: count, Length: 16016, dtype: int64

Looking into author

In [10]:
book_data['authors'].value_counts()

authors
['Rose Arny']                                        236
['William Shakespeare']                              191
['Library of Congress. Copyright Office']            178
['Agatha Christie']                                  142
['Erle Stanley Gardner']                             124
                                                    ... 
['Nancy Milio']                                        1
['Michael Solomon Alexander (bp. of Jerusalem.)']      1
['Thacher Hurd', 'John Cassidy']                       1
['Bobby Winters']                                      1
['Benjamin Friedlander']                               1
Name: count, Length: 127278, dtype: int64

#### A look of the Rating data

In [11]:
books_rating_data[:3]

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."


In [12]:
books_rating_data.info(), books_rating_data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price               float64
 3   User_id             object 
 4   profileName         object 
 5   review/helpfulness  object 
 6   review/score        float64
 7   review/time         int64  
 8   review/summary      object 
 9   review/text         object 
dtypes: float64(2), int64(1), object(7)
memory usage: 228.9+ MB


(None,
 Id                          0
 Title                     208
 Price                 2518829
 User_id                561787
 profileName            561905
 review/helpfulness          0
 review/score                0
 review/time                 0
 review/summary            407
 review/text                 8
 dtype: int64)

In [13]:
books_rating_data.nunique()

Id                     221998
Title                  212403
Price                    6004
User_id               1008972
profileName            854145
review/helpfulness      12084
review/score                5
review/time              6272
review/summary        1592314
review/text           2062648
dtype: int64

Looking into the data while grouped by the user id, Easier to digest information based on singular users rather than all reviews.

In [14]:
grouped_by_user = books_rating_data.groupby(by='User_id')

In [15]:
"""
Perform bayesian average of score based on user
This is because some users only review once and 
never again and this skews individual user review data
"""

def bayesian_avg(s):
    weight = 10
    
    # Calculate the sum of the values and the count of observations
    observed_sum = s.sum()
    observed_count = s.count()
    
    prior_mean = observed_sum / observed_count

    # Calculate the Bayesian average
    bayesian_avg = (observed_sum + prior_mean * weight) / (observed_count + weight)
    
    return bayesian_avg

#grouped_by_user['review/score'].apply(bayesian_avg)

Looking into data while grouped by the book

In [16]:
grouped_by_title = books_rating_data.groupby(by='Title')

In [17]:
#grouped_by_title['review/score'].apply(bayesian_avg)

Merge data together

In [18]:
# Remove entries with na as title.
brd = books_rating_data.dropna(subset=('Title'))
bd = book_data.dropna(subset=('Title'))

In [19]:
# Preprocessing methods
def preprocess_categories(row):
    # If nan
    if isinstance(row, float):
        return ''
    m = re.match(r"\['(.*?)'\]", row)
    if m:
        return m.group(1)
    return ''

In [20]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# Preprocessing

merged = bd.merge(brd, how='left', on='Title')
merged_book_data = merged[['Title', 'categories', 'review/score', 'description', 'authors']]
merged_book_data.loc[:,'categories'] = merged_book_data['categories'].apply(preprocess_categories)
merged_book_data['description'].fillna('', inplace=True)
merged_book_data['authors'].fillna('', inplace=True)
merged_book_data['combined_text_features'] = '' + merged_book_data['description'] + ' ' + \
    merged_book_data['categories'] + ' ' + merged_book_data['authors']

grouped_by_title = merged_book_data.groupby('Title')

# Average/Normalize Ratings of books
merged_book_data['average rating'] = grouped_by_title['review/score'].transform(lambda x : round(x.mean(), 2))
merged_book_data['average rating'] = MinMaxScaler((0, 5)).fit_transform(merged_book_data[['average rating']])
merged_book_data['Title'] = merged_book_data['Title'].astype('category')

merged_book_data.reset_index(drop=True, inplace=True)
#merged_book_data['average price'] = grouped_by_title.transform() # TODO

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_book_data['description'].fillna('', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_book_data['description'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df

In [39]:
merged_book_data[300:330]

Unnamed: 0,Title,categories,review/score,description,authors,combined_text_features,average rating
300,Pyrography Designs,Art,5.0,This book offers 30 North American wildlife il...,['Sue Walters'],This book offers 30 North American wildlife il...,4.0625
301,Pyrography Designs,Art,4.0,This book offers 30 North American wildlife il...,['Sue Walters'],This book offers 30 North American wildlife il...,4.0625
302,"The ancient constitution and the feudal law,: ...",History,4.0,Pocock explores the relationship between the s...,['J. G. A. Pocock'],Pocock explores the relationship between the s...,3.75
303,Anna Karenina. (Lernmaterialien),Fiction,1.0,"De 16-jarige vrienden Red (ik-figuur), Leo, Ro...",['Cara Delevingne'],"De 16-jarige vrienden Red (ik-figuur), Leo, Ro...",0.0
304,Seance,Fiction,5.0,"Mysterieus, duister en passioneel Londen, 19de...",['Kevin Valgaeren'],"Mysterieus, duister en passioneel Londen, 19de...",4.0625
305,Seance,Fiction,5.0,"Mysterieus, duister en passioneel Londen, 19de...",['Kevin Valgaeren'],"Mysterieus, duister en passioneel Londen, 19de...",4.0625
306,Seance,Fiction,3.0,"Mysterieus, duister en passioneel Londen, 19de...",['Kevin Valgaeren'],"Mysterieus, duister en passioneel Londen, 19de...",4.0625
307,Seance,Fiction,4.0,"Mysterieus, duister en passioneel Londen, 19de...",['Kevin Valgaeren'],"Mysterieus, duister en passioneel Londen, 19de...",4.0625
308,"Open marriage;: A new life style for couples,",Family & Relationships,5.0,Advocates the importance of individuality in b...,"[""Nena O'Neill"", ""George O'Neill""]",Advocates the importance of individuality in b...,3.375
309,"Open marriage;: A new life style for couples,",Family & Relationships,5.0,Advocates the importance of individuality in b...,"[""Nena O'Neill"", ""George O'Neill""]",Advocates the importance of individuality in b...,3.375


In [23]:

# Compute similarity of text features
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(merged_book_data['combined_text_features'])

#merged_book_data.drop(['review/score', 'description', 'authors', 'categories'],axis=1, inplace=True)
#merged_book_data.drop_duplicates(inplace=True)

In [41]:
merged_book_data[:5]

Unnamed: 0,Title,categories,review/score,description,authors,combined_text_features,average rating
0,Its Only Art If Its Well Hung!,Comics & Graphic Novels,4.0,,['Julie Strain'],Comics & Graphic Novels ['Julie Strain'],3.75
1,Dr. Seuss: American Icon,Biography & Autobiography,5.0,Philip Nel takes a fascinating look into the k...,['Philip Nel'],Philip Nel takes a fascinating look into the k...,4.45
2,Dr. Seuss: American Icon,Biography & Autobiography,5.0,Philip Nel takes a fascinating look into the k...,['Philip Nel'],Philip Nel takes a fascinating look into the k...,4.45
3,Dr. Seuss: American Icon,Biography & Autobiography,4.0,Philip Nel takes a fascinating look into the k...,['Philip Nel'],Philip Nel takes a fascinating look into the k...,4.45
4,Dr. Seuss: American Icon,Biography & Autobiography,4.0,Philip Nel takes a fascinating look into the k...,['Philip Nel'],Philip Nel takes a fascinating look into the k...,4.45


In [25]:
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 138751918 stored elements and shape (2999792, 236277)>

### Feature Engineering

Most of the features initially will be removed as to simplify the recommender system

### Data Formatting - Tensorflow

Set up a shuffled dataset

In [26]:
#ratings_tf = tf.data.Dataset.from_tensor_slices(dict(merged_book_data))

In [27]:
#tf.random.set_seed(42)
#shuffled = ratings_tf.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

#train = shuffled.take(80_000)
#test = shuffled.skip(80_000).take(20_000)

In [28]:
#final_data = merged_book_data.set_index(merged_book_data['Title'])
#final_data = final_data.drop('Title', axis=1)

In [29]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

X = tfidf_matrix

X_train, X_test = train_test_split(X, random_state=1)

#ohe = OneHotEncoder(handle_unknown='ignore')
#X_train = ohe.fit_transform(X_train)
#X_test = ohe.transform(X_test)

#encoded_data = merged_book_data

In [30]:
X_train[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23 stored elements and shape (1, 236277)>

### Building Recommender System - Tensorflow

This recommender system will use an embedding model for the model to learn embeddings unsupervised and then from these embeddings run KNN.

First, I'm going to set up a base case of just running KNN.

In [31]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric='cosine')
knn.fit(X_train)

In [36]:
def recommend_books_knn(title, books_data, knn_model, tfidf_matrix, n_recommendations=5):
    idx = books_data[books_data['Title'] == title].index[0]
    book_vector = tfidf_matrix[idx]
    distances, indices = knn_model.kneighbors(book_vector, n_neighbors=n_recommendations+1)
    
    recommended_indices = indices[0][1:]
    recommendations = books_data.iloc[recommended_indices]
    return recommendations

In [57]:
merged_book_data[550:600]

Unnamed: 0,Title,categories,review/score,description,authors,combined_text_features,average rating
550,America at 1750: A Social Portrait,History,2.0,Demonstrates how the colonies developed into t...,['Richard Hofstadter'],Demonstrates how the colonies developed into t...,3.475
551,Death Dream,"Body, Mind & Spirit",5.0,"The authors probe the phenomenon of ""pre-death...","['Kelly Bulkeley', 'Patricia Bulkley']","The authors probe the phenomenon of ""pre-death...",3.3
552,Death Dream,"Body, Mind & Spirit",3.0,"The authors probe the phenomenon of ""pre-death...","['Kelly Bulkeley', 'Patricia Bulkley']","The authors probe the phenomenon of ""pre-death...",3.3
553,Death Dream,"Body, Mind & Spirit",4.0,"The authors probe the phenomenon of ""pre-death...","['Kelly Bulkeley', 'Patricia Bulkley']","The authors probe the phenomenon of ""pre-death...",3.3
554,Death Dream,"Body, Mind & Spirit",4.0,"The authors probe the phenomenon of ""pre-death...","['Kelly Bulkeley', 'Patricia Bulkley']","The authors probe the phenomenon of ""pre-death...",3.3
555,Death Dream,"Body, Mind & Spirit",5.0,"The authors probe the phenomenon of ""pre-death...","['Kelly Bulkeley', 'Patricia Bulkley']","The authors probe the phenomenon of ""pre-death...",3.3
556,Death Dream,"Body, Mind & Spirit",3.0,"The authors probe the phenomenon of ""pre-death...","['Kelly Bulkeley', 'Patricia Bulkley']","The authors probe the phenomenon of ""pre-death...",3.3
557,Death Dream,"Body, Mind & Spirit",1.0,"The authors probe the phenomenon of ""pre-death...","['Kelly Bulkeley', 'Patricia Bulkley']","The authors probe the phenomenon of ""pre-death...",3.3
558,Death Dream,"Body, Mind & Spirit",4.0,"The authors probe the phenomenon of ""pre-death...","['Kelly Bulkeley', 'Patricia Bulkley']","The authors probe the phenomenon of ""pre-death...",3.3
559,Death Dream,"Body, Mind & Spirit",4.0,"The authors probe the phenomenon of ""pre-death...","['Kelly Bulkeley', 'Patricia Bulkley']","The authors probe the phenomenon of ""pre-death...",3.3


In [58]:
example_title = merged_book_data['Title'].iloc[576]
matrix = recommend_books_knn(example_title, merged_book_data, knn, tfidf_matrix)
print("Recommending books based on: ", example_title)
matrix

Recommending books based on:  Information Theory in Analytical Chemistry (Chemical Analysis: A Series of Monographs on Analytical Chemistry and Its Applications)


Unnamed: 0,Title,categories,review/score,description,authors,combined_text_features,average rating
64792,Night,Juvenile Fiction,5.0,Here is the wonderful new version of the class...,['Clement Moore'],Here is the wonderful new version of the class...,4.4625
1209221,Midnight Secrets (Berkley Sensation),,5.0,,,,4.2
1262219,We never went to the moon,,3.0,,,,2.0625
338198,A Clockwork Orange,Juvenile Fiction,5.0,In de nabije toekomst maken jongeren de strate...,['Anthony Burgess'],In de nabije toekomst maken jongeren de strate...,4.4625
65222,Night,Juvenile Fiction,4.0,Here is the wonderful new version of the class...,['Clement Moore'],Here is the wonderful new version of the class...,4.4625
