In [1]:
# ! python -m spacy download en_core_web_sm
# ! pip install pandas numpy scikit-learn transformers spacy

In [2]:
# Library
import ast
import re

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the data
# The dataset directory is named once so the notebook can be pointed at another
# location by editing a single line instead of every read_csv call.
DATA_DIR = '/Users/yuting/Desktop/archive'
books = pd.read_csv(f'{DATA_DIR}/books_data.csv')      # book metadata
ratings = pd.read_csv(f'{DATA_DIR}/Books_rating.csv')  # individual reviews
print(ratings.shape)
print(books.shape)

(3000000, 10)
(212404, 10)


In [4]:
# Show the first row of the book metadata to inspect its columns
books.head(1)

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],


In [5]:
# Show the first row of the reviews table to inspect its columns
ratings.head(1)

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...


In [6]:
# Attach each book's metadata to its reviews, matching rows on the shared 'Title' column
data = ratings.merge(books, on='Title')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000000 entries, 0 to 2999999
Data columns (total 19 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price               float64
 3   User_id             object 
 4   profileName         object 
 5   review/helpfulness  object 
 6   review/score        float64
 7   review/time         int64  
 8   review/summary      object 
 9   review/text         object 
 10  description         object 
 11  authors             object 
 12  image               object 
 13  previewLink         object 
 14  publisher           object 
 15  publishedDate       object 
 16  infoLink            object 
 17  categories          object 
 18  ratingsCount        float64
dtypes: float64(3), int64(1), object(15)
memory usage: 457.8+ MB


## Data Preprocessing 

In [7]:
# Count the missing values in every column
missing_per_column = data.isnull().sum()
print("Missing values:\n", missing_per_column)
print("")
# Count the rows that are exact duplicates of an earlier row
duplicate_row_count = data.duplicated().sum()
print("Duplicates:\n", duplicate_row_count)

Missing values:
 Id                          0
Title                     208
Price                 2518829
User_id                561787
profileName            561886
review/helpfulness          0
review/score                0
review/time                 0
review/summary             38
review/text                 8
description            640225
authors                390634
image                  540306
previewLink            330623
publisher              782617
publishedDate          354581
infoLink               330623
categories             551498
ratingsCount          1360694
dtype: int64

Duplicates:
 8774


In [8]:
# Remove rows with any missing value first, then remove exact duplicate rows
data = data.dropna().drop_duplicates()
# Report the size of the cleaned frame
print(data.shape)

(174094, 19)


Because the merged dataset is very large, I drop every row that contains a missing value, as well as all duplicate rows, to reduce the data size.

In [9]:
# Keep only the columns used in the rest of the notebook.
# .copy() makes the subset an independent DataFrame, so the column assignments
# in later cells modify it directly instead of raising SettingWithCopyWarning.
data = data[['Title', 'review/score', 'review/text', 'description', 'authors', 'categories', 'publishedDate']].copy()
data.head(2)

Unnamed: 0,Title,review/score,review/text,description,authors,categories,publishedDate
47,The Church of Christ: A Biblical Ecclesiology ...,5.0,With the publication of Everett Ferguson's boo...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],['Religion'],1996
48,The Church of Christ: A Biblical Ecclesiology ...,5.0,Everett Ferguson approaches the subject of ear...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],['Religion'],1996


In [10]:
# #create a new column for description and text
# data['description_review'] = data['review/text'].map(str) + ' ' + data['description']
# data.head(1)

In [11]:
def _flatten_list_literal(value):
    """Convert a stringified Python list such as "['A', 'B']" into "A, B".

    The authors and categories columns hold the text form of a Python list.
    Parsing that literal handles multi-item lists and names that contain an
    apostrophe (written with double quotes, e.g. ["Tim O'Reilly"]), neither of
    which a greedy quote-to-quote regex extracts cleanly.

    Non-string values are returned unchanged. Text that is not a valid Python
    literal is returned with its surrounding brackets and quotes trimmed.
    """
    if not isinstance(value, str):
        return value
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        # Not parseable as a literal: fall back to trimming the wrapping characters.
        return value.strip("[]'\" ")
    if isinstance(parsed, (list, tuple)):
        return ', '.join(str(item) for item in parsed)
    return str(parsed)


# Remove the brackets and quotes around the author names
data['authors'] = data['authors'].map(_flatten_list_literal)
# Remove the brackets and quotes around the categories, then drop '&'
# (regex=False: '&' is a literal character, independent of the pandas default)
data['categories'] = data['categories'].map(_flatten_list_literal)
data['categories'] = data['categories'].str.replace('&', '', regex=False)
data.head(2)

Unnamed: 0,Title,review/score,review/text,description,authors,categories,publishedDate
47,The Church of Christ: A Biblical Ecclesiology ...,5.0,With the publication of Everett Ferguson's boo...,In The Church of Christ: A Biblical Ecclesiolo...,Everett Ferguson,Religion,1996
48,The Church of Christ: A Biblical Ecclesiology ...,5.0,Everett Ferguson approaches the subject of ear...,In The Church of Christ: A Biblical Ecclesiolo...,Everett Ferguson,Religion,1996


In [12]:
# English stop-word list from NLTK, used by normalize_document to filter tokens.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be installed already
# (e.g. via nltk.download('stopwords')); no download step is visible in this notebook.
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    """Return a cleaned, lower-cased, stop-word-free version of ``doc``.

    Steps: replace every character that is not an ASCII letter, digit or
    whitespace with a space; lower-case and strip the text; tokenize it with
    ``nltk.word_tokenize``; drop tokens found in the module-level
    ``stop_words`` list; re-join the remaining tokens with single spaces.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    str
        The normalized document.
    """
    # Replace special characters with spaces.
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I|re.A positionally capped the
    # substitutions at 258 per document and applied no flags at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', " ", doc, flags=re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()

    # tokenize document
    tokens = nltk.word_tokenize(doc)

    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

# Element-wise wrapper around normalize_document.
# otypes=[object] keeps the results as ordinary Python strings. Without it,
# np.vectorize infers a fixed-width unicode dtype, padding every entry to the
# length of the longest document, and it raises on an empty input.
normalize_corpus = np.vectorize(normalize_document, otypes=[object])
# Normalize every book description (one entry per row of `data`)
norm_corpus = normalize_corpus(list(data['description']))
len(norm_corpus)

174094

In [13]:
# Turn the normalized descriptions into TF-IDF features:
# unigrams and bigrams, keeping only terms that occur in at least two documents
tf = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
tfidf_matrix = tf.fit_transform(norm_corpus)
tfidf_matrix.shape

(174094, 493946)

In [17]:
# plot of rating score
colors = ['gold', 'mediumturquoise','brown']
# Count the reviews per score once and reuse the result for labels and values.
score_counts = data['review/score'].value_counts()
labels = score_counts.index.map(str)
# Raw counts are passed straight to the pie, which converts them to
# percentages itself; dividing by the number of distinct scores (as before)
# changed nothing on the chart and had no meaningful interpretation.
values = score_counts

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
# NOTE(review): only three colours are listed; confirm this is intended if
# the data contains more than three distinct scores.
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=18,
                  marker=dict(colors=colors, line=dict(color='white', width=0.1)))
fig.update_layout(
    title={'text': "Distribution of Review Scores", 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
    annotations=[dict(x=0.5, y=-0.1, showarrow=False, text="Source: Amazon book review score", font=dict(size=12, color="grey"))],
    width=500, height=500
)
fig.show()

In [18]:
# Bar chart of the 50 titles with the most review scores
user_per_book = (
    data.groupby('Title')['review/score']
    .count()
    .sort_values(ascending=False)
)
fig = px.bar(user_per_book.head(50))
fig.update_layout(
    title={'text': "Top 50 Books by Ratings", 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
    height=550,
)
fig.show()

In [24]:
# Replace the raw text columns with their normalized versions:
# review texts are normalized here; descriptions reuse the corpus computed earlier
data['review/text'] = normalize_corpus(data['review/text'].tolist())
data['description'] = norm_corpus

In [25]:
# save new dataset
# Written as 'cleaned_data.csv' (a relative path); index=False omits the row index from the file.
data.to_csv('cleaned_data.csv', index=False)