In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the data
data = pd.read_csv("data/goodreads_book.csv")
# Only keep columns we need
data.head()     

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,RatingDist4,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,pagesNumber,Description,Count of text reviews
0,1000000,Flight from Eden,Kathryn A. Graham,0595199402,4.0,2001,1,10,Writer's Showcase Press,5:1,4:1,3:1,2:0,1:0,total:3,1,,380,"What could a computer expert, a mercenary with...",1
1,1000001,Roommates Again,Kathryn O. Galbraith,0689505973,3.2,1994,1,4,Margaret K. McElderry Books,5:0,4:3,3:1,2:0,1:1,total:5,1,,44,"During their stay at Camp Sleep-Away, sisters ...",1
2,1000003,The King At The Door,Brock Cole,0374440417,3.95,1992,31,12,Farrar Straus Giroux,5:5,4:9,3:4,2:1,1:0,total:19,0,,32,A poorly dressed old man appears at an inn and...,0
3,1000004,"Giotto: The Scrovegni Chapel, Padua",Bruce Cole,080761310X,4.47,1993,1,8,George Braziller,5:9,4:5,3:0,2:1,1:0,total:15,2,,118,This beautiful series lavishly illustrates the...,2
4,1000005,Larky Mavis,Brock Cole,0374343659,3.69,2001,3,8,"Farrar, Straus and Giroux (BYR)",5:19,4:12,3:9,2:7,1:4,total:51,8,,32,<b>Another orginal picture-book fairy tale</b>...,8


In [3]:
# Check column names
data.columns

Index(['Id', 'Name', 'Authors', 'ISBN', 'Rating', 'PublishYear',
       'PublishMonth', 'PublishDay', 'Publisher', 'RatingDist5', 'RatingDist4',
       'RatingDist3', 'RatingDist2', 'RatingDist1', 'RatingDistTotal',
       'CountsOfReview', 'Language', 'pagesNumber', 'Description',
       'Count of text reviews'],
      dtype='object')

---
### Filter Columns

In [4]:
# Only keep columns we need
eda_data = data[['Id', 'Name', 'Authors', 'ISBN', 'Rating', 'PublishYear', 'Publisher', 'Language', 'Description']].copy()
eda_data.head()     

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,Publisher,Language,Description
0,1000000,Flight from Eden,Kathryn A. Graham,0595199402,4.0,2001,Writer's Showcase Press,,"What could a computer expert, a mercenary with..."
1,1000001,Roommates Again,Kathryn O. Galbraith,0689505973,3.2,1994,Margaret K. McElderry Books,,"During their stay at Camp Sleep-Away, sisters ..."
2,1000003,The King At The Door,Brock Cole,0374440417,3.95,1992,Farrar Straus Giroux,,A poorly dressed old man appears at an inn and...
3,1000004,"Giotto: The Scrovegni Chapel, Padua",Bruce Cole,080761310X,4.47,1993,George Braziller,,This beautiful series lavishly illustrates the...
4,1000005,Larky Mavis,Brock Cole,0374343659,3.69,2001,"Farrar, Straus and Giroux (BYR)",,<b>Another orginal picture-book fairy tale</b>...


In [5]:
# Find number of smaples and features
eda_data.shape

(39705, 9)

In [6]:
# Check data types of features
eda_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39705 entries, 0 to 39704
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           39705 non-null  int64  
 1   Name         39705 non-null  object 
 2   Authors      39705 non-null  object 
 3   ISBN         39577 non-null  object 
 4   Rating       39705 non-null  float64
 5   PublishYear  39705 non-null  int64  
 6   Publisher    39345 non-null  object 
 7   Language     7013 non-null   object 
 8   Description  34559 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 2.7+ MB


In [7]:
# Quantiles Summary
eda_data.describe()

Unnamed: 0,Id,Rating,PublishYear
count,39705.0,39705.0,39705.0
mean,1049590.0,3.412516,1998.979751
std,28716.78,1.302399,8.39099
min,1000000.0,0.0,1898.0
25%,1025072.0,3.4,1996.0
50%,1049176.0,3.82,2001.0
75%,1074358.0,4.08,2005.0
max,1099996.0,5.0,2030.0


---
---
# Data Cleaning
### Find missing values and replace with appropriate substitution.

In [8]:
# Find number of NaN in each column
eda_data.isna().sum()

Id                 0
Name               0
Authors            0
ISBN             128
Rating             0
PublishYear        0
Publisher        360
Language       32692
Description     5146
dtype: int64

In [9]:
eda_data.Language.unique()

array([nan, 'eng', 'fre', 'en-US', 'en-GB', 'spa', 'ger', 'ita', 'jpn',
       'en-CA', 'nl', 'lat', 'cat', 'cze', 'grc', 'por', 'per', 'rus',
       'mul', 'swe', 'ind', 'rum', 'raj', 'ang', 'afr', 'eus', 'zho',
       'ypk', 'gle', 'frm', 'tur'], dtype=object)

---
### Replace missing publisher info with unknown

In [10]:
eda_data[["Publisher"]] = eda_data[["Publisher"]].fillna("unknown")

In [11]:
eda_data.isna().sum()

Id                 0
Name               0
Authors            0
ISBN             128
Rating             0
PublishYear        0
Publisher          0
Language       32692
Description     5146
dtype: int64

---
### Remove rows with missing description

In [12]:
# Remove rows having atleast ine NaN in any feature
# eda_data.dropna(axis=0, how="any", inplace=True)
eda_data.dropna(subset=["Description"], inplace=True)

---
### Remove URLs from the description

In [13]:
list(eda_data.Description[eda_data.Id == 1099555]) #Description with url and html tag

['<i>Alternate Cover Edition can be found <a href="https://www.goodreads.com/book/show/38559855" rel="nofollow">here</a></i><br /><br />Das Böse hält keinen Winterschlaf.<br />Kathy Reichs auch nicht.<br /><br /><br />Was könnte frostiger sein als ein kanadischer Dezembersturm? Tempe Brennan, forensische Anthropologin in Montreal, wird an einem tristen Montagmorgen zu einem Fundort gerufen, der ihr das Blut in den Adern gefrieren lässt. Verscharrt in einem Kellergewölbe liegen die Leichen dreier junger Frauen. Nicht eine Gewebefaser, kein Fetzen Kleidung geben Aufschluss darüber, wann und warum diese Mädchen sterben mussten. Nur dank akribischer Ermittlungen und weiblicher Intuition kommt Tempe dem Mörder auf die Spur. Doch sie muss auf alles gefasst sein, denn ihr Gegner ist an Kaltblütigkeit nicht zu übertreffen …<br /><br /><br />Tempe Brennans siebter Fall.<br /><br /><br /><br /><br />']

In [14]:
import re
url_pattern = re.compile(r'https?://\S+|www\.\S+')
def remove_url(text):   
    return re.sub(url_pattern, r'', text)

eda_data.Description = eda_data.Description.apply(remove_url)

---
### Remove html tags from the description.

In [15]:
html_pattern = re.compile('<[^>]*>')
def clean_html_tags(text):
    return re.sub(html_pattern, r'', text)

eda_data.Description = eda_data.Description.apply(clean_html_tags)

In [16]:
list(eda_data.Description[eda_data.Id == 1099555])

['Alternate Cover Edition can be found hereDas Böse hält keinen Winterschlaf.Kathy Reichs auch nicht.Was könnte frostiger sein als ein kanadischer Dezembersturm? Tempe Brennan, forensische Anthropologin in Montreal, wird an einem tristen Montagmorgen zu einem Fundort gerufen, der ihr das Blut in den Adern gefrieren lässt. Verscharrt in einem Kellergewölbe liegen die Leichen dreier junger Frauen. Nicht eine Gewebefaser, kein Fetzen Kleidung geben Aufschluss darüber, wann und warum diese Mädchen sterben mussten. Nur dank akribischer Ermittlungen und weiblicher Intuition kommt Tempe dem Mörder auf die Spur. Doch sie muss auf alles gefasst sein, denn ihr Gegner ist an Kaltblütigkeit nicht zu übertreffen …Tempe Brennans siebter Fall.']

---
### Find book descriptions with very short length and remove them.

In [17]:
eda_data["length"] = [len(d.split()) for d in eda_data['Description'].tolist()]
eda_data[eda_data.length.isin(range(0,4))][["Id", "Name", "Description", "length"]]\
.sort_values(by=["length"], ascending=True).head(10)

Unnamed: 0,Id,Name,Description,length
2327,1005785,Tag-along Tails: Ring Tailed Lemur,,0
30582,1076433,Buddhism,,0
636,1001461,Barbarism,Poetry,1
36831,1092143,Nighthawk F-117 Stealth Fighter,New,1
31889,1079751,Mahmoud Ahmadinejad: President of Iran,##############################################...,1
28972,1072319,The Milk Group,none,1
28889,1072133,Frequently Asked Questions about Being Gifted,##############################################...,1
28887,1072128,Frequently Asked Questions about Stuttering,##############################################...,1
27518,1068566,Even More True Stories: An Intermediate Reader,textformat=02&gt;,1
27371,1068180,The Visual Guide to College Composition,textformat=02&gt;,1


- As we have removed tags we see empty strings in the description, lets first convert them to NaN and then remove them

In [18]:
# Replace empty strings of description with NaN
eda_data.Description = eda_data.Description.replace(r'^\s*$', np.nan, regex=True)

eda_data[eda_data.length.isin(range(0,4))][["Id", "Name", "Description", "length"]]\
.sort_values(by=["length"], ascending=True).head()

Unnamed: 0,Id,Name,Description,length
2327,1005785,Tag-along Tails: Ring Tailed Lemur,,0
30582,1076433,Buddhism,,0
636,1001461,Barbarism,Poetry,1
36831,1092143,Nighthawk F-117 Stealth Fighter,New,1
31889,1079751,Mahmoud Ahmadinejad: President of Iran,##############################################...,1


In [19]:
eda_data.dropna(subset=["Description"], inplace=True)

eda_data[eda_data.length.isin(range(0,4))][["Id", "Name", "Description", "length"]]\
.sort_values(by=["length"], ascending=True).head()

Unnamed: 0,Id,Name,Description,length
636,1001461,Barbarism,Poetry,1
36831,1092143,Nighthawk F-117 Stealth Fighter,New,1
31889,1079751,Mahmoud Ahmadinejad: President of Iran,##############################################...,1
28972,1072319,The Milk Group,none,1
28889,1072133,Frequently Asked Questions about Being Gifted,##############################################...,1


In [20]:
print(set(eda_data.Description[eda_data.length.isin(range(0,4))]))

{'vol. 1of 2', '-- Yasuhiro Nakasone', 'Level A', 'Haynes.', '中古品につき多少のキズ・汚れ・日焼け等はありますが、状態は良好です。', 'Book by', 'Poetry', 'Edition bilingue.', 'Erlogen von Loriot.', 'Noel Polk, editor', 'Re-creation of Landscape', 'Publishers Weekly', '《文化、權力與國家:1900—1942年的華北農村》是美國學者杜贊奇的名著。杜贊奇（Prasenjit Duara），早年就學於印度，後赴美國求學，師從著名漢學大師孔飛力，現任美國芝加哥大學歷史學系及東亞語言文明系教授。其著作除本書外，還有廣為學界選舉的《從民族國家拯救歷史》。此兩書使杜贊奇成為名聞國際的漢學家。本書是以鄉村的文化網絡為基本結構並考察其功能力，作者主要利用日本南滿鐵道株式會社調查部編撰的《中國慣行調查報告》、南開大學經濟研究所在20世紀二三十年代所做的社會調查材料，以及中外學者已有的研究成果，通過細致的個案研究，向我們展示了1900—1942年間華北農村社會的政治經濟文化的一般狀況。作者力圖打通歷史學與社會學的間隔，提出了「國家政權建設」和「權力的文化網絡」兩個中心概念。作者認為，「國家政權建設」是一種全球性現象，作為一個概念，同更古老的「資本主義」等概念一樣，具有深遠的分析性含義。「權力的文化網絡」概念則吸收了西方學術界有關文化研究的成果，反對一些現代化論者用單一社會體系或一套所謂的「中國價值觀」去理解中國的觀點，同時也反對認為價值觀點交互感應的功能主義論者的學說。最重要的一點是，作者在書中貫穿了這樣一種方法:在考慮話語—主體—制度這三者對歷史的建構時，應加入許多外來事物和偶然因素，因為參與主體和主體性構成的不僅有話語，還有外來事物；而由主體構建的制度，還應包括制度本身的邏輯性和偶然性。本書曾先後榮獲1989年度的美國歷史學會費正清獎以及1900年度的亞洲研究學會列文森獎。', 'undefined', 'One WomanOs Charlottetown', 'none', 'English and Italian', 'New', 'Bild Wissensbibliothek', 

In [21]:
# Drop records with very short description
eda_data.drop(eda_data.index[eda_data.length.isin(range(0,4))], inplace = True)
print(eda_data[eda_data.length.isin(range(0,4))].shape[0])
del eda_data["length"]

0


---
### Find if there is duplication of rows/features

In [22]:
# Find number of duplicated rows
eda_data[eda_data.duplicated()==True].shape[0]

0

In [23]:
# Find number of unique values of each feature
eda_data.nunique()

Id             34516
Name           34516
Authors        25190
ISBN           34429
Rating           263
PublishYear       91
Publisher       6095
Language          28
Description    34087
dtype: int64

- We see that some descriptions are repaeted this possibly could be due to different versions of the same book. 

In [24]:
eda_data[eda_data.duplicated(subset="Description", keep=False)][["Id","Name", "ISBN", "Language", "Description"]].copy()

Unnamed: 0,Id,Name,ISBN,Language,Description
78,1000165,Strange Empire,0873512987,,"With passion and verve, Joseph Kinsey Howard, ..."
79,1000166,Strange Empire: Louis Riel and the M?tis People,0888620594,eng,"With passion and verve, Joseph Kinsey Howard, ..."
89,1000185,Lighthouse at the End of the World: The First ...,0803246765,,"At the extreme tip of South America, Staten Is..."
90,1000188,Lighthouse at the End of the World,0803260075,eng,"At the extreme tip of South America, Staten Is..."
162,1000353,The Moravian Springplace Mission to the Cherok...,0803232667,,"In 1801 the Moravians, a Pietist German-speaki..."
...,...,...,...,...,...
39525,1099485,Communication and Citizenship: Journalism and ...,0415100674,,First published in 1993. Routledge is an impri...
39540,1099555,Monday Mourning,0743233476,eng,Alternate Cover Edition can be found hereDas B...
39542,1099557,Meurtres à La Carte,222110062X,,Alternate Cover Edition can be found hereDas B...
39550,1099575,Mopsa The Fairy,1430441771,,This scarce antiquarian book is a facsimile re...


- We see that same books have different ISBN becuase an ISBN is assigned to each separate edition and variation of a publication. For example, an e-book, a paperback and a hardcover edition of the same book will each have a different ISBN (except reprintings).

---
### Drop variants of the books
- Only keep variants where language is not null, if language is missing for all the variants then keep the first occurrence and delete rest.

In [25]:
eda_data = eda_data.sort_values(by="Language", na_position='last').drop_duplicates(subset=['Description'], keep='first');

In [26]:
eda_data.isna().sum()

Id                 0
Name               0
Authors            0
ISBN              84
Rating             0
PublishYear        0
Publisher          0
Language       27794
Description        0
dtype: int64

---
### Extract book series information from the name of the book
- Remove irrelevant info from the name of the book to improve efficiency of the tokenization.
- Book names with hashtag represent edition of the book in a series

In [65]:
series_pattern =  "(?:[;]\s*|\(\s*)([^\(;]*\s*#\s*\d+(?:\.?\d+|\\&\d+|-?\d*))"
def get_book_series_info(text):
    series_info = re.findall(series_pattern, text)
    return series_info if series_info else np.nan

eda_data['BookSeriesInfo'] = eda_data.Name.apply(get_book_series_info)

book_name_cases = ["Sire Lines, Revised Edition (Blood-Horse Classics Library)", 
                   "Patriot Games ((Blood-Horse Classics Library)) (Jack Ryan, #1; Jack Ryan Universe, #2)",
                   "Oh, Cuan Lejos Llegaras! (Oh, the Places You'll Go!", "Ranma 1/2, Vol. 28 (Ranma ½ (US 2nd), #28)", 
                   "Fairy Realm edition, #1-3 (Fairy Realm, #1-3)", "Fairy Realm edition, #1.3 (Fairy Realm, #1.3)", 
                   "The Best Catch In Texas (Men of the West, #10) (Silhouette Special Edition #1814)",
                   "Codes: How to Make Them and Break Them (Murderous Maths, # 14)",
                   "Lovers and Ladies (Lovers and Ladies, #5&6)"]

print("\033[1m{:90}\033[0m\033[1m{:5}\033[0m".format("Book Name", "Series Information"))
for name in book_name_cases:
    print("{!s:90}{!s:5}".format(name, get_book_series_info(name)))


[1mBook Name                                                                                 [0m[1mSeries Information[0m
Sire Lines, Revised Edition (Blood-Horse Classics Library)                                nan  
Patriot Games ((Blood-Horse Classics Library)) (Jack Ryan, #1; Jack Ryan Universe, #2)    ['Jack Ryan, #1', 'Jack Ryan Universe, #2']
Oh, Cuan Lejos Llegaras! (Oh, the Places You'll Go!                                       nan  
Ranma 1/2, Vol. 28 (Ranma ½ (US 2nd), #28)                                                ['US 2nd), #28']
Fairy Realm edition, #1-3 (Fairy Realm, #1-3)                                             ['Fairy Realm, #1-3']
Fairy Realm edition, #1.3 (Fairy Realm, #1.3)                                             ['Fairy Realm, #1.3']
The Best Catch In Texas (Men of the West, #10) (Silhouette Special Edition #1814)         ['Men of the West, #10', 'Silhouette Special Edition #1814']
Codes: How to Make Them and Break Them (Murderous Maths, # 14)     

- As we are using regex there will be certain exceptions, for example it misses a book name with nested brackets case. The name of the book is "Ranma 1/2, Vol. 28 (Ranma ½ (US 2nd), #28)", it should extract series information as `Ranma ½ (US 2nd), #28]`, instead, it extracts `[US 2nd), #28]`

In [58]:
eda_data[["Id","Name", "BookSeriesInfo"]][eda_data.BookSeriesInfo.isna() == False]

Unnamed: 0,Id,Name,BookSeriesInfo
34920,1087256,"Lovers and Ladies (Lovers and Ladies, #5&6)","[Lovers and Ladies, #5&6]"
37838,1094956,The Ultimate Hitchhiker's Guide to the Galaxy ...,"[Hitchhiker's Guide, #1-5]"
7619,1018819,The Stanislaski Sisters: Natasha and Rachel (S...,[Stanislaskis #1]
2414,1005974,The Scripture on the Explication of Underlying...,"[Bdk English Tripitaka Translation Series, #25-]"
4985,1012297,Princess in Pink / Project Princess (The Princ...,"[The Princess Diaries, #4.5]"


In [59]:
eda_data.isna().sum()

Id                    0
Name                  0
Authors               0
ISBN                 84
Rating                0
PublishYear           0
Publisher             0
Language          27794
Description           0
BookSeriesInfo    34082
dtype: int64

---
### Remove the series info from the name of the book

In [67]:
series_remove_pattern = re.compile("(?:[\(]\s*[^\(;]*\s*#\s*\d+(?:\.?\d+|\\&\d+|-?\d*)(?:;|\))|\s*[^\(;]*\s*#\s*\d+(?:\.?\d+|\\&\d+|-?\d*)\))")       
def remove_series_info(text):
    return re.sub(series_remove_pattern, r'', text)

print("\033[1m{:90}\033[0m\033[1m{:5}\033[0m".format("Book Name", "Series Information"))
for name in book_name_cases:
    print("{!s:90}{!s:5}".format(name, remove_series_info(name)))

eda_data["Name"]= eda_data["Name"].str.replace(series_remove_pattern, r'').str.strip()

[1mBook Name                                                                                 [0m[1mSeries Information[0m
Sire Lines, Revised Edition (Blood-Horse Classics Library)                                Sire Lines, Revised Edition (Blood-Horse Classics Library)
Patriot Games ((Blood-Horse Classics Library)) (Jack Ryan, #1; Jack Ryan Universe, #2)    Patriot Games ((Blood-Horse Classics Library)) 
Oh, Cuan Lejos Llegaras! (Oh, the Places You'll Go!                                       Oh, Cuan Lejos Llegaras! (Oh, the Places You'll Go!
Ranma 1/2, Vol. 28 (Ranma ½ (US 2nd), #28)                                                Ranma 1/2, Vol. 28 (Ranma ½ 
Fairy Realm edition, #1-3 (Fairy Realm, #1-3)                                             Fairy Realm edition, #1-3 
Fairy Realm edition, #1.3 (Fairy Realm, #1.3)                                             Fairy Realm edition, #1.3 
The Best Catch In Texas (Men of the West, #10) (Silhouette Special Edition #1814)         The

---
### Impute missing language with book title language

In [None]:
# import numpy as np
# from nltk.classify.textcat import TextCat
# tc = TextCat()
# def detect_language(text):
#     # print(text, end="\t")
#     # Guess language using first five words
#     text = " ".join(text.split()[:5])
#     if text.isnumeric():
#         # print("---NUMERIC---")
#         # print("eng")
#         return 'eng'
#     else:
#         # print(tc.guess_language(text).strip())
#         return tc.guess_language(text).strip()
# eda_data['Language'] = eda_data.apply(lambda x: detect_language(x['Name']) if pd.isna(x['Language']) else x['Language'], axis=1)