In [1]:
import pandas as pd
import os
import requests
import json
import api_keys

In [2]:
dataDir='data'

column_names=['ISBN',
             'Name',
             'Authors',
             'Description',
             'Language',
             'pagesNumber',
             'Publisher',
             'PublishYear',
             'Rating',
             'CountsOfReview']

mainData=pd.DataFrame(columns = column_names)

# Loop through each file in the data directory and load the files in a dataframe for ETL
for file in os.listdir(dataDir):
    filePath = '' + dataDir + '/' + os.fsdecode(file)
    df = pd.read_csv(filePath)
    # workaround for files without 'description' column
    if 'Description' not in df.columns:
        df['Description']="None"
    # initial stage of ETL - filter required columns
    df = df[['ISBN',
             'Name',
             'Authors',
             'Description',
             'Language',
             'pagesNumber',
             'Publisher',
             'PublishYear',
             'Rating',
             'CountsOfReview']]
    # remove non-english characters from Name and Author
    df['Name']=df['Name'].str.replace('[^a-zA-Z0-9!@#$%^&*()-+?/`~"\':; ]', '')
    df['Authors']=df['Authors'].str.replace('[^a-zA-Z0-9!@#$%^&*()-+?/`~"\':; ]', '')
    # drop rows with missing values
    df.dropna(how='any',inplace=True)
    # append CSV data to main dataframe
    mainData = mainData.append(df,ignore_index=True)

In [3]:
mainData

Unnamed: 0,ISBN,Name,Authors,Description,Language,pagesNumber,Publisher,PublishYear,Rating,CountsOfReview
0,0439358078,Harry Potter and the Order of the Phoenix (Har...,JK Rowling,,eng,870,Scholastic Inc.,2004,4.50,29770
1,0439554896,Harry Potter and the Chamber of Secrets (Harry...,JK Rowling,,eng,352,Scholastic,2003,4.42,244
2,043965548X,Harry Potter and the Prisoner of Azkaban (Harr...,JK Rowling,,eng,435,Scholastic Inc.,2004,4.57,37093
3,0439682584,Harry Potter Boxed Set Books 15 (Harry Potter ...,JK Rowling,,eng,2690,Scholastic,2004,4.78,166
4,0976540606,"Unauthorized Harry Potter Book Seven News: ""Ha...",W Frederick Zimmerman,,en-US,152,Nimble Books,2005,3.79,1
...,...,...,...,...,...,...,...,...,...,...
45740,0813311594,The Center Of The Universe: The Geopolitics Of...,Graham E Fuller,,eng,301,Westview Press,1991,3.47,0
45741,0883650975,Gloria Vanderbilt Book Of Collage,Gloria Vanderbilt,,eng,112,Galahad Books,1970,3.42,3
45742,0310295610,Who Moved the Stone?,Frank Morison,,en-US,193,Zondervan Academic,1987,3.91,50
45743,0786867280,Silent Joe,T Jefferson Parker,,eng,341,Hyperion Books,2001,3.85,124


In [4]:
# View distinct languages
mainData.Language.unique()

array(['eng', 'en-US', 'fre', 'spa', 'mul', 'en-GB', 'grc', 'enm',
       'en-CA', 'ger', 'jpn', 'ara', 'nl', 'zho', 'lat', 'por', 'srp',
       'ita', 'rus', 'msa', 'glg', 'wel', 'swe', 'nor', 'kor', 'tur',
       'gla', 'lit', 'per', 'pol', 'gle', 'cat', 'afr', 'ind', 'frs',
       'sco', 'nav', 'gre', 'urd', 'elx', '--', 'cze', 'tlh', 'ang',
       'hin', 'raj', 'nub', 'fin', 'dan', 'heb'], dtype=object)

In [8]:
# filter languages & drop duplicates
enLanguages=['en-US','eng','en-GB','en-CA']
mainData2=mainData[mainData.Language.isin(enLanguages)].copy()
# sorting will keep the records with most reviews when duplicates are dropped
mainData2.sort_values(by='CountsOfReview',ascending=0,inplace=True)
mainData2 = mainData[mainData.Language.isin(enLanguages)].drop_duplicates()
#mainData2.drop_duplicates(subset='ISBN',inplace=True)
#mainData2.drop_duplicates(subset='Name',inplace=True)
# Convert certain columns to INT
mainData2['CountsOfReview']=mainData2['CountsOfReview'].astype(int)
mainData2['pagesNumber']=mainData2['pagesNumber'].astype(int)
mainData2['PublishYear']=mainData2['PublishYear'].astype(int)
mainData2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42396 entries, 0 to 45744
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ISBN            42396 non-null  object 
 1   Name            42396 non-null  object 
 2   Authors         42396 non-null  object 
 3   Description     42396 non-null  object 
 4   Language        42396 non-null  object 
 5   pagesNumber     42396 non-null  int32  
 6   Publisher       42396 non-null  object 
 7   PublishYear     42396 non-null  int32  
 8   Rating          42396 non-null  float64
 9   CountsOfReview  42396 non-null  int32  
dtypes: float64(1), int32(3), object(6)
memory usage: 3.1+ MB


In [9]:
mainData2.sort_values(by='Name',inplace=True)

In [10]:
mainData2

Unnamed: 0,ISBN,Name,Authors,Description,Language,pagesNumber,Publisher,PublishYear,Rating,CountsOfReview
37344,4770027036,/ The Tale Of Genji: 22,Waki Yamato,,eng,160,講談社,2001,0.00,0
14774,0821224964,Propos de Paris,Henri CartierBresson,,eng,168,Bulfinch,1998,4.65,3
1653,0743470796,said the shotgun to the head,Saul Williams,,en-US,192,MTV Books,2003,4.22,215
9926,0310243564,"""A"" Is for Abductive : The Language of the Eme...",Leonard Sweet,,en-US,338,Zondervan,2002,3.14,3
21190,0446674060,"""A"" Is for Admission: The Insider's Guide to G...",Michele A Hernndez,,en-US,288,Warner Books,2009,3.75,4
...,...,...,...,...,...,...,...,...,...,...
42125,0811212386,what it means to be avantgarde,David Antin,,eng,207,New Directions Publishing Corporation,1993,4.44,5
37137,0345470583,xxxHolic Vol 1 (xxxHOLiC #1),CLAMP,,en-US,178,Del Rey,2004,4.15,352
37140,0345471199,xxxHolic Vol 2 (xxxHOLiC #2),CLAMP,,eng,208,Del Rey,2004,4.19,88
37138,0345471814,xxxHolic Vol 3 (xxxHOLiC #3),CLAMP,,eng,182,Del Rey,2004,4.22,71


In [11]:
dataCut=mainData2.head(50).reset_index()
dataCut

Unnamed: 0,index,ISBN,Name,Authors,Description,Language,pagesNumber,Publisher,PublishYear,Rating,CountsOfReview
0,37344,4770027036,/ The Tale Of Genji: 22,Waki Yamato,,eng,160,講談社,2001,0.0,0
1,14774,0821224964,Propos de Paris,Henri CartierBresson,,eng,168,Bulfinch,1998,4.65,3
2,1653,0743470796,said the shotgun to the head,Saul Williams,,en-US,192,MTV Books,2003,4.22,215
3,9926,0310243564,"""A"" Is for Abductive : The Language of the Eme...",Leonard Sweet,,en-US,338,Zondervan,2002,3.14,3
4,21190,0446674060,"""A"" Is for Admission: The Insider's Guide to G...",Michele A Hernndez,,en-US,288,Warner Books,2009,3.75,4
5,37264,0761128050,"""A"" Is for Adultery Angst and Adults Only",Sara Midda,,en-GB,64,Workman Publishing Company,2002,3.59,1
6,21196,0385471270,"""An Honorable Profession"": A Tribute to Robert...",Pierre Salinger,,eng,212,Main Street Books,1993,4.12,2
7,9079,0070183171,"""Dear Genius"": A Memoir of My Life with Truman...",Jack Dunphy,,eng,275,McGraw-Hill Companies,1987,3.33,6
8,21491,0292713428,"""Evil"" Arabs in American Popular Film: Orienta...",Tim Jon Semmerling,,eng,303,University of Texas Press,2006,4.5,0
9,22431,0590020498,"""I Can't"" Said The Ant",Polly Cameron,,eng,36,Scholastic Inc.,1948,4.1,12
