In [112]:
import pandas as pd
import os
import requests
import json
import api_keys

In [130]:
dataDir='data'

column_names=['ISBN',
             'Name',
             'Authors',
             'Description',
             'Language',
             'pagesNumber',
             'Publisher',
             'PublishYear',
             'Rating',
             'CountsOfReview']

mainData=pd.DataFrame(columns = column_names)

# Loop through each file in the data directory and load the files in a dataframe for ETL
for file in os.listdir(dataDir):
    filePath = '' + dataDir + '/' + os.fsdecode(file)
    df = pd.read_csv(filePath)
    # workaround for files without 'description' column
    if 'Description' not in df.columns:
        df['Description']="None"
    # initial stage of ETL - filter required columns
    df = df[['ISBN',
             'Name',
             'Authors',
             'Description',
             'Language',
             'pagesNumber',
             'Publisher',
             'PublishYear',
             'Rating',
             'CountsOfReview']]
    # remove non-english characters from Name and Author
    df['Name']=df['Name'].str.replace('[^a-zA-Z0-9!@#$%^&*()-+?/`~"\':; ]', '')
    df['Authors']=df['Authors'].str.replace('[^a-zA-Z0-9!@#$%^&*()-+?/`~"\':; ]', '')
    # drop rows with missing values
    df.dropna(how='any',inplace=True)
    # append CSV data to main dataframe
    mainData = mainData.append(df,ignore_index=True)

In [131]:
mainData

Unnamed: 0,ISBN,Name,Authors,Description,Language,pagesNumber,Publisher,PublishYear,Rating,CountsOfReview
0,0439358078,Harry Potter and the Order of the Phoenix (Har...,JK Rowling,,eng,870,Scholastic Inc.,2004,4.50,29770
1,0439554896,Harry Potter and the Chamber of Secrets (Harry...,JK Rowling,,eng,352,Scholastic,2003,4.42,244
2,043965548X,Harry Potter and the Prisoner of Azkaban (Harr...,JK Rowling,,eng,435,Scholastic Inc.,2004,4.57,37093
3,0439682584,Harry Potter Boxed Set Books 15 (Harry Potter ...,JK Rowling,,eng,2690,Scholastic,2004,4.78,166
4,0976540606,"Unauthorized Harry Potter Book Seven News: ""Ha...",W Frederick Zimmerman,,en-US,152,Nimble Books,2005,3.79,1
...,...,...,...,...,...,...,...,...,...,...
66029,0761989269,Introduction to Museum Work,G Ellis Burcaw,,en-GB,240,AltaMira Press,1975,3.03,3
66030,0679723420,Pale Fire,Vladimir Nabokov,,eng,315,Vintage,1989,4.15,353
66031,0689874464,Amelia's Book of Notes & Note Passing (Amelia'...,Marissa Moss,,eng,80,Simon & Schuster Books for Young Readers,2006,3.87,11
66032,0934052379,Passing It On,Yuri Kochiyama,,eng,223,UCLA Asian American Studies Center Press,2004,4.14,8


In [135]:
# View distinct languages
mainData.Language.unique()

array(['eng', 'en-US', 'fre', 'spa', 'mul', 'en-GB', 'grc', 'enm',
       'en-CA', 'ger', 'jpn', 'ara', 'nl', 'zho', 'lat', 'por', 'srp',
       'ita', 'rus', 'msa', 'glg', 'wel', 'swe', 'nor', 'kor', 'tur',
       'gla', 'lit', 'per', 'pol', 'gle', 'cat', 'afr', 'ind', 'frs',
       'sco', 'nav', 'gre', 'urd', 'elx', '--', 'cze', 'tlh', 'ang',
       'hin', 'raj', 'nub', 'fin', 'dan', 'heb', 'ypk', 'lao', 'hye',
       'fil', 'frm', 'tgl', 'wak'], dtype=object)

In [145]:
# filter languages & drop duplicates
enLanguages=['en-US','eng','en-GB','en-CA']
# sorting will keep the records with most reviews when duplicates are dropped
mainData2.sort_values(by='CountsOfReview',ascending=0,inplace=True)
mainData2 = mainData[mainData.Language.isin(enLanguages)].drop_duplicates()
#mainData2.drop_duplicates(subset='ISBN',inplace=True)
#mainData2.drop_duplicates(subset='Name',inplace=True)
# Convert certain columns to INT
mainData2['CountsOfReview']=mainData2['CountsOfReview'].astype(int)
mainData2['pagesNumber']=mainData2['pagesNumber'].astype(int)
mainData2['PublishYear']=mainData2['PublishYear'].astype(int)
mainData2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61048 entries, 0 to 66033
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ISBN            61048 non-null  object 
 1   Name            61048 non-null  object 
 2   Authors         61048 non-null  object 
 3   Description     61048 non-null  object 
 4   Language        61048 non-null  object 
 5   pagesNumber     61048 non-null  int32  
 6   Publisher       61048 non-null  object 
 7   PublishYear     61048 non-null  int32  
 8   Rating          61048 non-null  float64
 9   CountsOfReview  61048 non-null  int32  
dtypes: float64(1), int32(3), object(6)
memory usage: 4.4+ MB


In [146]:
mainData2.sort_values(by='Name',inplace=True)

In [147]:
mainData2

Unnamed: 0,ISBN,Name,Authors,Description,Language,pagesNumber,Publisher,PublishYear,Rating,CountsOfReview
37344,4770027036,/ The Tale Of Genji: 22,Waki Yamato,,eng,160,講談社,2001,0.00,0
14774,0821224964,Propos de Paris,Henri CartierBresson,,eng,168,Bulfinch,1998,4.65,3
1653,0743470796,said the shotgun to the head,Saul Williams,,en-US,192,MTV Books,2003,4.22,215
53413,0060541644,"""A Problem from Hell"": America and the Age of ...",Samantha Power,,eng,620,Basic Books,2003,4.23,447
9926,0310243564,"""A"" Is for Abductive : The Language of the Eme...",Leonard Sweet,,en-US,338,Zondervan,2002,3.14,3
...,...,...,...,...,...,...,...,...,...,...
49849,0131495054,xUnit Test Patterns: Refactoring Test Code,Gerard Meszaros,,eng,883,Addison-Wesley Professional,2007,3.94,43
37137,0345470583,xxxHolic Vol 1 (xxxHOLiC #1),CLAMP,,en-US,178,Del Rey,2004,4.15,352
37140,0345471199,xxxHolic Vol 2 (xxxHOLiC #2),CLAMP,,eng,208,Del Rey,2004,4.19,88
37138,0345471814,xxxHolic Vol 3 (xxxHOLiC #3),CLAMP,,eng,182,Del Rey,2004,4.22,71


In [144]:
dataCut=mainData2.head(50).reset_index()
dataCut

Unnamed: 0,index,ISBN,Name,Authors,Description,Language,pagesNumber,Publisher,PublishYear,Rating,CountsOfReview
0,37344,4770027036,/ The Tale Of Genji: 22,Waki Yamato,,eng,160,講談社,2001,0.0,0
1,14774,821224964,Propos de Paris,Henri CartierBresson,,eng,168,Bulfinch,1998,4.65,3
2,1653,743470796,said the shotgun to the head,Saul Williams,,en-US,192,MTV Books,2003,4.22,215
3,53413,60541644,"""A Problem from Hell"": America and the Age of ...",Samantha Power,,eng,620,Basic Books,2003,4.23,447
4,9926,310243564,"""A"" Is for Abductive : The Language of the Eme...",Leonard Sweet,,en-US,338,Zondervan,2002,3.14,3
5,21190,446674060,"""A"" Is for Admission: The Insider's Guide to G...",Michele A Hernndez,,en-US,288,Warner Books,2009,3.75,4
6,37264,761128050,"""A"" Is for Adultery Angst and Adults Only",Sara Midda,,en-GB,64,Workman Publishing Company,2002,3.59,1
7,48896,60932333,"""A"" Is for Attitude : An Alphabet for Living",Patricia RussellMcCloud,,en-US,256,Harper Perennial,2002,3.33,0
8,21196,385471270,"""An Honorable Profession"": A Tribute to Robert...",Pierre Salinger,,eng,212,Main Street Books,1993,4.12,2
9,9079,70183171,"""Dear Genius"": A Memoir of My Life with Truman...",Jack Dunphy,,eng,275,McGraw-Hill Companies,1987,3.33,6
