In [1]:
import pandas as pd
import os
import requests
import json
import config
import datetime

In [2]:
dataDir='data'

column_names=['ISBN',
             'Name',
             'Authors',
             'Description',
             'Language',
             'pagesNumber',
             'Publisher',
             'PublishYear',
             'Rating',
             'CountsOfReview']

mainData=pd.DataFrame(columns = column_names)

# Loop through each file in the data directory and load the files in a dataframe for ETL
for file in os.listdir(dataDir):
    filePath = '' + dataDir + '/' + os.fsdecode(file)
    df = pd.read_csv(filePath)
    # workaround for files without 'description' column
    if 'Description' not in df.columns:
        df['Description']="None"
    # initial stage of ETL - filter required columns
    df = df[['ISBN',
             'Name',
             'Authors',
             'Description',
             'Language',
             'pagesNumber',
             'Publisher',
             'PublishYear',
             'Rating',
             'CountsOfReview']]
    # remove non-english characters from Name and Author
    df['Name']=df['Name'].str.replace('[^a-zA-Z0-9!@#$%^&*()-+?/`~"\':; ]', '')
    #df['Authors']=df['Authors'].str.replace('[^a-zA-Z0-9!@#$%^&*()-+?/`~"\':; ]', '')
    # drop rows with missing values
    df.dropna(how='any',inplace=True)
    # append CSV data to main dataframe
    mainData = mainData.append(df,ignore_index=True)

In [3]:
mainData

Unnamed: 0,ISBN,Name,Authors,Description,Language,pagesNumber,Publisher,PublishYear,Rating,CountsOfReview
0,0439358078,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,,eng,870,Scholastic Inc.,2004,4.50,29770
1,0439554896,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,,eng,352,Scholastic,2003,4.42,244
2,043965548X,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling,,eng,435,Scholastic Inc.,2004,4.57,37093
3,0439682584,Harry Potter Boxed Set Books 15 (Harry Potter ...,J.K. Rowling,,eng,2690,Scholastic,2004,4.78,166
4,0976540606,"Unauthorized Harry Potter Book Seven News: ""Ha...",W. Frederick Zimmerman,,en-US,152,Nimble Books,2005,3.79,1
...,...,...,...,...,...,...,...,...,...,...
45740,0813311594,The Center Of The Universe: The Geopolitics Of...,Graham E. Fuller,,eng,301,Westview Press,1991,3.47,0
45741,0883650975,Gloria Vanderbilt Book Of Collage,Gloria Vanderbilt,,eng,112,Galahad Books,1970,3.42,3
45742,0310295610,Who Moved the Stone?,Frank Morison,,en-US,193,Zondervan Academic,1987,3.91,50
45743,0786867280,Silent Joe,T. Jefferson Parker,,eng,341,Hyperion Books,2001,3.85,124


In [4]:
# View distinct languages
mainData.Language.unique()

array(['eng', 'en-US', 'fre', 'spa', 'mul', 'en-GB', 'grc', 'enm',
       'en-CA', 'ger', 'jpn', 'ara', 'nl', 'zho', 'lat', 'por', 'srp',
       'ita', 'rus', 'msa', 'glg', 'wel', 'swe', 'nor', 'kor', 'tur',
       'gla', 'lit', 'per', 'pol', 'gle', 'cat', 'afr', 'ind', 'frs',
       'sco', 'nav', 'gre', 'urd', 'elx', '--', 'cze', 'tlh', 'ang',
       'hin', 'raj', 'nub', 'fin', 'dan', 'heb'], dtype=object)

In [5]:
# filter languages & drop duplicates
enLanguages=['en-US','eng','en-GB','en-CA']
mainData2=mainData[mainData.Language.isin(enLanguages)].copy()
# sorting will keep the records with most reviews when duplicates are dropped
mainData2.sort_values(by='CountsOfReview',ascending=0,inplace=True)
mainData2 = mainData[mainData.Language.isin(enLanguages)].drop_duplicates()
mainData2.drop_duplicates(subset='ISBN',inplace=True)
mainData2.drop_duplicates(subset='Name',inplace=True)
# Convert certain columns to INT
mainData2['CountsOfReview']=mainData2['CountsOfReview'].astype(int)
mainData2['pagesNumber']=mainData2['pagesNumber'].astype(int)
mainData2['PublishYear']=mainData2['PublishYear'].astype(int)
mainData2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41154 entries, 0 to 45744
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ISBN            41154 non-null  object 
 1   Name            41154 non-null  object 
 2   Authors         41154 non-null  object 
 3   Description     41154 non-null  object 
 4   Language        41154 non-null  object 
 5   pagesNumber     41154 non-null  int32  
 6   Publisher       41154 non-null  object 
 7   PublishYear     41154 non-null  int32  
 8   Rating          41154 non-null  float64
 9   CountsOfReview  41154 non-null  int32  
dtypes: float64(1), int32(3), object(6)
memory usage: 3.0+ MB


In [6]:
mainData2.sort_values(by='Name',inplace=True)

In [7]:
maxData = config.maximum_data
dataCut=mainData2.head(maxData).reset_index(drop=True)

In [8]:
# initialise DF's
categoryDF = {"category_id":[],
             "category_name":[]}

isbn_categoryDF = {"isbn_no":[],
                   "category_id":[]}

authorDF = {"author_id":[],
            "author_name":[]}

isbn_authorDF = {"isbn_no":[],
                 "author_id":[]}

print_typeDF = {"print_type_id":[],
                "print_type":[]}

googlebooks_dataDF= {"isbn_no":[],
                     "print_type_id":[],
                     "retail_price":[]}
# get a list of ISBNs
isbn = dataCut['ISBN']

In [9]:
# initialise ID's
category_id = 0
author_id = 0
print_type_id = 0

# initialise counters
prc_cntr=0
record=0
recs_fetched=1
set_no = 1

# create URL
url=f'https://www.googleapis.com/books/v1/volumes?key={config.g_key}&q=isbn:'

# record runtime
startTime = datetime.datetime.now().strftime('%d/%m/%y %H:%M:%S')

# loop through ISBNs and do a googlebooks API call
for i in isbn:

    # GET the API data
    response = requests.get(f"{url}{i}").json()
    prc_cntr += 1
    prcnt=round((prc_cntr/maxData)*100,0)

    # if response returns data then process the data
    if response['totalItems'] != 0:
        
        # initialise authors list
        authors=[]
        
        print(f"RECORD {prc_cntr}: {prcnt}% - Processing ISBN No. {i}")
        
        # get author data
        try:
            authors=response['items'][0]['volumeInfo']['authors']
        except (KeyError, IndexError):
            authors.append(dataCut.loc[dataCut['ISBN'] == i]["Authors"].iloc[0])
        
        # get print_type data
        print_type=response['items'][0]['volumeInfo']['printType']
        
        # get categories data
        try:
            categories=response['items'][0]['volumeInfo']['categories']
        except (KeyError, IndexError):
            categories=[]
        
        # get list price data
        try:
            listPrice=response['items'][0]['saleInfo']['listPrice']['amount']
        except (KeyError, IndexError):
            listPrice=0.00   
        
        # load categories data in objects
        if len(categories) > 0:
            for c in categories:
                cCaps = c.upper()
                if cCaps not in categoryDF['category_name']: 
                    category_id += 1
                    categoryDF['category_id'].append(category_id)
                    categoryDF['category_name'].append(cCaps)
                    finalCatId = category_id
                else: 
                    finalCatId = categoryDF['category_id'][categoryDF['category_name'].index(cCaps)]

                isbn_categoryDF['isbn_no'].append(i)
                isbn_categoryDF['category_id'].append(finalCatId)
        
        # load authors data in objects
        for a in authors:
            aCaps = a.upper()
            if aCaps not in authorDF['author_name']: 
                author_id += 1
                authorDF['author_id'].append(author_id)
                authorDF['author_name'].append(aCaps)
                finalAuthId = author_id
            else: 
                finalAuthId = authorDF['author_id'][authorDF['author_name'].index(aCaps)]

            isbn_authorDF['isbn_no'].append(i)
            isbn_authorDF['author_id'].append(finalAuthId)
        
        # load print type data
        if print_type not in print_typeDF['print_type']:
            ptCaps = print_type.upper()
            print_type_id += 1
            print_typeDF['print_type_id'].append(print_type_id)
            print_typeDF['print_type'].append(ptCaps)
            finalPrintId = print_type_id
        else:
            finalPrintId = print_typeDF['print_type_id'][print_typeDF['print_type'].index(ptCaps)]
        
        # load google books data
        googlebooks_dataDF['isbn_no'].append(i)
        googlebooks_dataDF['print_type_id'].append(finalPrintId)
        googlebooks_dataDF['retail_price'].append(listPrice)
        
    else:
        # skip if ISBN is not found
        print(f"RECORD {prc_cntr}: {prcnt}% - ISBN not found. Skipping...")

endTime = datetime.datetime.now().strftime('%d/%m/%y %H:%M:%S')

# record start and completion time
print(f"START TIME:     {startTime} \nCOMPLETION TIME: {endTime}")

RECORD 1: 2.0% - Processing ISBN No. 4770027036
RECORD 2: 4.0% - Processing ISBN No. 0821224964
RECORD 3: 6.0% - Processing ISBN No. 0743470796
RECORD 4: 8.0% - Processing ISBN No. 0310243564
RECORD 5: 10.0% - ISBN not found. Skipping...
RECORD 6: 12.0% - Processing ISBN No. 0761128050
RECORD 7: 14.0% - Processing ISBN No. 0385471270
RECORD 8: 16.0% - Processing ISBN No. 0070183171
RECORD 9: 18.0% - Processing ISBN No. 0292713428
RECORD 10: 20.0% - Processing ISBN No. 0590020498
RECORD 11: 22.0% - ISBN not found. Skipping...
RECORD 12: 24.0% - ISBN not found. Skipping...
RECORD 13: 26.0% - Processing ISBN No. 0226284352
RECORD 14: 28.0% - Processing ISBN No. 0688093388
RECORD 15: 30.0% - Processing ISBN No. 0813523168
RECORD 16: 32.0% - Processing ISBN No. 0813521378
RECORD 17: 34.0% - ISBN not found. Skipping...
RECORD 18: 36.0% - ISBN not found. Skipping...
RECORD 19: 38.0% - Processing ISBN No. 0818403128
RECORD 20: 40.0% - ISBN not found. Skipping...
RECORD 21: 42.0% - Processing I

In [10]:
# Convert directory of lists to DataFrames
categoryDF=pd.DataFrame(categoryDF)
isbn_categoryDF=pd.DataFrame(isbn_categoryDF)
authorDF=pd.DataFrame(authorDF)
isbn_authorDF=pd.DataFrame(isbn_authorDF)
print_typeDF=pd.DataFrame(print_typeDF)
googlebooks_dataDF=pd.DataFrame(googlebooks_dataDF)