In [1]:
import pandas as pd
import os
import requests
import json
import config
import datetime

In [2]:
dataDir='Kaggle_csvData'

column_names=['ISBN',
             'Name',
             'Authors',
             'Description',
             'Language',
             'pagesNumber',
             'Publisher',
             'PublishYear',
             'Rating',
             'CountsOfReview']

mainData=pd.DataFrame(columns = column_names)

# Loop through each file in the data directory and load the files in a dataframe for ETL
for file in os.listdir(dataDir):
    filePath = '' + dataDir + '/' + os.fsdecode(file)
    df = pd.read_csv(filePath)
    # workaround for files without 'description' column
    if 'Description' not in df.columns:
        df['Description']="None"
    # initial stage of ETL - filter required columns
    df = df[['ISBN',
             'Name',
             'Authors',
             'Description',
             'Language',
             'pagesNumber',
             'Publisher',
             'PublishYear',
             'Rating',
             'CountsOfReview']]
    # remove non-english characters from Name and Author
    df['Name']=df['Name'].str.replace('[^a-zA-Z0-9!@#$%^&*()-+?/`~"\':; ]', '')
    #df['Authors']=df['Authors'].str.replace('[^a-zA-Z0-9!@#$%^&*()-+?/`~"\':; ]', '')
    # drop rows with missing values
    df.dropna(how='any',inplace=True)
    # append CSV data to main dataframe
    mainData = mainData.append(df,ignore_index=True)

In [3]:
# View distinct languages
mainData.Language.unique()

array(['eng', 'en-US', 'fre', 'spa', 'mul', 'en-GB', 'grc', 'enm',
       'en-CA', 'ger', 'jpn', 'ara', 'nl', 'zho', 'lat', 'por', 'srp',
       'ita', 'rus', 'msa', 'glg', 'wel', 'swe', 'nor', 'kor', 'tur',
       'gla', 'lit', 'per', 'pol', 'gle', 'cat', 'afr', 'ind', 'frs',
       'sco', 'nav', 'gre', 'urd', 'elx'], dtype=object)

In [4]:
# filter languages & drop duplicates
enLanguages=['en-US','eng','en-GB','en-CA']
mainData2=mainData[mainData.Language.isin(enLanguages)].copy()
# sorting will keep the records with most reviews when duplicates are dropped
mainData2.sort_values(by='CountsOfReview',ascending=0,inplace=True)
mainData2 = mainData[mainData.Language.isin(enLanguages)].drop_duplicates()
mainData2.drop_duplicates(subset='ISBN',inplace=True)
mainData2.drop_duplicates(subset='Name',inplace=True)
# Convert certain columns to INT
mainData2['CountsOfReview']=mainData2['CountsOfReview'].astype(int)
mainData2['pagesNumber']=mainData2['pagesNumber'].astype(int)
mainData2['PublishYear']=mainData2['PublishYear'].astype(int)
mainData2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30454 entries, 0 to 33544
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ISBN            30454 non-null  object 
 1   Name            30454 non-null  object 
 2   Authors         30454 non-null  object 
 3   Description     30454 non-null  object 
 4   Language        30454 non-null  object 
 5   pagesNumber     30454 non-null  int32  
 6   Publisher       30454 non-null  object 
 7   PublishYear     30454 non-null  int32  
 8   Rating          30454 non-null  float64
 9   CountsOfReview  30454 non-null  int32  
dtypes: float64(1), int32(3), object(6)
memory usage: 2.2+ MB


In [None]:
# View Statistical overview for numerical columns
mainData2.describe()

In [None]:
# Investgate why pagesNumber has minumim value of 0. (Mostly they seem to be Audiobooks) 
low_page_data= mainData2.loc[(mainData2["pagesNumber"] >= 0) & (mainData2["pagesNumber"] <= 5)]
low_page_data

In [5]:
mainData2.sort_values(by='Name',inplace=True)

In [6]:
maxData = config.maximum_data
dataCut=mainData2.head(maxData).reset_index(drop=True)
dataCut

Unnamed: 0,ISBN,Name,Authors,Description,Language,pagesNumber,Publisher,PublishYear,Rating,CountsOfReview
0,0821224964,Propos de Paris,Henri Cartier-Bresson,,eng,168,Bulfinch,1998,4.65,3
1,0743470796,said the shotgun to the head,Saul Williams,,en-US,192,MTV Books,2003,4.22,215
2,0310243564,"""A"" Is for Abductive : The Language of the Eme...",Leonard Sweet,,en-US,338,Zondervan,2002,3.14,3
3,0446674060,"""A"" Is for Admission: The Insider's Guide to G...",Michele A. Hernández,,en-US,288,Warner Books,2009,3.75,4
4,0385471270,"""An Honorable Profession"": A Tribute to Robert...",Pierre Salinger,,eng,212,Main Street Books,1993,4.12,2
5,0070183171,"""Dear Genius"": A Memoir of My Life with Truman...",Jack Dunphy,,eng,275,McGraw-Hill Companies,1987,3.33,6
6,0292713428,"""Evil"" Arabs in American Popular Film: Orienta...",Tim Jon Semmerling,,eng,303,University of Texas Press,2006,4.5,0
7,0590020498,"""I Can't"" Said The Ant",Polly Cameron,,eng,36,Scholastic Inc.,1948,4.1,12
8,0740715267,"""My Teenage Son's Goal In Life Is To Make Me F...",Dave Barry,,eng,112,Andrews McMeel Publishing,2001,3.64,10
9,0688093388,"""Stand Back"" Said the Elephant ""I'm Going to S...",Patricia Thomas,,eng,32,"William Morrow & Company, Inc.",1990,4.39,134


In [7]:
# initialise DF's
categoryDF = {"category_id":[],
             "category_name":[]}

isbn_categoryDF = {"isbn_no":[],
                   "category_id":[]}

authorDF = {"author_id":[],
            "author_name":[]}

isbn_authorDF = {"isbn_no":[],
                 "author_id":[]}

print_typeDF = {"print_type_id":[],
                "print_type":[]}

googlebooks_dataDF= {"isbn_no":[],
                     "print_type_id":[],
                     "retail_price":[]}
# get a list of ISBNs
isbn = dataCut['ISBN']

In [8]:
# initialise ID's
category_id = 0
author_id = 0
print_type_id = 0

# initialise counters
prc_cntr=0
record=0
recs_fetched=1
set_no = 1

# create URL
url=f'https://www.googleapis.com/books/v1/volumes?key={config.g_key}&q=isbn:'

# record runtime
startTime = datetime.datetime.now().strftime('%d/%m/%y %H:%M:%S')

# loop through ISBNs and do a googlebooks API call
for i in isbn:

    # GET the API data
    response = requests.get(f"{url}{i}").json()
    prc_cntr += 1
    prcnt=round((prc_cntr/maxData)*100,0)

    # if response returns data then process the data
    if response['totalItems'] != 0:
        
        # initialise authors list
        authors=[]
        
        print(f"RECORD {prc_cntr}: {prcnt}% - Processing ISBN No. {i}")
        
        # get author data
        try:
            authors=response['items'][0]['volumeInfo']['authors']
        except (KeyError, IndexError):
            authors.append(dataCut.loc[dataCut['ISBN'] == i]["Authors"].iloc[0])
        
        # get print_type data
        print_type=response['items'][0]['volumeInfo']['printType']
        
        # get categories data
        try:
            categories=response['items'][0]['volumeInfo']['categories']
        except (KeyError, IndexError):
            categories=[]
        
        # get list price data
        try:
            listPrice=response['items'][0]['saleInfo']['listPrice']['amount']
        except (KeyError, IndexError):
            listPrice=0.00   
        
        # load categories data in objects
        if len(categories) > 0:
            for c in categories:
                cCaps = c.upper()
                if cCaps not in categoryDF['category_name']: 
                    category_id += 1
                    categoryDF['category_id'].append(category_id)
                    categoryDF['category_name'].append(cCaps)
                    finalCatId = category_id
                else: 
                    finalCatId = categoryDF['category_id'][categoryDF['category_name'].index(cCaps)]

                isbn_categoryDF['isbn_no'].append(i)
                isbn_categoryDF['category_id'].append(finalCatId)
        
        # load authors data in objects
        for a in authors:
            aCaps = a.upper()
            if aCaps not in authorDF['author_name']: 
                author_id += 1
                authorDF['author_id'].append(author_id)
                authorDF['author_name'].append(aCaps)
                finalAuthId = author_id
            else: 
                finalAuthId = authorDF['author_id'][authorDF['author_name'].index(aCaps)]

            isbn_authorDF['isbn_no'].append(i)
            isbn_authorDF['author_id'].append(finalAuthId)
        
        # load print type data
        if print_type not in print_typeDF['print_type']:
            ptCaps = print_type.upper()
            print_type_id += 1
            print_typeDF['print_type_id'].append(print_type_id)
            print_typeDF['print_type'].append(ptCaps)
            finalPrintId = print_type_id
        else:
            finalPrintId = print_typeDF['print_type_id'][print_typeDF['print_type'].index(ptCaps)]
        
        # load google books data
        googlebooks_dataDF['isbn_no'].append(i)
        googlebooks_dataDF['print_type_id'].append(finalPrintId)
        googlebooks_dataDF['retail_price'].append(listPrice)
        
    else:
        # skip if ISBN is not found
        print(f"RECORD {prc_cntr}: {prcnt}% - ISBN not found. Skipping...")

endTime = datetime.datetime.now().strftime('%d/%m/%y %H:%M:%S')

# record start and completion time
print(f"START TIME:     {startTime} \nCOMPLETION TIME: {endTime}")

RECORD 1: 2.0% - Processing ISBN No. 0821224964
RECORD 2: 4.0% - Processing ISBN No. 0743470796
RECORD 3: 6.0% - Processing ISBN No. 0310243564
RECORD 4: 8.0% - ISBN not found. Skipping...
RECORD 5: 10.0% - Processing ISBN No. 0385471270
RECORD 6: 12.0% - Processing ISBN No. 0070183171
RECORD 7: 14.0% - Processing ISBN No. 0292713428
RECORD 8: 16.0% - Processing ISBN No. 0590020498
RECORD 9: 18.0% - Processing ISBN No. 0740715267
RECORD 10: 20.0% - Processing ISBN No. 0688093388
RECORD 11: 22.0% - Processing ISBN No. 0813523168
RECORD 12: 24.0% - ISBN not found. Skipping...
RECORD 13: 26.0% - Processing ISBN No. 0818403128
RECORD 14: 28.0% - ISBN not found. Skipping...
RECORD 15: 30.0% - Processing ISBN No. 0807735663
RECORD 16: 32.0% - Processing ISBN No. 1592000673
RECORD 17: 34.0% - Processing ISBN No. 0340677570
RECORD 18: 36.0% - Processing ISBN No. 0719059569
RECORD 19: 38.0% - ISBN not found. Skipping...
RECORD 20: 40.0% - Processing ISBN No. 0060539763
RECORD 21: 42.0% - Proces

In [9]:
# Convert directory of lists to DataFrames
categoryDF=pd.DataFrame(categoryDF)
isbn_categoryDF=pd.DataFrame(isbn_categoryDF)
authorDF=pd.DataFrame(authorDF)
isbn_authorDF=pd.DataFrame(isbn_authorDF)
print_typeDF=pd.DataFrame(print_typeDF)
googlebooks_dataDF=pd.DataFrame(googlebooks_dataDF)