In [None]:
import pandas as pd
import os
import requests
import json
import config
import datetime

In [None]:
dataDir='data'

# Columns kept from every CSV file; also the schema of the combined frame.
column_names=['ISBN',
             'Name',
             'Authors',
             'Description',
             'Language',
             'pagesNumber',
             'Publisher',
             'PublishYear',
             'Rating',
             'CountsOfReview']

# Characters that may remain in a book name; everything else is stripped.
# The hyphen is escaped so it is kept as a literal character instead of
# forming the character range ')' .. '+'.
name_strip_pattern = r'[^a-zA-Z0-9!@#$%^&*()\-+?/`~"\':; ]'

# Cleaned per-file frames, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0).
frames = []

# Loop through each CSV file in the data directory and load it for ETL.
# sorted() keeps the row order (and therefore later de-duplication) deterministic.
for file in sorted(os.listdir(dataDir)):
    # skip anything that is not a CSV file (checkpoint folders, OS metadata, ...)
    if not file.lower().endswith('.csv'):
        continue
    filePath = os.path.join(dataDir, file)
    df = pd.read_csv(filePath)
    # workaround for files without 'description' column
    if 'Description' not in df.columns:
        df['Description']="None"
    # initial stage of ETL - filter required columns
    # (.copy() so the assignment below does not write into a slice)
    df = df[column_names].copy()
    # remove non-english characters from Name; regex=True is required because
    # pandas >= 2.0 treats the pattern as a literal string by default
    df['Name']=df['Name'].str.replace(name_strip_pattern, '', regex=True)
    # drop rows with missing values
    df = df.dropna(how='any')
    frames.append(df)

# combine all CSV data into the main dataframe (empty frame if no file was found)
if frames:
    mainData = pd.concat(frames, ignore_index=True)
else:
    mainData = pd.DataFrame(columns = column_names)

In [None]:
# Display the combined frame built from all CSV files.
mainData

In [None]:
# List every distinct value found in the Language column.
mainData['Language'].unique()

In [None]:
# Keep English-language books only, then drop duplicate ISBNs and titles.
enLanguages=['en-US','eng','en-GB','en-CA']

# The steps are chained on a single frame so the sort is not thrown away before
# the duplicates are dropped:
#   1. astype(int) first, so CountsOfReview is sorted numerically
#      (astype also returns a copy, so nothing writes into a slice of mainData);
#   2. a stable descending sort puts the most-reviewed record of each book first;
#   3. drop_duplicates keeps the first occurrence, i.e. the most-reviewed one.
#      (A separate full-row de-duplication is unnecessary: identical rows share an ISBN.)
mainData2 = (
    mainData[mainData.Language.isin(enLanguages)]
    .astype({'CountsOfReview': int, 'pagesNumber': int, 'PublishYear': int})
    .sort_values(by='CountsOfReview', ascending=False, kind='mergesort')
    .drop_duplicates(subset='ISBN')
    .drop_duplicates(subset='Name')
)
mainData2.info()

In [None]:
# View statistical overview (count, mean, min, max, quartiles) for the numerical columns
mainData2.describe()

In [None]:
# Investigate why pagesNumber has a minimum value of 0 (mostly these seem to be audiobooks).
# Select every book whose page count lies between 0 and 5, both ends included.
page_counts = mainData2["pagesNumber"]
low_page_data = mainData2.loc[page_counts.between(0, 5)]
low_page_data

In [None]:
# Order the books alphabetically by title.
mainData2 = mainData2.sort_values(by='Name')

In [None]:
# Limit the working set to the first `maximum_data` books (value read from the
# config module) and renumber the rows from 0.
maxData = config.maximum_data
dataCut = mainData2.head(maxData)
dataCut = dataCut.reset_index(drop=True)
dataCut

In [None]:
# Column-oriented accumulators (dicts of lists) filled by the API loop and
# turned into DataFrames afterwards. Every key gets its own fresh list.

# lookup table: category id -> category name
categoryDF = dict(category_id=[], category_name=[])

# link table: ISBN -> category id
isbn_categoryDF = dict(isbn_no=[], category_id=[])

# lookup table: author id -> author name
authorDF = dict(author_id=[], author_name=[])

# link table: ISBN -> author id
isbn_authorDF = dict(isbn_no=[], author_id=[])

# lookup table: print type id -> print type
print_typeDF = dict(print_type_id=[], print_type=[])

# per-book data: print type id and retail price
googlebooks_dataDF = dict(isbn_no=[], print_type_id=[], retail_price=[])

# the ISBNs to look up
isbn = dataCut['ISBN']

In [None]:
def _get_or_create_id(name, id_by_name, table, id_col, name_col):
    """Return the numeric ID of ``name``, registering it on first sight.

    ``id_by_name`` is a name -> id dict mirroring ``table`` (a dict of lists
    with the columns ``id_col`` and ``name_col``) so lookups are O(1) instead
    of scanning the lists. New IDs continue the 1-based sequence.
    """
    if name not in id_by_name:
        id_by_name[name] = len(id_by_name) + 1
        table[id_col].append(id_by_name[name])
        table[name_col].append(name)
    return id_by_name[name]


# name -> id lookups mirroring the list-based tables
category_ids = dict(zip(categoryDF['category_name'], categoryDF['category_id']))
author_ids = dict(zip(authorDF['author_name'], authorDF['author_id']))
print_type_ids = dict(zip(print_typeDF['print_type'], print_typeDF['print_type_id']))

# initialise counters
prc_cntr=0
# progress is measured against the real number of ISBNs, which can be smaller
# than the configured maximum
total_isbns = len(isbn)

# create URL
url=f'https://www.googleapis.com/books/v1/volumes?key={config.g_key}&q=isbn:'

# record runtime
startTime = datetime.datetime.now().strftime('%d/%m/%y %H:%M:%S')

# loop through ISBNs and do a googlebooks API call
for i in isbn:

    prc_cntr += 1
    prcnt=round((prc_cntr/total_isbns)*100,0)

    # GET the API data; one failed request must not abort the whole run
    try:
        response = requests.get(f"{url}{i}", timeout=30).json()
    except (requests.exceptions.RequestException, ValueError) as err:
        # only the exception type is printed: the message may contain the
        # request URL, which includes the API key
        print(f"RECORD {prc_cntr}: {prcnt}% - Request failed ({type(err).__name__}) for ISBN No. {i}. Skipping...")
        continue

    items = response.get('items') or []

    # skip if the response carries no volume
    if not items:
        # NOTE(review): an 'error' key is assumed to mark an API-level failure
        # (e.g. quota or key problems) rather than an unknown ISBN - confirm
        if 'error' in response:
            print(f"RECORD {prc_cntr}: {prcnt}% - API returned an error for ISBN No. {i}. Skipping...")
        else:
            print(f"RECORD {prc_cntr}: {prcnt}% - ISBN not found. Skipping...")
        continue

    print(f"RECORD {prc_cntr}: {prcnt}% - Processing ISBN No. {i}")

    # only the first returned volume is used
    volume = items[0]
    volume_info = volume.get('volumeInfo', {})

    # get author data; fall back to the author stored in dataCut for this ISBN
    if 'authors' in volume_info:
        authors = volume_info['authors']
    else:
        authors = [dataCut.loc[dataCut['ISBN'] == i]["Authors"].iloc[0]]

    # get print_type data ('UNKNOWN' keeps the record when the field is absent)
    print_type = volume_info.get('printType', 'UNKNOWN')

    # get categories data (may be missing)
    categories = volume_info.get('categories', [])

    # get list price data (0.00 when the volume has no list price)
    listPrice = volume.get('saleInfo', {}).get('listPrice', {}).get('amount', 0.00)

    # load categories data in objects
    for c in categories:
        finalCatId = _get_or_create_id(c.upper(), category_ids, categoryDF,
                                       'category_id', 'category_name')
        isbn_categoryDF['isbn_no'].append(i)
        isbn_categoryDF['category_id'].append(finalCatId)

    # load authors data in objects
    for a in authors:
        finalAuthId = _get_or_create_id(a.upper(), author_ids, authorDF,
                                        'author_id', 'author_name')
        isbn_authorDF['isbn_no'].append(i)
        isbn_authorDF['author_id'].append(finalAuthId)

    # load print type data; the upper-cased value is computed for every record
    # so both the membership test and the ID lookup use the current print type
    finalPrintId = _get_or_create_id(print_type.upper(), print_type_ids, print_typeDF,
                                     'print_type_id', 'print_type')

    # load google books data
    googlebooks_dataDF['isbn_no'].append(i)
    googlebooks_dataDF['print_type_id'].append(finalPrintId)
    googlebooks_dataDF['retail_price'].append(listPrice)

endTime = datetime.datetime.now().strftime('%d/%m/%y %H:%M:%S')

# record start and completion time
print(f"START TIME:     {startTime} \nCOMPLETION TIME: {endTime}")

In [None]:
# Convert each dictionary of lists into a DataFrame (same names, same order).
(categoryDF,
 isbn_categoryDF,
 authorDF,
 isbn_authorDF,
 print_typeDF,
 googlebooks_dataDF) = [
    pd.DataFrame(table)
    for table in (categoryDF,
                  isbn_categoryDF,
                  authorDF,
                  isbn_authorDF,
                  print_typeDF,
                  googlebooks_dataDF)
]

## Swobabika's Code starts here

In [None]:
# Preview the category lookup table (category_id, category_name).
categoryDF.head()

In [None]:
# Preview the ISBN-to-category link table (isbn_no, category_id).
isbn_categoryDF.head()

In [None]:
# Preview the author lookup table (author_id, author_name).
authorDF.head()

In [None]:
# Preview the ISBN-to-author link table (isbn_no, author_id).
isbn_authorDF.head()

In [None]:
# Show the full print type lookup table (print_type_id, print_type).
print_typeDF

In [None]:
# Preview the per-book Google Books data (isbn_no, print_type_id, retail_price).
googlebooks_dataDF.head()

## MERGE KAGGLE DATA AND GOOGLE API DATA TO MAIN DATAFRAME FOR ANALYSIS

### Add Book Category Data to main dataframe.

In [None]:
# Attach the category name to every (ISBN, category_id) pair, then rename
# 'isbn_no' to 'ISBN' so the result can be joined with the Kaggle dataframe.
isbn_category_merge = isbn_categoryDF.merge(categoryDF, on="category_id")
isbn_category = isbn_category_merge.rename(columns={"isbn_no": "ISBN"})
isbn_category.head()

In [None]:
# Join the category names onto the main dataframe by ISBN and keep the original
# columns plus 'category_name'. Books with several categories yield one row per
# category; books without a category row are not part of the result.
maindf_category_merge = dataCut.merge(isbn_category, on="ISBN")

category_view_columns = [
    'ISBN', 'Name', 'Authors', 'Description', 'Language',
    'pagesNumber', 'Publisher', 'PublishYear', 'Rating', 'CountsOfReview',
    'category_name',
]
maindf_category_add1 = maindf_category_merge.loc[:, category_view_columns]

maindf_category_add1.head()

### Add Google Books Author Data and compare with Author data from Kaggle and Google Books

In [None]:
# Attach the author name to every (ISBN, author_id) pair. 'isbn_no' becomes
# 'ISBN' (the join key of the Kaggle dataframe) and 'author_name' becomes
# 'author_GB'; only these two columns are kept.
isbn_author_merge = isbn_authorDF.merge(authorDF, on="author_id")
isbn_author = (
    isbn_author_merge
    .rename(columns={"isbn_no": "ISBN", "author_name": "author_GB"})
    .loc[:, ["ISBN", "author_GB"]]
)
isbn_author.head()

In [None]:
# Join the Google Books author names onto the main dataframe by ISBN, placing
# the new 'author_GB' column right after the Kaggle 'Authors' column.
maindf_author_merge = maindf_category_add1.merge(isbn_author, on="ISBN")

author_view_columns = [
    "ISBN", "Name", "Authors", "author_GB", "Description", "Language",
    "pagesNumber", "Publisher", "PublishYear", "Rating", "CountsOfReview",
    "category_name",
]
maindf_authorGB_add2 = maindf_author_merge.loc[:, author_view_columns]
maindf_authorGB_add2

### Add Retail Price Data to main dataframe.

In [None]:
# Combine the print type lookup with the per-book Google Books data to get the
# print type and retail price of every ISBN. 'isbn_no' is renamed to 'ISBN' so
# the result can be joined with the Kaggle dataframe.
printTyp_price_merge = print_typeDF.merge(googlebooks_dataDF, on="print_type_id")
printTyp_price = (
    printTyp_price_merge
    .rename(columns={"isbn_no": "ISBN"})
    .loc[:, ["ISBN", "print_type", "retail_price"]]
)
printTyp_price.head()

In [None]:
# Join print_type and retail_price onto the main dataframe by ISBN; the result
# is the frame used by the analysis cells.
mainDF_all = maindf_authorGB_add2.merge(printTyp_price, on="ISBN")
mainDF_all

## Analysis Main

## Top 10 Publishers in this dataset

In [None]:
# Get the top 10 publishers for this book data collection.
# mainDF_all holds one row per book/category/author combination, so each ISBN
# is reduced to a single row first; otherwise books with several authors or
# categories would be counted more than once.
top_publishers = (
    mainDF_all
    .drop_duplicates(subset='ISBN')
    .groupby('Publisher')['Name']
    .count()
    .sort_values()
    .tail(10)
)
top_publishers

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Horizontal bar chart of the ten publishers with the most books in this collection.
# NOTE(review): the year range in the title is hard-coded - confirm it matches the data.
bplot = top_publishers.plot(kind="barh", figsize=(16,8), color="royalblue")
bplot.set_xlabel("No. of Published Books")
bplot.set_title("Top 10 Publishers between 2004 to 2010")

# Write each bar's count just past its right-hand end.
for bar in bplot.patches:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + 0.5*bar.get_height()
    bplot.text(0.1 + bar_length, bar_middle,
               '{:2.0f}'.format(bar_length),
               ha='center', va='center')
#plt.savefig("../01-Project_Documents/01-Presentation_Slides/Price_Output/Apt_Top10_Price.png")

## Books Published Each Decade

In [None]:
# Find the entire year duration
# min_year = mainDF_all['PublishYear'].min()
# max_year = mainDF_all['PublishYear'].max()
# print(f'The books in this dataframe were published between {min_year} - {max_year}')
import numpy as np
from scipy.stats import linregress

In [None]:
# Define 10-year bins (1900 to 2020) to segregate the data by decade.
bins = np.linspace(1900, 2020, num=13)

# One label per bin: the first decade starts at its lower edge, every later
# decade starts one year after the previous bin's upper edge.
labels = [
    f"{int(lower) + (1 if position else 0)}-{int(upper)}"
    for position, (lower, upper) in enumerate(zip(bins[:-1], bins[1:]))
]

In [None]:
# A separate dataframe with only the PublishYear data to work with.
# mainDF_all repeats a book once per category/author combination, so one row
# per ISBN is kept to count every book only once. .copy() makes the frame
# independent of mainDF_all so columns can be added without a SettingWithCopyWarning.
decade_df = mainDF_all.drop_duplicates(subset="ISBN")[["PublishYear"]].copy()
decade_df.head()

In [None]:
# Add a new column with the decade group of every publish year.
# assign() returns a new frame instead of writing into what may be a slice of
# mainDF_all, which avoids the SettingWithCopyWarning. Years outside the bins
# get a missing Decade_group.
decade_df = decade_df.assign(
    Decade_group=pd.cut(decade_df["PublishYear"], bins, labels=labels, include_lowest=True)
)
decade_df.head()

In [None]:
# Count the number of instances (books) in each decade group.
# observed=False is spelled out so decades without any book stay in the result
# with a count of 0; relying on the default for a categorical grouper is
# deprecated and would eventually drop them from the chart.
decade_df = decade_df.groupby("Decade_group", observed=False).count()
decade_df

In [None]:
# Build plain Python lists for the plot: decade labels on x, book counts on y.
x = list(decade_df.index)
y = decade_df['PublishYear'].tolist()
print(x)
print(y)

In [None]:
# Bar chart of the number of books per decade on a logarithmic y axis.
plt.figure(figsize=(16, 8))
plt.yscale("log")
plt.bar(x, y, width=0.9, align='center', color='orange', edgecolor='red')

# Label every bar with its count, slightly left of the bar centre and
# j units above the bar top.
j = 1.0
for position, count in enumerate(y):
    plt.annotate(count, (-0.1 + position, count + j))

plt.legend(labels=['Total Number of Books'])
plt.title("Bar plot representing the trend of Total number of Published Books each Decade")
plt.xlabel('Decade Durations')
plt.ylabel('Number of Books')
plt.show()

## Correlation between Number of Pages and Rating

In [None]:
import scipy.stats as st

In [None]:
# Correlation between the number of pages and the rating.
# mainDF_all repeats a book once per category/author combination, so one row
# per ISBN is kept; otherwise multi-author books would weigh more in the result.
books_unique = mainDF_all.drop_duplicates(subset='ISBN')
# cast to float so the statistics functions always receive numeric data
x_values = books_unique['pagesNumber'].astype(float)
y_values = books_unique['Rating'].astype(float)

correlation = st.pearsonr(x_values,y_values)
print(f'The correlation between the number of pages in a book and rating is: {round(correlation[0],2)}')

# Least-squares regression line through the same points.
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

# Scatter plot (drawn once) with the regression line and its equation.
plt.scatter(x_values,y_values, color = 'blue')
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(200,3.5),fontsize=15,color="red")
# real axis labels instead of the '***' placeholders that used to overwrite them
plt.xlabel('No.of Pages in a Book')
plt.ylabel('Rating')
plt.title('Number of Pages vs. Rating')
plt.show()

### Observation: There seems to be no correlation between the number of pages in a book and its rating. Hence, the number of pages doesn't seem to affect the likeability of a book at all.

## Book Type Data Distribution

In [None]:
# Number of books per category.
# mainDF_all repeats a (book, category) pair once per author, so each pair is
# reduced to a single row before counting.
bookTyp_count = (
    mainDF_all
    .drop_duplicates(subset=['ISBN', 'category_name'])['category_name']
    .value_counts()
)
bookTyp_count

In [None]:
# Pie chart of the share of each book category, with percentage labels.
pie_ax = bookTyp_count.plot(kind='pie', startangle=30, autopct='%1.1f%%', figsize=(12, 12))
pie_ax.set_title('Distribution of Book Categories')
plt.show()