In [46]:
# !unzip data.zip # (if you haven't already)

In [47]:
import pandas as pd

In [48]:
# Load data/books.csv into books_df.
# An earlier load attempt raised an error that pointed at commas inside the Author
# column; once those commas were removed (or replaced with a "/") in the file,
# the data loaded correctly.
books_df = pd.read_csv(
    'data/books.csv',
    sep=',',
    header=0,
)
print("Dataset books.csv loaded successfully")

Dataset books.csv loaded successfully


In [49]:
# the \t\t\t at the end of the publisher column seems to be a parsing error, let's have a look
# print(books_df.columns)  # Check column names
# print(books_df.head())   # Check the first few rows

In [50]:
# No data appears to have been lost, so just clean up the stray \t\t\t:
# first in the column names (trim surrounding whitespace, then drop any tab run) ...
books_df.columns = [
    column_name.strip().replace('\t\t\t', '')
    for column_name in books_df.columns
]
# ... then in the publisher values themselves (drop the tab run, then trim).
books_df['publisher'] = (
    books_df['publisher']
    .str.replace('\t\t\t', '')
    .str.strip()
)

# print(books_df.columns)  # Check column names
# print(books_df.head())   # Check the first few rows

In [51]:
# Since there seem to be several spellings/wordings for the same publisher, 
# let's have a look at the unique values in the publisher column
# unique_publishers = sorted(books_df['publisher'].unique().tolist())
# for publisher in unique_publishers:
#    print(publisher)
# Looks like a lot of the publishers are the same, but with different spellings or wordings
# Perhaps some approach like stemming would work well

In [52]:
# Since isbn and isbn13 are the same thing and isbn13 is the newer,
# globally applied format, we will drop isbn and keep isbn13.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError. Assigning the result back avoids
# mutating the frame in place.
books_df = books_df.drop(columns='isbn', errors='ignore')
print("isbn column dropped, isbn13 column kept")

isbn column dropped, isbn13 column kept


In [53]:
# Collapse every English variant of the language code into the single code 'en'
books_df['language_code'] = books_df['language_code'].replace(
    ['en-US', 'en-GB', 'en-CA', 'eng'],
    'en',
)
print("Language codes eng, en-US, en-GB, en-CA unified to 'en'")

Language codes eng, en-US, en-GB, en-CA unified to 'en'


In [54]:
# Looked up the publication dates for the books with missing publication dates.
# Rows are selected with a boolean mask through .loc: the previous version passed
# index *labels* (from .index) to the position-based .iloc, which only hits the
# right rows while books_df still has its default 0..n-1 index.
books_df.loc[books_df['isbn13'] == 9780553575101, 'publication_date'] = '10/31/2000'
books_df.loc[books_df['isbn13'] == 9782070323289, 'publication_date'] = '06/01/1982'
print("Missing publication dates filled in")
print("2nd (large) dataset is being loaded...")

Missing publication dates filled in
2nd (large) dataset is being loaded...


#### Import the large datasets found [here](https://drive.google.com/file/d/1JTY2O32QSn4L3UCB-BxeL0qY-5D1DCIz/view?usp=drive_link)

In [55]:
import os
import requests

#### We're no longer using books2.csv, as we've scraped more data and saved the updated version into df_clean.csv

In [None]:
# Direct download link (embed the file ID directly in the URL)
# url = "https://drive.google.com/uc?export=download&id=1JTY2O32QSn4L3UCB-BxeL0qY-5D1DCIz"

# Download and save the file in the 'data/' folder
# with open("data/books2.csv", "wb") as file:
#     file.write(requests.get(url).content)

# print("Dataset downloaded and saved as data/books2.csv")

## Run the following cell if you're importing the data for the first time (uncomment it first if it has been commented out):

In [56]:
# Users
url = "https://drive.google.com/uc?export=download&id=1i9xdUCFn052HqrFQXSYHEAo0_9vcZjgU"

# Fetch before opening the output file and fail loudly on an HTTP error, so a
# failed request never leaves an empty or error-page data/users.csv behind.
# The timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Download and save the file in the 'data/' folder (created if it is missing)
os.makedirs("data", exist_ok=True)
with open("data/users.csv", "wb") as file:
    file.write(response.content)

print("Dataset downloaded and saved as data/users.csv")

## Run the following cell if you're importing the data for the first time (uncomment it first if it has been commented out):

In [57]:
# ratings
url = "https://drive.google.com/uc?export=download&id=11ckhvFWy_n4br-_UEZW3Aci3cgClYRqD"

# Fetch before opening the output file and fail loudly on an HTTP error, so a
# failed request never leaves an empty or error-page data/ratings.csv behind.
# The timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Download and save the file in the 'data/' folder (created if it is missing)
os.makedirs("data", exist_ok=True)
with open("data/ratings.csv", "wb") as file:
    file.write(response.content)

print("Dataset downloaded and saved as data/ratings.csv")

## Changed to df_clean.csv instead of books2.csv after scraping and cleaning:

In [58]:
# Previous source, kept for reference:
# books_big = pd.read_csv('data/books2.csv', sep=',', header=0, low_memory=False)
# The scraped and cleaned dataset is used instead of the original books2.csv.
# It is ';'-separated and is read with missing-value detection switched off
# (na_filter=False).
books_big = pd.read_csv(
    "data/df_clean.csv",
    sep=";",
    encoding="utf-8",
    engine="python",
    na_filter=False,
)
users = pd.read_csv('data/users.csv', sep=',', header=0)
ratings = pd.read_csv('data/ratings.csv', sep=',', header=0)
print("Pandas dataframes (books_df, books_big, users, ratings) loaded successfully")

Pandas dataframes (books_df, books_big, users, ratings) loaded successfully


## Cleaning the column names:

In [59]:
# Normalise column names: lower-case, with '-' replaced by '_'.
# books_big.columns = books_big.columns.str.lower().str.replace('-', '_') # done in df_clean.csv
books_df.columns = books_df.columns.str.lower().str.replace('-', '_')
users.columns = users.columns.str.lower().str.replace('-', '_')
ratings.columns = ratings.columns.str.lower().str.replace('-', '_')

# Reduce the '%Y-%m-%d' date strings in year_of_publication to just the year.
# Only parse while the column is still non-numeric: after a first run it holds
# plain years, which no longer match the format, so re-running the conversion
# would silently coerce every value to a missing value (errors='coerce').
if not pd.api.types.is_numeric_dtype(books_big['year_of_publication']):
    books_big['year_of_publication'] = pd.to_datetime(
        books_big['year_of_publication'], format='%Y-%m-%d', errors='coerce'
    ).dt.year

# Values that do not match month/day/year become NaT instead of raising.
books_df['publication_date'] = pd.to_datetime(books_df['publication_date'], format='%m/%d/%Y', errors='coerce')

print("Columns renamed and dates converted to dtype: datetime")

Columns renamed and dates converted to dtype: datetime


## Replacing "No author found" and "Error fetching author" with pd.NA:

In [None]:
# Progress message announcing the author clean-up step.
print("replacing author fetching errors and missing values with NA")

In [60]:
# Show the author recorded for one specific ISBN
books_big.loc[books_big['isbn'] == '738702943', 'book_author']

5530    No author found
Name: book_author, dtype: object

In [62]:
# Count the rows whose author is the placeholder text 'No author found'
books_big.loc[books_big['book_author'] == 'No author found', 'book_author'].count()

90

In [63]:
# Work on an independent (deep) copy so books_big itself is left untouched
books = books_big.copy(deep=True)
print("dataframe books copied from books_big for ease of use")

dataframe books copied from books_big for ease of use


In [64]:
# books.book_author[books_big['book_author'] == 'No author found'].count()

90

In [65]:
# books.book_author[books_big['book_author'] == 'Error fetching data'].count()

0

In [66]:
# books.book_author[books_big['book_author'] == 'Error fetching author'].count()

81

In [67]:
# books.book_author[books_big['book_author'] == pd.NA].count()

0

In [68]:
# Turn the placeholder text 'No author found' into a real missing value (pd.NA)
books['book_author'] = books['book_author'].replace(
    to_replace='No author found',
    value=pd.NA,
)

In [69]:
# books.book_author[books_big['book_author'] == 'No author found'].count()

0

In [70]:
# Turn the placeholder text 'Error fetching author' into a real missing value (pd.NA)
books['book_author'] = books['book_author'].replace(
    to_replace='Error fetching author',
    value=pd.NA,
)

In [71]:
# books.book_author[books_big['book_author'] == 'Error fetching author'].count()

0

In [72]:
# books.book_author.isna().sum()

171

In [22]:
# books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226493 entries, 0 to 226492
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 226493 non-null  object
 1   book_title           226493 non-null  object
 2   book_author          226403 non-null  object
 3   year_of_publication  226493 non-null  int32 
 4   publisher            226493 non-null  object
 5   image_url_s          226493 non-null  object
 6   image_url_m          226493 non-null  object
 7   image_url_l          226493 non-null  object
 8   genre                226493 non-null  object
dtypes: int32(1), object(8)
memory usage: 14.7+ MB


In [91]:
# books.isbn[books['book_author'] == 'No author found'].count()

0

In [26]:
# books.book_author[books['book_author'] == pd.NA].count()

0

## Replacing genre "Error fetching data" with pd.NA:

In [None]:
# Progress message announcing the genre clean-up step.
print("replacing genre fetching errors with NA")

In [75]:
# books.isbn[books['genre'] == 'Error fetching data'].count()

903

In [76]:
# books.genre.isna().sum()

0

In [77]:
# Turn the placeholder text 'Error fetching data' in genre into a real missing value (pd.NA)
books['genre'] = books['genre'].replace(
    to_replace='Error fetching data',
    value=pd.NA,
)

In [78]:
# books.genre.isna().sum()

903

In [79]:
# books.isbn[books['genre'] == 'Error fetching data'].count()

0

In [80]:
# books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226493 entries, 0 to 226492
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 226493 non-null  object
 1   book_title           226493 non-null  object
 2   book_author          226322 non-null  object
 3   year_of_publication  226493 non-null  int32 
 4   publisher            226493 non-null  object
 5   image_url_s          226493 non-null  object
 6   image_url_m          226493 non-null  object
 7   image_url_l          226493 non-null  object
 8   genre                225590 non-null  object
dtypes: int32(1), object(8)
memory usage: 14.7+ MB


## Checking for any further relevant values that are actually missing:

In [81]:
# books.year_of_publication.isna().sum()

0

In [82]:
# books.year_of_publication.describe()

count    226493.000000
mean       1993.548105
std           8.496009
min        1806.000000
25%        1989.000000
50%        1996.000000
75%        2000.000000
max        2021.000000
Name: year_of_publication, dtype: float64

In [83]:
# books.publisher.isna().sum()

0

In [86]:
# books.publisher[books['publisher'].str.contains('error', case=False, na=False)].count()

0

In [87]:
# books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226493 entries, 0 to 226492
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 226493 non-null  object
 1   book_title           226493 non-null  object
 2   book_author          226322 non-null  object
 3   year_of_publication  226493 non-null  int32 
 4   publisher            226493 non-null  object
 5   image_url_s          226493 non-null  object
 6   image_url_m          226493 non-null  object
 7   image_url_l          226493 non-null  object
 8   genre                225590 non-null  object
dtypes: int32(1), object(8)
memory usage: 14.7+ MB


## Now books.info() has become less misleading.

In [89]:
# Write the cleaned frame back to books_big as an independent (deep) copy
books_big = books.copy(deep=True)

In [None]:
# Progress message: placeholder texts have been replaced with pd.NA.
print("missing values in books dataframe turned into pd.NA for more accurate analysis")

In [90]:
# books_big.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226493 entries, 0 to 226492
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   isbn                 226493 non-null  object
 1   book_title           226493 non-null  object
 2   book_author          226322 non-null  object
 3   year_of_publication  226493 non-null  int32 
 4   publisher            226493 non-null  object
 5   image_url_s          226493 non-null  object
 6   image_url_m          226493 non-null  object
 7   image_url_l          226493 non-null  object
 8   genre                225590 non-null  object
dtypes: int32(1), object(8)
memory usage: 14.7+ MB


In [19]:
# Final status message of the notebook section.
print("Ready to go!")

Ready to go!
