In [7]:
# !unzip data.zip # (if you haven't already)

In [8]:
import pandas as pd

In [9]:
# Read the books dataset from disk.
# Note: an earlier load attempt failed because of commas in the Author column;
# after removing those commas (or replacing them with a "/"), the file loaded correctly.
books_csv_path = 'data/books.csv'
books_df = pd.read_csv(books_csv_path, sep=',', header=0)
print("Dataset books.csv loaded successfully")

Dataset books.csv loaded successfully


In [10]:
# the \t\t\t at the end of the publisher column seems to be a parsing error, let's have a look
# print(books_df.columns)  # Check column names
# print(books_df.head())   # Check the first few rows

In [11]:
# No data seems to have been lost, so strip the stray \t\t\t (plus surrounding
# whitespace) from the column names and from the values of the publisher column.
books_df.columns = books_df.columns.map(lambda name: name.strip().replace('\t\t\t', ''))

publisher_without_tabs = books_df['publisher'].str.replace('\t\t\t', '')
books_df['publisher'] = publisher_without_tabs.str.strip()

# Uncomment to inspect the result:
# print(books_df.columns)  # Check column names
# print(books_df.head())   # Check the first few rows

In [12]:
# Since there seem to be several spellings/wordings for the same publisher,
# let's have a look at the unique values in the publisher column
# unique_publishers = sorted(books_df['publisher'].unique().tolist())
# for publisher in unique_publishers:
#    print(publisher)
# Looks like a lot of the publishers are the same, but with different spellings or wordings
# Perhaps some approach like stemming would work well

In [13]:
# isbn and isbn13 identify the same thing; isbn13 is the newer, globally applied
# format, so only isbn13 is kept.
books_df.drop(columns=['isbn'], inplace=True)
print("isbn column dropped, isbn13 column kept")

isbn column dropped, isbn13 column kept


In [14]:
# Collapse every English variant of the language code into the single value 'en'
english_variants = ['en-US', 'en-GB', 'en-CA', 'eng']
books_df['language_code'] = books_df['language_code'].replace(dict.fromkeys(english_variants, 'en'))
print("Language codes eng, en-US, en-GB, en-CA unified to 'en'")

Language codes eng, en-US, en-GB, en-CA unified to 'en'


In [15]:
# Publication dates looked up by hand for the books whose publication date was
# missing, keyed by isbn13 (dates are written as month/day/year strings).
looked_up_publication_dates = {
    9780553575101: '10/31/2000',
    9782070323289: '06/01/1982',
}

# Select rows with a boolean mask and label-based .loc. The previous version fed
# index *labels* (from .index) into the *positional* .iloc, which only hits the
# right rows while books_df still has its default 0..n-1 index.
for fix_isbn13, fix_date in looked_up_publication_dates.items():
    books_df.loc[books_df['isbn13'] == fix_isbn13, 'publication_date'] = fix_date

print("Missing publication dates filled in")
print("2nd (large) dataset is being loaded...")

Missing publication dates filled in
2nd (large) dataset is being downloaded...


#### Importing the large datasets found [here](https://drive.google.com/file/d/1JTY2O32QSn4L3UCB-BxeL0qY-5D1DCIz/view?usp=drive_link)

In [16]:
import os
import requests

The following download cell is obsolete, since we now have a new dataset.

In [None]:
# Direct download link (embed the file ID directly in the URL)
# url = "https://drive.google.com/uc?export=download&id=1JTY2O32QSn4L3UCB-BxeL0qY-5D1DCIz"

# Download and save the file in the 'data/' folder
# with open("data/books2.csv", "wb") as file:
#     file.write(requests.get(url).content)

# print("Dataset downloaded and saved as data/books2.csv")

In [17]:
# Users
url = "https://drive.google.com/uc?export=download&id=1i9xdUCFn052HqrFQXSYHEAo0_9vcZjgU"

# Fetch the file BEFORE opening the target for writing, so that a failed request
# cannot leave an empty/truncated data/users.csv behind.
# The timeout (in seconds) keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on HTTP errors instead of saving an error page as if it were the CSV.
response.raise_for_status()

# Save the downloaded bytes in the 'data/' folder
with open("data/users.csv", "wb") as file:
    file.write(response.content)

print("Dataset downloaded and saved as data/users.csv")

In [18]:
# ratings
url = "https://drive.google.com/uc?export=download&id=11ckhvFWy_n4br-_UEZW3Aci3cgClYRqD"

# Fetch the file BEFORE opening the target for writing, so that a failed request
# cannot leave an empty/truncated data/ratings.csv behind.
# The timeout (in seconds) keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on HTTP errors instead of saving an error page as if it were the CSV.
response.raise_for_status()

# Save the downloaded bytes in the 'data/' folder
with open("data/ratings.csv", "wb") as file:
    file.write(response.content)

print("Dataset downloaded and saved as data/ratings.csv")

Don't let the new file path scare you: after scraping and cleaning, the books data is now stored in df_clean.csv instead of books2.csv, and it works with the baseline model using the same code:

In [19]:
# The scraped and cleaned data/df_clean.csv is used instead of the original books2.csv:
# books_big = pd.read_csv('data/books2.csv', sep=',', header=0, low_memory=False)
books_big = pd.read_csv(
    "data/df_clean.csv",
    sep=";",
    encoding="utf-8",
    engine="python",
    na_filter=False,
)

# ratings and users are both comma-separated files with a header row.
comma_csv_options = dict(sep=',', header=0)
ratings = pd.read_csv('data/ratings.csv', **comma_csv_options)
users = pd.read_csv('data/users.csv', **comma_csv_options)

print("Pandas dataframes (books_df, books_big, users, ratings) loaded successfully")

Pandas dataframes (books_df, books_big, users, ratings) loaded successfully


## Cleaning the column names:

In [21]:
# Normalise column names to lower case with underscores instead of hyphens.
# (books_big is skipped: this was already done in df_clean.csv.)
for frame in (books_df, users, ratings):
    frame.columns = frame.columns.str.lower().str.replace('-', '_')

# Parse the date columns; values that do not match the format are coerced to NaT.
books_big['year_of_publication'] = pd.to_datetime(
    books_big['year_of_publication'], format='%Y', errors='coerce'
)
books_df['publication_date'] = pd.to_datetime(
    books_df['publication_date'], format='%m/%d/%Y', errors='coerce'
)

print("Columns renamed and dates converted to dtype: datetime")

Columns renamed and dates converted to dtype: datetime


In [22]:
# df_clean = pd.read_csv("data/df_clean.csv", sep=";", encoding="utf-8", engine="python", na_filter=False)

In [23]:
# print("final dataset loaded as df_clean")

In [24]:
# books_big.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226494 entries, 0 to 226493
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   isbn                 226494 non-null  object        
 1   book_title           226494 non-null  object        
 2   book_author          226494 non-null  object        
 3   year_of_publication  226494 non-null  datetime64[ns]
 4   publisher            226494 non-null  object        
 5   image_url_s          226494 non-null  object        
 6   image_url_m          226494 non-null  object        
 7   image_url_l          226494 non-null  object        
 8   genre                226494 non-null  object        
dtypes: datetime64[ns](1), object(8)
memory usage: 15.6+ MB


In [19]:
# Final status message for this notebook section.
print("Ready to go!")

Ready to go!
