#### This notebook is here to show you how test_big_cleaned.csv was made, but you don't have to run it again

In [2]:
import pandas as pd

In [13]:
input_file = 'data/test_big.csv'

In [14]:
df = pd.read_csv(input_file, sep=',', names=['isbn', 'year', 'description1', 'description2'])

In [15]:
df.isbn.nunique()

17170

In [68]:
df.head()

Unnamed: 0,isbn,year,description1,description2
0,isbn,publish_date,genres,
1,0195153448,2003,"Classical Mythology, Mythology, Classical, Myt...",
2,0002005018,2001,"Actresses, Fiction, Literature, Sisters, Women...",
3,0060973129,Error fetching data,"Campaigns, History, History, Military, Militar...",
4,0374157065,1999,"Influenza, History, Grippe humaine, Disease Ou...",


In [69]:
# The first row is the header with the wrong number of columns
df = df.iloc[1:]  # Slice the DataFrame to exclude the first row
df = df.reset_index(drop=True)
df.head()


Unnamed: 0,isbn,year,description1,description2
0,195153448,2003,"Classical Mythology, Mythology, Classical, Myt...",
1,2005018,2001,"Actresses, Fiction, Literature, Sisters, Women...",
2,60973129,Error fetching data,"Campaigns, History, History, Military, Militar...",
3,374157065,1999,"Influenza, History, Grippe humaine, Disease Ou...",
4,393045218,1999,"Antiquities, Bronze age, Mummies, Prehistoric ...",


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69921 entries, 0 to 69920
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   isbn          69921 non-null  object
 1   year          69921 non-null  object
 2   description1  30784 non-null  object
 3   description2  52752 non-null  object
dtypes: object(4)
memory usage: 2.1+ MB


In [71]:
# The 2 columns "description1" and "description2" are not clearly defined, 
# so we will concatenate them into a single column "description"
df['description'] = df['description1'].astype(str) + ' ' + df['description2'].astype(str)
df = df.drop(columns=['description1', 'description2'])

In [72]:
df.head()

Unnamed: 0,isbn,year,description
0,195153448,2003,"Classical Mythology, Mythology, Classical, Myt..."
1,2005018,2001,"Actresses, Fiction, Literature, Sisters, Women..."
2,60973129,Error fetching data,"Campaigns, History, History, Military, Militar..."
3,374157065,1999,"Influenza, History, Grippe humaine, Disease Ou..."
4,393045218,1999,"Antiquities, Bronze age, Mummies, Prehistoric ..."


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69921 entries, 0 to 69920
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   isbn         69921 non-null  object
 1   year         69921 non-null  object
 2   description  69921 non-null  object
dtypes: object(3)
memory usage: 1.6+ MB


In [74]:
df.to_csv('data/test_big_cleaned', index=False)

In [38]:
%run import_data.ipynb

Dataset books.csv loaded successfully
isbn column dropped, isbn13 column kept
Language codes eng, en-US, en-GB, en-CA unified to 'en'
Missing publication dates filled in
2nd (large) dataset is being downloaded...
Dataset downloaded and saved as data/books2.csv
Dataset downloaded and saved as data/users.csv
Dataset downloaded and saved as data/ratings.csv
Pandas dataframes (books_df, books_big, users, ratings) loaded successfully
Columns renamed and dates converted to dtype: datetime
Ready to go!


In [75]:
books_big.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,image_url_s,image_url_m,image_url_l
0,195153448,Classical Mythology,Mark P. O. Morford,2002-01-01,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001-01-01,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991-01-01,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999-01-01,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999-01-01,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [76]:
# let's merge the available data:
df_test = pd.merge(books_big, df, on='isbn', how='inner')

In [77]:
df_test.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,image_url_s,image_url_m,image_url_l,year,description
0,195153448,Classical Mythology,Mark P. O. Morford,2002-01-01,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2003,"Classical Mythology, Mythology, Classical, Myt..."
1,195153448,Classical Mythology,Mark P. O. Morford,2002-01-01,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2003,"No description found Classical Mythology, Myth..."
2,2005018,Clara Callan,Richard Bruce Wright,2001-01-01,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,2001,"Actresses, Fiction, Literature, Sisters, Women..."
3,2005018,Clara Callan,Richard Bruce Wright,2001-01-01,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,2001,"No description found Actresses, Fiction, Liter..."
4,60973129,Decision in Normandy,Carlo D'Este,1991-01-01,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,Error fetching data,"Campaigns, History, History, Military, Militar..."


In [78]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69921 entries, 0 to 69920
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   isbn                 69921 non-null  object        
 1   book_title           69921 non-null  object        
 2   book_author          69921 non-null  object        
 3   year_of_publication  68616 non-null  datetime64[ns]
 4   publisher            69921 non-null  object        
 5   image_url_s          69921 non-null  object        
 6   image_url_m          69921 non-null  object        
 7   image_url_l          69921 non-null  object        
 8   year                 69921 non-null  object        
 9   description          69921 non-null  object        
dtypes: datetime64[ns](1), object(9)
memory usage: 5.3+ MB


In [79]:
# Let's try to use the scraped 'year' values as replacement for missing 'year_of_publication':
df_test['year_of_publication'] = df_test['year_of_publication'].fillna(df_test['year'])

In [83]:
df_test.year_of_publication.isnull().sum()

0

In [84]:
# Now we can drop the 'year' column:
df_test = df_test.drop(columns=['year'])

In [85]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69921 entries, 0 to 69920
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   isbn                 69921 non-null  object
 1   book_title           69921 non-null  object
 2   book_author          69921 non-null  object
 3   year_of_publication  69921 non-null  object
 4   publisher            69921 non-null  object
 5   image_url_s          69921 non-null  object
 6   image_url_m          69921 non-null  object
 7   image_url_l          69921 non-null  object
 8   description          69921 non-null  object
dtypes: object(9)
memory usage: 4.8+ MB


In [87]:
# let's extract the year from 'year_of_publication'
df_test['year_of_publication'] = pd.to_datetime(df_test['year_of_publication'], errors='coerce')
df_test['year_of_publication'] = df_test['year_of_publication'].dt.year

In [88]:
df_test.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,image_url_s,image_url_m,image_url_l,description
0,195153448,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,"Classical Mythology, Mythology, Classical, Myt..."
1,195153448,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,"No description found Classical Mythology, Myth..."
2,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"Actresses, Fiction, Literature, Sisters, Women..."
3,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,"No description found Actresses, Fiction, Liter..."
4,60973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,"Campaigns, History, History, Military, Militar..."


In [89]:
df_test['year_of_publication'].describe()

count    69732.000000
mean      1995.402154
std          7.378164
min       1920.000000
25%       1992.000000
50%       1997.000000
75%       2001.000000
max       2022.000000
Name: year_of_publication, dtype: float64

In [90]:
# df_test.to_csv('data/test_big_cleaned', index=False)

In [16]:
# 1 day later, we have more data so let's have a look:
df_wednesday = pd.read_csv('data/test_big_2.csv', sep=',', names=['isbn', 'year', 'description1', 'description2'])

  df_wednesday = pd.read_csv('data/test_big_2.csv', sep=',', names=['isbn', 'year', 'description1', 'description2'])


In [17]:
df_wednesday.isbn.nunique()

121863

In [18]:
df_wednesday = df_wednesday.drop_duplicates()

In [19]:
df_wednesday.isbn.nunique()

121863

In [20]:
df_wednesday.shape

(148960, 4)

In [9]:
# df_big = pd.read_csv('data/books2.csv')

  df_big = pd.read_csv('data/books2.csv')


In [11]:
# df_big.ISBN.nunique()

271360