<a href="https://colab.research.google.com/github/Dona134/Book-Reviews-Similarity/blob/main/Data%20preprocessing_cooking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import ast

## Downloading and Extracting the Dataset

In [2]:
import os
os.environ['KAGGLE_USERNAME'] = "xxxxxx"
os.environ['KAGGLE_KEY'] = "xxxxxx"
!kaggle datasets download -d mohamedbakhet/amazon-books-reviews

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
Downloading amazon-books-reviews.zip to /content
 96% 1.02G/1.06G [00:10<00:00, 48.8MB/s]
100% 1.06G/1.06G [00:10<00:00, 112MB/s] 


In [3]:
import zipfile

# Define the zip file name
zip_file = "amazon-books-reviews.zip"

# Extract all contents
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall("amazon-books-reviews")  # Extracts into a folder

In [4]:
os.listdir("amazon-books-reviews")

['Books_rating.csv', 'books_data.csv']

## Loading the Books Data

In [5]:
books_data = pd.read_csv("amazon-books-reviews/books_data.csv")

In [6]:
books_data.head(5)

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


In [7]:
books_data.shape

(212404, 10)

##  Preprocessing the Books Data

In [8]:
# keep only columns 'Title', 'categories'
books_data = books_data[['Title', 'categories', 'authors', 'publishedDate']]
books_data.head(5)

Unnamed: 0,Title,categories,authors,publishedDate
0,Its Only Art If Its Well Hung!,['Comics & Graphic Novels'],['Julie Strain'],1996
1,Dr. Seuss: American Icon,['Biography & Autobiography'],['Philip Nel'],2005-01-01
2,Wonderful Worship in Smaller Churches,['Religion'],['David R. Ray'],2000
3,Whispers of the Wicked Saints,['Fiction'],['Veronica Haddon'],2005-02
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],2003-03-01


In [9]:
# show unique values in the column categories
books_data['categories'].nunique()

10883

In [10]:
# show top 20 categories
books_data['categories'].value_counts().head(20)

Unnamed: 0_level_0,count
categories,Unnamed: 1_level_1
['Fiction'],23419
['Religion'],9459
['History'],9330
['Juvenile Fiction'],6643
['Biography & Autobiography'],6324
['Business & Economics'],5625
['Computers'],4312
['Social Science'],3834
['Juvenile Nonfiction'],3446
['Science'],2623


In [11]:
books_data.loc[:, 'categories'] = books_data['categories'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)


## Filtering for Cooking Books

In [12]:
# Filter the dataframe to include only rows where the categories column contains 'Cooking' (and not missing)
df_cooking = books_data[
    books_data['categories'].apply(
        lambda x: isinstance(x, list) and any(cat.strip().lower() == 'cooking' for cat in x)
    )
]

In [13]:
df_cooking.head()

Unnamed: 0,Title,categories,authors,publishedDate
19,Alaska Sourdough,[Cooking],['Ruth Allman'],1976
199,Old-Fashioned Ckbk,[Cooking],['Don Holm'],1969
280,Basil: A Book of Recipes (The Little Recipe Bo...,[Cooking],,1997
351,Vino Para Dummies,[Cooking],"['Ed McCarthy', 'Mary Ewing-Mulligan']",2011-03-03
423,Flavors of Korea: Delicious Vegetarian Cuisine...,[Cooking],"['Deborah Coultrip-Davis', 'Young Sook Ramsay']",1998-01-01


In [14]:
df_cooking.shape

(2452, 4)

In [15]:
# remove rows with the same values for authors and publishedDate
df_cooking = df_cooking.drop_duplicates(subset=['authors', 'publishedDate'])

# remove rows with the same values for title
df_cooking = df_cooking.drop_duplicates(subset=['Title'])

In [16]:
df_cooking.shape

(2319, 4)

In [17]:
# keep only column Title
df_cooking = df_cooking[['Title']]

In [18]:
df_cooking.head()

Unnamed: 0,Title
19,Alaska Sourdough
199,Old-Fashioned Ckbk
280,Basil: A Book of Recipes (The Little Recipe Bo...
351,Vino Para Dummies
423,Flavors of Korea: Delicious Vegetarian Cuisine...


## Loading and Preprocessing Books Rating Data

In [19]:
books_rating = pd.read_csv("amazon-books-reviews/Books_rating.csv")

In [20]:
books_rating.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [21]:
# drop the columns that are not needed (Price, review/time, review/summary, profileName, review/text, review/helpfulness)
books_rating = books_rating[['Title', 'review/text', 'User_id', 'review/time']]
books_rating.head()

Unnamed: 0,Title,review/text,User_id,review/time
0,Its Only Art If Its Well Hung!,This is only for Julie Strain fans. It's a col...,AVCGYZL8FQQTD,940636800
1,Dr. Seuss: American Icon,I don't care much for Dr. Seuss but after read...,A30TK6U7DNS82R,1095724800
2,Dr. Seuss: American Icon,"If people become the books they read and if ""t...",A3UH4UZ4RSVO82,1078790400
3,Dr. Seuss: American Icon,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",A2MVUWT453QH61,1090713600
4,Dr. Seuss: American Icon,Philip Nel - Dr. Seuss: American IconThis is b...,A22X4XUPKF66MR,1107993600


In [22]:
# remove rows with the same unique values for User_id and review/time
books_rating = books_rating.drop_duplicates(subset=['User_id', 'review/time']) #unix timestamp
books_rating.shape

(1548183, 4)

In [23]:
# drop the column review/time
books_rating = books_rating.drop(columns=['review/time'])

In [24]:
# display null values in the dataset
books_rating.isnull().sum()

Unnamed: 0,0
Title,17
review/text,1
User_id,5558


In [25]:
# remove null values
books_rating = books_rating.dropna()

In [26]:
books_rating.shape

(1542607, 3)

## Merging Cooking Books with Ratings

In [27]:
# match the books_rating data with the df_romance data using the Title column

df_cooking = df_cooking.merge(books_rating, on='Title', how='inner')
df_cooking.head()

Unnamed: 0,Title,review/text,User_id
0,Alaska Sourdough,"I have been using this book since 1988, the ei...",AC58Z72OB2DDX
1,Alaska Sourdough,"My poor dogeared, stained copy of this book ca...",A3CNQIKVTG9QYO
2,Alaska Sourdough,"As a former Alaskan, I didn't want to have to ...",A2UMP9TJTJ6A6B
3,Alaska Sourdough,For those of us who would prefer to use sourdo...,AC2TK7NHKB5C0
4,Alaska Sourdough,Make the most sublime waffles - crispy outside...,A22T74YNRM8NTK


## Final Cleaning

In [28]:
# drop rows where User_id is not all caps and that has spaces
df_cooking = df_cooking[df_cooking['User_id'].str.isupper() & ~df_cooking['User_id'].str.contains(' ')]
df_cooking.shape

(22434, 3)

In [29]:
# null values
df_cooking.isnull().sum()

Unnamed: 0,0
Title,0
review/text,0
User_id,0


In [30]:
# display the distribution of the ratings per book (count records per book)

df_cooking['Title'].value_counts()

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
The Bread Lover's Bread Machine Cookbook: A Master Baker's 300 Favorite Recipes for Perfect-Every-Time Bread-From Every Kind of Machine,398
Bread Baker's Apprentice,358
How To Cook Everything: Simple Recipes for Great Food,293
Eating For Life,263
"Saving Dinner: The Menus, Recipes, and Shopping Lists to Bring Your Family Back to the Table",250
...,...
The Co-ed cookbook (Scholastic starline),1
Cooking Vegetables,1
Cooking for Heart and Soul: 100 Delicious Low-Fat Recipes from San Francisco's Top ChefsA Cookbook to Benefit the San Francisco Food Bank,1
The art of Irish cooking,1


In [31]:
# average length of the reviews
df_cooking['review/text'].apply(lambda x: len(str(x).split())).mean()


np.float64(112.57805117232772)

In [32]:
# save the cleaned data to a new csv file
df_cooking.to_csv('df_cooking.csv', index=False)