## Data Prep

In [185]:
import pandas as pd
import numpy as np

### 1. Load data from csv file

In [164]:
book = pd.read_csv('data/amazon_book.csv')

In [165]:
book

Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


Filter columns not used in clustering

In [166]:
col = [item for item in range(2, 7)]

In [167]:
col.insert(0, 0)

In [168]:
col

[0, 2, 3, 4, 5, 6]

In [169]:
book = book.iloc[:, col]

In [170]:
book

Unnamed: 0,Name,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,4.7,14331,8,2018,Non Fiction


Create a new column genre_n map genre column to numerical value for clustering

In [171]:
book['Genre_n'] = [1 if i == 'Fiction' else 0 for i in book['Genre']]

Filter the string column of genere

In [172]:
book = book.iloc[:, [*range(0, 5), 6]]

In [173]:
book

Unnamed: 0,Name,User Rating,Reviews,Price,Year,Genre_n
0,10-Day Green Smoothie Cleanse,4.7,17350,8,2016,0
1,11/22/63: A Novel,4.6,2052,22,2011,1
2,12 Rules for Life: An Antidote to Chaos,4.7,18979,15,2018,0
3,1984 (Signet Classics),4.7,21424,6,2017,1
4,"5,000 Awesome Facts (About Everything!) (Natio...",4.8,7665,12,2019,0
...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),4.9,9413,8,2019,1
546,You Are a Badass: How to Stop Doubting Your Gr...,4.7,14331,8,2016,0
547,You Are a Badass: How to Stop Doubting Your Gr...,4.7,14331,8,2017,0
548,You Are a Badass: How to Stop Doubting Your Gr...,4.7,14331,8,2018,0


### 2. Data Normalization

In [174]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [175]:
std_scaler = StandardScaler()

Apply Z-score for column ['user rating', 'reviews', 'price']

In [176]:
book.iloc[:, 1:4] = std_scaler.fit_transform(book.iloc[:, 1:4])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book.iloc[:, 1:4] = std_scaler.fit_transform(book.iloc[:, 1:4])


In [177]:
book

Unnamed: 0,Name,User Rating,Reviews,Price,Year,Genre_n
0,10-Day Green Smoothie Cleanse,0.359990,0.460453,-0.470810,2016,0
1,11/22/63: A Novel,-0.080978,-0.844786,0.821609,2011,1
2,12 Rules for Life: An Antidote to Chaos,0.359990,0.599440,0.175400,2018,0
3,1984 (Signet Classics),0.359990,0.808050,-0.655441,2017,1
4,"5,000 Awesome Facts (About Everything!) (Natio...",0.800958,-0.365880,-0.101547,2019,0
...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),1.241926,-0.216739,-0.470810,2019,1
546,You Are a Badass: How to Stop Doubting Your Gr...,0.359990,0.202869,-0.470810,2016,0
547,You Are a Badass: How to Stop Doubting Your Gr...,0.359990,0.202869,-0.470810,2017,0
548,You Are a Badass: How to Stop Doubting Your Gr...,0.359990,0.202869,-0.470810,2018,0


Save result as csv file

In [179]:
book.to_csv('data/book_clustering.csv')

Generate sample data

In [181]:
sample = book.sample(frac=0.5, replace=True, random_state=1)

In [190]:
import dataframe_image as dfi
# df_styled = sample.head(5).style.background_gradient()
# dfi.export(df_styled,"mytable.png")
dfi.export(sample.head(5),"sample_data.png")

objc[42581]: Class WebSwapCGLLayer is implemented in both /System/Library/Frameworks/WebKit.framework/Versions/A/Frameworks/WebCore.framework/Versions/A/Frameworks/libANGLE-shared.dylib (0x7ffb4a224ec8) and /Applications/Google Chrome.app/Contents/Frameworks/Google Chrome Framework.framework/Versions/110.0.5481.177/Libraries/libGLESv2.dylib (0x1178a3220). One of the two will be used. Which one is undefined.
[0226/111255.014789:INFO:headless_shell.cc(107)] 53405 bytes written to file /var/folders/0t/vj81lwzn36xcslx3y2f148t40000gn/T/tmpmpsgtxu2/temp.png
