# Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 

import re

# Load Data

In [2]:
# Read the raw book listings; the path is relative to the notebook's working directory.
data_raw = pd.read_csv("data/books.csv")

In [3]:
# Preview the first rows to see which columns the raw file provides.
data_raw.head()

Unnamed: 0,image,name,author,format,book_depository_stars,price,currency,old_price,isbn,category,img_paths
0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,This is Going to Hurt,Adam Kay,Paperback,4.5,7.6,$,11.4,9781509858637,Medical,dataset/Medical/0000001.jpg
1,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,"Thinking, Fast and Slow",Daniel Kahneman,Paperback,4.0,11.5,$,15.0,9780141033570,Medical,dataset/Medical/0000002.jpg
2,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,When Breath Becomes Air,Paul Kalanithi,Paperback,4.5,9.05,$,11.5,9781784701994,Medical,dataset/Medical/0000003.jpg
3,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,The Happiness Trap,Russ Harris,Paperback,4.0,8.34,$,13.9,9781845298258,Medical,dataset/Medical/0000004.jpg
4,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Man's Search For Meaning,Viktor E. Frankl,Paperback,4.5,9.66,$,,9781846041242,Medical,dataset/Medical/0000005.jpg


# Exploratory Data Analysis

In [4]:
# Pair every column name with the string form of its dtype, e.g. ('price', 'object').
dtypes = [(column, str(kind)) for column, kind in data_raw.dtypes.items()]
dtypes

[('image', 'object'),
 ('name', 'object'),
 ('author', 'object'),
 ('format', 'object'),
 ('book_depository_stars', 'float64'),
 ('price', 'object'),
 ('currency', 'object'),
 ('old_price', 'float64'),
 ('isbn', 'int64'),
 ('category', 'object'),
 ('img_paths', 'object')]

In [5]:
# Print the distinct values of every object-typed column. 'image' and 'img_paths'
# are skipped: the preview shows they hold a per-book URL / file path.
for col, dtype in dtypes:
    if dtype != 'object' or col in ('image', 'img_paths'):
        continue
    print(col)
    print(data_raw[col].unique())

name
['This is Going to Hurt' 'Thinking, Fast and Slow'
 'When Breath Becomes Air' ...
 'Complete Finnish Beginner to Intermediate Course: Learn to read, write, speak and understand a new language (Teach Yourself)'
 'Simple Thai Food' "L'Appart"]
author
['Adam Kay' 'Daniel Kahneman' 'Paul Kalanithi' ... 'Insun Lee'
 'Terttu Leney' 'Leela Punyaratabandhu']
format
['Paperback' 'Hardback' nan 'Spiral bound' 'Mixed media product' 'Cards'
 'Fold-out book or chart' 'CD-Audio' 'Book' 'Sheet music' 'Game'
 'Board book' 'Bath book' 'Notebook / blank book' 'Postcard book or pack'
 'Calendar' 'Leather / fine binding' 'Diary' 'Novelty book' 'Loose-leaf'
 'DVD video' 'CD-ROM' 'Boxed Set' 'Pamphlet' 'Sheet map'
 'Sheet map, folded' 'Toy' 'General merchandise' 'Other merchandise'
 'Miscellaneous print' 'Address book' 'Multiple copy pack']
price
['7.6' '11.5' '9.05' ... '79.73' '22.51' '58.1']
currency
['$']
category
['Medical' 'Science-Geography' 'Art-Photography' 'Biography'
 'Business-Finance-Law' 

In [6]:
# (rows, columns) of the raw data.
data_raw.shape

(32581, 11)

In [7]:
# Number of missing values in each column.
data_raw.isna().sum()

image                       0
name                        0
author                    198
format                     33
book_depository_stars       0
price                       0
currency                    0
old_price                5114
isbn                        0
category                    0
img_paths                   0
dtype: int64

### Initial inference and cleaning method
* currency can be converted for database storage as required; base currency: US dollars  
* price field needs to be converted to float64 type  
* price field - the numeric value has to be extracted because some prices include the '\$' symbol
* rename name -> 'Title'
* category can be used to sub-categorize the books (UX)
* ISBN field needs to be converted to type 'object' (string)
* remove img_paths and currency  
* rename book_depository_stars to 'Rating'
* Remove all rows with missing values
* Subset to a minimal set of records



#### Backup

In [8]:
# Work on a copy so data_raw is left untouched by the cleaning steps that follow.
data_tmp = data_raw.copy()

### Price

In [9]:
# Pull the numeric amount out of each price string (some include a '$' symbol)
# and store it as float64.
# - The decimal point is escaped: a bare '.' matches ANY character, so the old
#   pattern could capture text such as '1,234' that cannot be cast to float.
# - The fractional part is optional, so whole-number prices such as '12' are
#   kept instead of turning into NaN and being removed by the later dropna().
# - Thousands separators are stripped first so '1,234.50' parses as 1234.5.
# - expand=False makes str.extract return a Series rather than a one-column
#   DataFrame, which is what a single column assignment expects.
data_tmp['price'] = (
    data_tmp['price']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype('float64')
)

We store the price in its original currency (US dollars); it can be converted on the client side based on user input.

### ISBN

In [10]:
# Keep ISBNs as text so they are handled as identifiers rather than numbers.
# NOTE(review): the column was loaded as int64, so any leading zeros would
# already be lost before this cast — confirm the source file has none.
data_tmp['isbn'] = data_tmp['isbn'].astype(str)

### Drop unnecessary columns

In [11]:
# Remove the two columns the cleaning plan marks for removal
# ('currency' only ever contains '$').
data_tmp = data_tmp.drop(columns=['currency', 'img_paths'])

### Rename columns

In [12]:
# Give every remaining column its capitalised display name
# (e.g. name -> Title, book_depository_stars -> Rating).
data_tmp = data_tmp.rename(
    columns={
        'image': 'Image',
        'name': 'Title',
        'author': 'Author',
        'format': 'Format',
        'book_depository_stars': 'Rating',
        'price': 'Price',
        'old_price': 'OldPrice',
        'isbn': 'ISBN',
        'category': 'Category',
    }
)

### Missing data

In [13]:
# Discard every row that has a missing value in any column.
data_tmp = data_tmp.dropna(axis=0, how='any')

### Subsetting

In [14]:
# (rows, columns) left after cleaning, before the subset is drawn.
data_tmp.shape

(27294, 9)

In [15]:
# Draw a 2,500-row subset without replacement; the fixed random_state makes the
# same rows come out on every run over the same input.
data_tmp = data_tmp.sample(
    n=2500,
    replace=False,
    random_state=197,
)

#### Cleaned data

In [22]:
# Copy the sampled frame into its own variable, independent of data_tmp.
data_clean = data_tmp.copy()

In [23]:
# The sampled rows still carry their original row labels; renumber them from 0
# and discard the old index instead of keeping it as a column.
data_clean = data_clean.reset_index(drop=True)

In [24]:
# Preview the cleaned, re-indexed subset.
data_clean.head()

Unnamed: 0,Image,Title,Author,Format,Rating,Price,OldPrice,ISBN,Category
0,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,The Practice of Spiritual Direction,William A. Barry,Paperback,4.0,13.29,17.0,9780061652639,Medical
1,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,The Story of the Stone: Golden Days Volume 1,Xueqin Cao,Paperback,4.0,15.2,19.1,9780140442939,Poetry-Drama
2,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Playful Learning,Mariah Bruehl,Paperback,4.0,16.73,22.0,9781590308196,Medical
3,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,The Complete Guide to Yin Yoga,Bernie Clark,Paperback,4.5,14.75,14.99,9781935952503,Health
4,https://d1w7fb2mkkr3kw.cloudfront.net/assets/i...,Judgement Detox,Gabrielle Bernstein,Paperback,4.0,11.65,16.0,9781788170734,Personal-Development


### Saving data

In [25]:
# Write the cleaned subset to disk; index=False leaves the row index out of the file.
data_clean.to_csv('./data/data_cleaned.csv', index=False)