#### Notebook 1: This Jupyter notebook processes the raw books CSV data, performing basic data cleaning:

1. Selecting only desired features (raw/derived)
2. String cleaning
3. Handling missing values

In [2]:
# Import Libraries --------------------------------------------------------------
import os
import pandas as pd


# Read the raw Goodreads CSV, loading only the columns listed below -----
path = "C:\\Users\\juhic\\OneDrive\\Desktop\\goodreads_kaggle_books.csv"

required_columns = [
    'title',
    'series',
    'author',
    'rating_count',
    'review_count',
    'average_rating',
    'five_star_ratings',
    'four_star_ratings',
    'three_star_ratings',
    'two_star_ratings',
    'one_star_ratings',
    'number_of_pages',
    'date_published',
    'publisher',
    'genre_and_votes',
]

# usecols makes pandas skip every column that is not in the list.
books = pd.read_csv(path, usecols=required_columns)


# Give three of the raw columns the names used in the rest of the notebook
# (old name -> new name).
column_renames = {
    'number_of_pages': 'page_count',
    'date_published': 'published_year',
    'genre_and_votes': 'genre',
}
books = books.rename(columns=column_renames)



# String cleaning -----------------------------------------------------
cols = ['title', 'series', 'author', 'publisher', 'genre']

# Lowercase, trim both ends, then collapse runs of spaces into a single one.
# Each column is handled with vectorised .str methods instead of a row-wise
# apply(axis=1), which does the same work one row at a time.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave repeated spaces untouched.
for c in cols:
    books[c] = (books[c]
                .str.lower()
                .str.strip()
                .str.replace(r" +", " ", regex=True))

# Author: keep only the text before the first comma (single author)
books['author'] = books['author'].str.split(',', n=1, expand=True)[0]

# Series: drop the enclosing parentheses and everything from the first '#'
# onwards, then trim the space that is left in front of the '#'
books['series'] = (books['series']
                   .str.lstrip('(')
                   .str.rstrip(')')
                   .str.split('#', n=1, expand=True)[0]
                   .str.strip())

# Genre: take the first comma-separated entry and cut off its last
# space-separated token (presumably the vote count -- confirm against the
# raw 'genre_and_votes' format). n is passed by keyword because positional
# use is deprecated in pandas.
books['genre'] = (books['genre']
                  .str.split(',', n=1, expand=True)[0]
                  .str.rsplit(' ', n=1, expand=True)[0])

# Published date: keep the first run of four digits as the year.
# expand=False returns a Series rather than a one-column DataFrame.
books['published_year'] = books['published_year'].str.extract(r'(\d{4})', expand=False)


In [3]:
# Save the intermediate processed books data to a CSV file in the current
# working directory (the DataFrame index is written as the first column).
output_file = 'books_processed.csv'
books.to_csv(output_file)

#### Handling Missing Values (WIP)

In [None]:
# Fill missing values --------------------------------------------------------------
cols_categorical = [
    'series',
    'author',
    'published_year',
    'publisher',
    'genre',
]
cols_numerical = [
    'rating_count',
    'review_count',
    'average_rating',
    'five_star_ratings',
    'four_star_ratings',
    'three_star_ratings',
    'two_star_ratings',
    'one_star_ratings',
    'page_count',
]

# Missing text-like values become the empty string, missing numbers become 0.
for column in cols_categorical:
    books[column] = books[column].fillna('')
for column in cols_numerical:
    books[column] = books[column].fillna(0)


# 'volume' flags whether a book belongs to a series: an empty series name
# (including the ones filled in just above) gives 'No', anything else 'Yes'.
books['volume'] = books['series'].eq('').map({True: 'No', False: 'Yes'})