## Master Data Preparation for Statistical Analysis

This Jupyter notebook merges the processed books and authors datasets and prepares the master dataset for hypothesis testing, including basic data cleaning and feature generation.

#### Import Libraries

In [1]:
import pandas as pd

#### Data Upload

In [3]:
books_path = "C:\\Users\\juhic\\OneDrive\\Desktop\\books_processed.csv"
authors_path = "C:\\Users\\juhic\\OneDrive\\Desktop\\authors_processed.csv"

# Load each processed file and discard its 'Unnamed: 0' column
# (presumably an index that was written out when the CSV was saved -- TODO confirm).
books = pd.read_csv(books_path).drop(columns = 'Unnamed: 0')
authors = pd.read_csv(authors_path).drop(columns = 'Unnamed: 0')

#### Merge 

In [4]:
# Inner join on the author name: only books whose author is present in the authors file are kept.
data = books.merge(authors, how = 'inner', on = 'author')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35814 entries, 0 to 35813
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   title                35814 non-null  object 
 1   series               18309 non-null  object 
 2   author               35814 non-null  object 
 3   rating_count         35814 non-null  int64  
 4   review_count         35814 non-null  int64  
 5   average_rating       35814 non-null  float64
 6   five_star_ratings    35814 non-null  int64  
 7   four_star_ratings    35814 non-null  int64  
 8   three_star_ratings   35814 non-null  int64  
 9   two_star_ratings     35814 non-null  int64  
 10  one_star_ratings     35814 non-null  int64  
 11  page_count           35814 non-null  float64
 12  published_year       35814 non-null  int64  
 13  genre                35814 non-null  object 
 14  is_volume            35814 non-null  object 
 15  work_count           35814 non-null 

In [5]:
# Treat missing counts as 0 and store these columns as integers.
cols = ['page_count','author_rating_count','work_count']
for col in cols:
    data[col] = data[col].fillna(0).astype(int)

#### Data Cleaning

In [6]:
# Trim the merged frame to the columns used later, then give them clearer names.
drop_cols = [
    'series',
    'review_count',
    'average_rating',
    'five_star_ratings', 'four_star_ratings', 'three_star_ratings',
    'two_star_ratings', 'one_star_ratings',
    'author_review_count',
    'published_year',
]
data = (
    data
    .drop(columns = drop_cols)
    .rename(columns = {'title': 'book',
                       'work_count': 'author_work_count',
                       'sex': 'author_sex'})
)

In [7]:
# Removing non-book records (audiobooks, poetry, plays, collections, etc.)

# 1) Page-count filter: drop rows with fewer than 10 pages or 5000 pages and above.
mask1 = data['page_count'] < 10
mask2 = data['page_count'] >= 5000
data.drop(axis = 0, index = data[mask1 | mask2].index, inplace = True)

# 2) Genre filter: drop rows whose genre is one of the excluded genres.
list_genre = ['audiobook','poetry','plays']
data.drop(axis = 0, index = data[data['genre'].isin(list_genre)].index, inplace = True)

# 3) Title filter: drop rows whose title mentions a set/collection keyword.
# `isin` would only match titles that are exactly equal to a keyword, so a
# substring search is used. regex = False keeps the '.' in 'vol.set' literal;
# na = False treats missing titles as non-matching.
# NOTE(review): 'series' as a substring also matches ordinary titles that merely
# contain the word -- confirm this keyword is intended to be that broad.
list_books = ['series','box set','boxed set','volume set','vol.set','volumes','complete collection']
mask_sets = pd.Series(False, index = data.index)
for keyword in list_books:
    mask_sets |= data['book'].str.contains(keyword, case = False, regex = False, na = False)
data.drop(axis = 0, index = data[mask_sets].index, inplace = True)

In [8]:
# Creating derived features required for hypothesis testing

#1. Genre Mapping
# Join the genre mapping file onto the cleaned data by 'genre'.
# The join is inner, so rows whose genre has no entry in the mapping file are not carried into data_1.
genre = pd.read_csv('C:\\Users\\juhic\\OneDrive\\Desktop\\genre mapping.csv')
data_1 = data.merge(genre, how = 'inner', on = 'genre')

#2. Author Work Count
def classify_work_exp(x):
    """Bucket an author's work count into an experience label.

    Parameters
    ----------
    x : int or float
        Number of works attributed to the author.

    Returns
    -------
    str
        'newbie' when x < 80, 'average' when 80 <= x <= 320,
        'experienced' when x > 320.
    """
    if x < 80:
        return 'newbie'
    # Use a chained comparison rather than `&`: `&` binds tighter than the
    # comparison operators, so `x >= 80 & x <= 320` is evaluated as
    # `x >= (80 & x) <= 320`, which holds for every x >= 80.
    elif 80 <= x <= 320:
        return 'average'
    else:
        return 'experienced'

# Label every row with its author's experience bucket.
data_1['author_exp'] = data_1['author_work_count'].apply(classify_work_exp)

#3. Page Count
def classify_book_size(x):
    """Bucket a page count into a book-size label.

    Parameters
    ----------
    x : int or float
        Number of pages in the book.

    Returns
    -------
    str
        'light' when x < 100, 'average' when 100 <= x <= 500,
        'bulky' when x > 500.
    """
    if x < 100:
        return 'light'
    # Use a chained comparison rather than `&`: `&` binds tighter than the
    # comparison operators, so `x >= 100 & x <= 500` is evaluated as
    # `x >= (100 & x) <= 500`, which holds for every x >= 100.
    elif 100 <= x <= 500:
        return 'average'
    else:
        return 'bulky'

# Label every row with its page-count bucket.
data_1['book_size'] = data_1['page_count'].apply(classify_book_size)

In [9]:
# Drop the raw columns that the derived features were built from, plus the remaining author stats.
drop_cols = [
    'page_count',
    'genre',
    'author_work_count',
    'author_avg_rating',
    'author_rating_count',
]
data_1.drop(columns = drop_cols, inplace = True)
data_1.head()

Unnamed: 0,book,author,rating_count,is_volume,author_sex,genre_category,author_exp,book_size
0,inner circle,kate brian,7597,yes,female,fiction,newbie,average
1,ambition,kate brian,6719,yes,female,fiction,newbie,average
2,revelation,kate brian,7431,yes,female,fiction,newbie,average
3,legacy,kate brian,7010,yes,female,fiction,newbie,average
4,vanished,kate brian,3724,yes,female,fiction,newbie,average


#### Data Download

In [11]:
# Download merged dataset
# Export data_1, the frame that carries the genre mapping, author_exp and book_size columns.
# `data` is the intermediate frame from before feature generation and does not contain them.
data_1.to_csv('master_dataset.csv')