In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [17]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [18]:
# Load and clean the data.
bestsellers_df = pd.read_csv('Resources/cleaned_best_seller_code.csv')

# Drop extra column.
bestsellers_df.drop(['Unnamed: 0'], axis=1).head()

Unnamed: 0,published_date,book_type,rank,title,author,description,price,weeks_on_list
0,2010-01-03,chapter_books,1,WITCH AND WIZARD,James Patterson and Gabrielle Charbonnet,"One of each, brother and sister, flex their ne...",17.99,1
1,2010-01-03,chapter_books,2,THE HUNGER GAMES,Suzanne Collins,"In a dystopian future, a girl fights for survi...",17.99,67
2,2010-01-03,chapter_books,3,CATCHING FIRE,Suzanne Collins,"The protagonist of ""The Hunger Games"" returns.",17.99,16
3,2010-01-03,chapter_books,4,THE MAGICIAN’S ELEPHANT,Kate DiCamillo,An orphan in search of his sister follows a fo...,16.99,15
4,2010-01-03,chapter_books,5,FALLEN,Lauren Kate,"Thwarted love among misfits at a Savannah, Ga....",17.99,2


In [20]:
# Columns
bestsellers_df.columns

Index(['Unnamed: 0', 'published_date', 'book_type', 'rank', 'title', 'author',
       'description', 'price', 'weeks_on_list'],
      dtype='object')

In [21]:
# Split the Data into Training and Testing.
# Create our features
X = pd.get_dummies(bestsellers_df, columns=['published_date', 'rank', 'title', 'author',
       'description', 'price', 'weeks_on_list']).drop('book_type', axis=1)

# Create our target
y = bestsellers_df['book_type']

In [29]:
X.describe()

Unnamed: 0.1,Unnamed: 0,published_date_2010-01-03,published_date_2010-01-10,published_date_2010-01-17,published_date_2010-01-24,published_date_2010-01-31,published_date_2010-02-07,published_date_2010-02-14,published_date_2010-02-21,published_date_2010-02-28,...,weeks_on_list_598,weeks_on_list_599,weeks_on_list_600,weeks_on_list_601,weeks_on_list_602,weeks_on_list_603,weeks_on_list_604,weeks_on_list_605,weeks_on_list_606,weeks_on_list_607
count,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0,...,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0,51997.0
mean,28655.160971,0.001327,0.001346,0.001346,0.001346,0.001346,0.001346,0.001346,0.001346,0.001346,...,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05,1.9e-05
std,17621.342383,0.036404,0.036667,0.036667,0.036667,0.036667,0.036667,0.036667,0.036667,0.036667,...,0.004385,0.004385,0.004385,0.004385,0.004385,0.004385,0.004385,0.004385,0.004385,0.004385
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13080.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27990.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,43600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,61424.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
# Checkout the balance of our taget values
y.value_counts()

hardcover_fiction                       2610
series_books                            2610
trade_fiction_paperback                 2610
paperback_nonfiction                    2606
hardcover_nonfiction                    2605
picture_books                           2595
combined_print_and_e_book_nonfiction    2289
combined_print_and_e_book_fiction       2164
hardcover_graphic_books                 1850
manga                                   1850
mass_market_paperback                   1850
paperback_graphic_books                 1850
e_book_nonfiction                       1404
e_book_fiction                          1404
hardcover_political_books               1341
young_adult_hardcover                   1135
childrens_middle_grade_hardcover        1134
food_and_fitness                         895
paperback_advice                         865
hardcover_advice                         849
chapter_books                            770
paperback_books                          769
young_adul

In [31]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X,y, random_state=1)

In [32]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [33]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7518216389077207

In [34]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[158,   0,   0, ...,   1,   0,   0],
       [  0,  60,   0, ...,   0,   0,   0],
       [  0,   0,  45, ...,   0,   0,   0],
       ...,
       [  4,   0,   0, ..., 115,   0,   0],
       [  0,   0,   0, ...,   0, 156,   0],
       [  0,   0,   0, ...,   0,   5, 236]], dtype=int64)

In [35]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                                            pre       rec       spe        f1       geo       iba       sup

                             animals       0.86      0.96      1.00      0.91      0.98      0.96       164
                       audio_fiction       0.62      0.95      1.00      0.75      0.97      0.95        63
                    audio_nonfiction       0.58      1.00      1.00      0.74      1.00      1.00        45
                         celebrities       0.49      0.60      0.99      0.54      0.77      0.57       142
                       chapter_books       0.46      0.93      0.98      0.62      0.96      0.91       197
              childrens_middle_grade       0.38      0.91      0.98      0.54      0.94      0.89       167
    childrens_middle_grade_hardcover       0.70      0.83      0.99      0.76      0.91      0.81       282
   combined_print_and_e_book_fiction       0.22      0.29      0.96      0.25      0.53      0.26       546
combined_print_and_e_book_n