In [37]:
from pprint import pprint

import pandas as pd
import numpy as np

import xgboost as xgb

import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.tree import DecisionTreeClassifier

%matplotlib inline
import matplotlib.pyplot as plt

import prepare as pr 
import explore as ex

In [2]:
df = pr.prep_data('all_books.csv')

In [3]:
save = df.copy()

In [4]:
save.head(1)

Unnamed: 0,title,summary,year_published,author,review_count,number_of_ratings,length,genre,rating,reviews,cleaned_title,cleaned_summary,successful,lemmatized_summary,neg,neutral,pos,compound,sentiment
0,Missing in Death,"Aboard the Staten Island ferry, a tourist come...",2009,J.D. Robb,334,9875,77.0,Mystery,4.24,[],missing in death,"aboard the staten island ferry, a tourist come...",False,aboard staten island ferry tourist come across...,0.185,0.804,0.011,-0.9534,very negative


In [5]:
save.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3665 entries, 0 to 3854
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               3665 non-null   object 
 1   summary             3665 non-null   object 
 2   year_published      3665 non-null   object 
 3   author              3665 non-null   object 
 4   review_count        3665 non-null   int64  
 5   number_of_ratings   3665 non-null   int64  
 6   length              3665 non-null   float64
 7   genre               3665 non-null   object 
 8   rating              3665 non-null   float64
 9   reviews             1696 non-null   object 
 10  cleaned_title       3665 non-null   object 
 11  cleaned_summary     3665 non-null   object 
 12  successful          3665 non-null   bool   
 13  lemmatized_summary  3665 non-null   object 
 14  neg                 3665 non-null   float64
 15  neutral             3665 non-null   float64
 16  pos   

## Getting dummies and splitting data

In [6]:
dummy_df = pd.get_dummies(df[['genre','sentiment']], dummy_na=False, drop_first=[True, True])
col_list = dummy_df.columns.tolist()

In [7]:
dummy_df.head()

Unnamed: 0,genre_Business,genre_Chick Lit,genre_Childrens,genre_Christian,genre_Classics,genre_Comics,genre_Fantasy,genre_Fiction,genre_Film,genre_Graphic Novels,...,genre_Short Stories,genre_Thriller,genre_Travel,genre_Urban Fantasy,genre_Vampires,genre_Young Adult,sentiment_neutral,sentiment_positive,sentiment_very negative,sentiment_very positive
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
df = pd.concat([df, dummy_df],axis= 1)

In [9]:
df.head()

Unnamed: 0,title,summary,year_published,author,review_count,number_of_ratings,length,genre,rating,reviews,...,genre_Short Stories,genre_Thriller,genre_Travel,genre_Urban Fantasy,genre_Vampires,genre_Young Adult,sentiment_neutral,sentiment_positive,sentiment_very negative,sentiment_very positive
0,Missing in Death,"Aboard the Staten Island ferry, a tourist come...",2009,J.D. Robb,334,9875,77.0,Mystery,4.24,[],...,0,0,0,0,0,0,0,0,1,0
1,The Last Boyfriend,"Owen is the organizer of the Montgomery clan, ...",2012,Nora Roberts,2545,47392,436.0,Romance,4.09,[],...,0,0,0,0,0,0,0,0,0,1
2,Just Me in the Tub,Taking a bath is a big job. Mercer Mayer's fam...,1994,Gina Mayer,62,19212,24.0,Childrens,4.25,[],...,0,0,0,0,0,0,0,0,0,1
3,Lucy in the Sky,Settling down for a 24-hour flight to Australi...,2007,Paige Toon,628,9524,390.0,Chick Lit,3.95,[],...,0,0,0,0,0,0,0,0,0,1
4,The Rats in the Walls,"""The Rats in the Walls"" is a short story by H....",1924,H.P. Lovecraft,531,9155,25.0,Horror,4.01,[],...,0,0,0,0,0,0,0,0,0,0


In [10]:
train, test = ex.split(df, 'successful')

In [11]:
train.head()

Unnamed: 0,title,summary,year_published,author,review_count,number_of_ratings,length,genre,rating,reviews,...,genre_Short Stories,genre_Thriller,genre_Travel,genre_Urban Fantasy,genre_Vampires,genre_Young Adult,sentiment_neutral,sentiment_positive,sentiment_very negative,sentiment_very positive
1976,Pigs in Heaven,"Mother and adopted daughter, Taylor and Turtle...",1993,Barbara Kingsolver,2420,59600,343.0,Fiction,3.98,[],...,0,0,0,0,0,0,0,0,0,0
2469,Lost in the Never Woods,When children go missing in the small coastal ...,2021,Aiden Thomas,3072,12584,384.0,Fantasy,3.73,[],...,0,0,0,0,0,0,0,0,1,0
2409,Never Judge a Lady by Her Cover,"She is the most powerful woman in Britain, A q...",2014,Sarah MacLean,1853,19892,376.0,Historical Romance,3.96,[],...,0,0,0,0,0,0,0,0,1,0
542,Is Everyone Hanging Out Without Me?,Mindy Kaling has lived many lives: the obedien...,2011,Mindy Kaling,23766,478097,222.0,Nonfiction,3.86,,...,0,0,0,0,0,0,0,0,0,1
3044,The Hangman's Daughter,"Magdalena, the clever and headstrong daughter ...",2008,Oliver Pötzsch,6528,76556,448.0,Historical Fiction,3.74,,...,0,0,0,0,0,0,0,0,1,0


In [12]:
X_train =  train.drop(columns= "successful")
y_train = train['successful']

X_test = test.drop(columns= "successful")
y_test = test['successful']

In [13]:
X_train_num = X_train.select_dtypes(exclude= ['string','object'])
X_train_num = X_train_num.drop(columns = ['neg','pos','neutral','compound'])
X_train_num = X_train_num[['review_count','number_of_ratings','length','rating']]

In [14]:
scaler = StandardScaler()

scaler.fit(X_train_num)

StandardScaler()

In [15]:
number_ls = X_train_num.columns.tolist()
number_ls

['review_count', 'number_of_ratings', 'length', 'rating']

In [16]:
X_train_scaled = scaler.transform(X_train[number_ls])
X_test_scaled = scaler.transform(X_test[number_ls])

In [17]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns= [number_ls])
X_test_scaled = pd.DataFrame(X_test_scaled, columns= [number_ls])

In [18]:
X_train_scaled.head()

Unnamed: 0,review_count,number_of_ratings,length,rating
0,-0.285385,-0.152962,-0.098488,-0.09516
1,-0.24239,-0.294007,0.133131,-1.040061
2,-0.322775,-0.272083,0.087937,-0.170752
3,1.122228,1.102503,-0.782048,-0.548713
4,-0.014492,-0.102095,0.494684,-1.002265


In [19]:
X_train = X_train.drop(columns=['review_count', 'number_of_ratings', 'length', 'rating'])

In [20]:
X_train = X_train.reset_index(drop=True)

In [21]:
X_train.head(1)

Unnamed: 0,title,summary,year_published,author,genre,reviews,cleaned_title,cleaned_summary,lemmatized_summary,neg,...,genre_Short Stories,genre_Thriller,genre_Travel,genre_Urban Fantasy,genre_Vampires,genre_Young Adult,sentiment_neutral,sentiment_positive,sentiment_very negative,sentiment_very positive
0,Pigs in Heaven,"Mother and adopted daughter, Taylor and Turtle...",1993,Barbara Kingsolver,Fiction,[],pigs in heaven,"mother and adopted daughter, taylor and turtle...",mother adopt daughter taylor turtle greer back...,0.086,...,0,0,0,0,0,0,0,0,0,0


In [22]:
X_train_scaled.head(1)

Unnamed: 0,review_count,number_of_ratings,length,rating
0,-0.285385,-0.152962,-0.098488,-0.09516


In [23]:
X_train = pd.concat([X_train, X_train_scaled],axis= 1)

In [24]:
X_train.head()

Unnamed: 0,title,summary,year_published,author,genre,reviews,cleaned_title,cleaned_summary,lemmatized_summary,neg,...,genre_Vampires,genre_Young Adult,sentiment_neutral,sentiment_positive,sentiment_very negative,sentiment_very positive,"(review_count,)","(number_of_ratings,)","(length,)","(rating,)"
0,Pigs in Heaven,"Mother and adopted daughter, Taylor and Turtle...",1993,Barbara Kingsolver,Fiction,[],pigs in heaven,"mother and adopted daughter, taylor and turtle...",mother adopt daughter taylor turtle greer back...,0.086,...,0,0,0,0,0,0,-0.285385,-0.152962,-0.098488,-0.09516
1,Lost in the Never Woods,When children go missing in the small coastal ...,2021,Aiden Thomas,Fantasy,[],lost in the never woods,when children go missing in the small coastal ...,child go miss small coastal town astoria peopl...,0.144,...,0,0,0,0,1,0,-0.24239,-0.294007,0.133131,-1.040061
2,Never Judge a Lady by Her Cover,"She is the most powerful woman in Britain, A q...",2014,Sarah MacLean,Historical Romance,[],never judge a lady by her cover,"she is the most powerful woman in britain, a q...",powerful woman britain queen london underworld...,0.183,...,0,0,0,0,1,0,-0.322775,-0.272083,0.087937,-0.170752
3,Is Everyone Hanging Out Without Me?,Mindy Kaling has lived many lives: the obedien...,2011,Mindy Kaling,Nonfiction,,is everyone hanging out without me,mindy kaling has lived many lives the obedient...,mindy kaling live many life obedient child imm...,0.036,...,0,0,0,0,0,1,1.122228,1.102503,-0.782048,-0.548713
4,The Hangman's Daughter,"Magdalena, the clever and headstrong daughter ...",2008,Oliver Pötzsch,Historical Fiction,,the hangman's daughter,"magdalena, the clever and headstrong daughter ...",magdalena clever headstrong daughter bavarian ...,0.19,...,0,0,0,0,1,0,-0.014492,-0.102095,0.494684,-1.002265


In [25]:
X_train.columns

Index([                   'title',                  'summary',
                 'year_published',                   'author',
                          'genre',                  'reviews',
                  'cleaned_title',          'cleaned_summary',
             'lemmatized_summary',                      'neg',
                        'neutral',                      'pos',
                       'compound',                'sentiment',
                 'genre_Business',          'genre_Chick Lit',
                'genre_Childrens',          'genre_Christian',
                 'genre_Classics',             'genre_Comics',
                  'genre_Fantasy',            'genre_Fiction',
                     'genre_Film',     'genre_Graphic Novels',
       'genre_Historical Fiction', 'genre_Historical Romance',
                  'genre_History',             'genre_Horror',
                    'genre_Humor',             'genre_Memoir',
                    'genre_Music',            'genre_My

In [26]:
X_train_scaled.columns

MultiIndex([(     'review_count',),
            ('number_of_ratings',),
            (           'length',),
            (           'rating',)],
           )

## Using count vectorizer

In [27]:
data = df.copy()

In [28]:
data.head(1)

Unnamed: 0,title,summary,year_published,author,review_count,number_of_ratings,length,genre,rating,reviews,...,genre_Short Stories,genre_Thriller,genre_Travel,genre_Urban Fantasy,genre_Vampires,genre_Young Adult,sentiment_neutral,sentiment_positive,sentiment_very negative,sentiment_very positive
0,Missing in Death,"Aboard the Staten Island ferry, a tourist come...",2009,J.D. Robb,334,9875,77.0,Mystery,4.24,[],...,0,0,0,0,0,0,0,0,1,0


In [29]:
document = data['lemmatized_summary']

In [30]:
document

0       aboard staten island ferry tourist come across...
1       owen organizer montgomery clan run family cons...
2       take bath big job mercer mayer famous little c...
3       settle flight australia lucy find text message...
4       rat wall short lovecraft write augustseptember...
                              ...                        
3850    time adolf hitler attempt take western world a...
3851    enchant sequel number one bestseller five peop...
3852    one war hero search oldest language ancient ar...
3853    unsolved murder farm family still haunt white ...
3854    haden lord disgrace prince underrealm send mor...
Name: lemmatized_summary, Length: 3665, dtype: object

In [32]:
cv = CountVectorizer()

bag_of_words = cv.fit_transform(document)

bag_of_words.todense()

<3665x34609 sparse matrix of type '<class 'numpy.int64'>'
	with 295542 stored elements in Compressed Sparse Row format>

In [34]:
bow = pd.DataFrame(bag_of_words.todense())
bow.columns = cv.get_feature_names_out()

In [35]:
bow

Unnamed: 0,aa,aahz,aambc,aanen,aarav,aaron,aaronovitch,aaronsohn,ab,aba,...,zuckoff,zuckoffs,zula,zum,zumindest,zuni,zusak,zuversicht,zwanzig,zwischen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3662,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3663,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
bow.apply(lambda row: row / row.sum(), axis=1)

Unnamed: 0,aa,aahz,aambc,aanen,aarav,aaron,aaronovitch,aaronsohn,ab,aba,...,zuckoff,zuckoffs,zula,zum,zumindest,zuni,zusak,zuversicht,zwanzig,zwischen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
X = df['lemmatized_summary']
y = df['successful']

In [None]:
X_train, X_test, y_train, y_test = \
train_test_split(X, y, 
                 test_size=0.2, 
                 random_state=42)