In [1]:
# imports

import pandas as pd
import numpy as np
import unicodedata
import re

from bs4 import BeautifulSoup
import requests
import os
import json

import nltk
from nltk.corpus import stopwords

import prepare as prep

from sklearn.model_selection import train_test_split
import sklearn.model_selection

## import data

In [2]:
# original df

f = pd.read_csv('all_books.csv', index_col=0)

In [3]:
f.head()

Unnamed: 0,title,summary,year_published,author,review_count,number_of_ratings,length,genre,rating,reviews,book_tag
48,Missing in Death,"Aboard the Staten Island ferry, a tourist come...",2009,J.D. Robb,334,9875,77.0,Mystery,4.24,[],
53,The Last Boyfriend,"Owen is the organizer of the Montgomery clan, ...",2012,Nora Roberts,2545,47392,436.0,Romance,4.09,[],
205,Just Me in the Tub,Taking a bath is a big job. Mercer Mayer's fam...,1994,Gina Mayer,62,19212,24.0,Childrens,4.25,[],
104,Lucy in the Sky,Settling down for a 24-hour flight to Australi...,2007,Paige Toon,628,9524,390.0,Chick Lit,3.95,[],
334,The Rats in the Walls,"""The Rats in the Walls"" is a short story by H....",1924,H.P. Lovecraft,531,9155,25.0,Horror,4.01,[],


In [4]:
# nytbs df

b = pd.read_csv('books_feat_on_NYBS').drop(columns = 'Unnamed: 0')

In [5]:
b.shape

(1045, 4)

In [6]:
# looking at best sellers

b.isna().sum()

Date        0
Book        0
Author      0
Month     968
dtype: int64

In [7]:
# read in the dataframe of the books

df = prep.prep_data('all_books.csv')

In [8]:
# books with neutral sentiment 

df[df['sentiment'] == 'neutral']

Unnamed: 0,title,summary,year_published,author,review_count,number_of_ratings,length,genre,rating,reviews,cleaned_title,cleaned_summary,successful,lemmatized_summary,neg,neutral,pos,compound,sentiment
64,Thing Explainer: Complicated Stuff in Simple W...,In Thing Explainer: Complicated Stuff in Simpl...,2015,Randall Munroe,939,10777,64.0,Science,4.14,[],thing explainer complicated stuff in simple words,in thing explainer complicated stuff in simple...,False,thing explainer complicate stuff simple word t...,0.0,1.0,0.0,0.0,neutral
96,First Impressions,The first biography of Monet written especiall...,1991,Ann Waldron,2,23,92.0,Nonfiction,3.57,,first impressions,the first biography of monet written especiall...,False,first biography monet write especially young r...,0.0,1.0,0.0,0.0,neutral
284,Nejma,all of the unsleeping. gold sweeping. poems. i...,2014,Nayyirah Waheed,533,5881,172.0,Poetry,4.02,,nejma,all of the unsleeping. gold sweeping. poems. i...,False,unsleeping gold sweeping poem hand,0.0,1.0,0.0,0.0,neutral
320,Redhead by the Side of the Road,Micah Mortimer is a creature of habit. A self-...,2020,Anne Tyler,5334,41838,178.0,Fiction,3.62,[],redhead by the side of the road,micah mortimer is a creature of habit. a selfe...,False,micah mortimer creature habit selfemployed tec...,0.088,0.823,0.088,0.0,neutral
374,King Arthur,A ruler said to be the model of goodness over ...,1918,Andrew Lang,62,598,192.0,Fantasy,3.5,,king arthur,a ruler said to be the model of goodness over ...,False,ruler say model goodness evil formidable comra...,0.084,0.832,0.084,-0.0,neutral
420,The Present,Old Version,Inc.,Kenneth Thomas,79,2383,200.0,Science Fiction,3.59,,the present,old version,False,old version,0.0,1.0,0.0,0.0,neutral
487,The Mermaid's Voice Returns in This One,Goodreads Choice Award-winning poet and USA TO...,2019,Amanda Lovelace,2185,16801,210.0,Poetry,3.68,[],the mermaid's voice returns in this one,goodreads choice awardwinning poet and usa tod...,False,goodreads choice awardwinning poet usa today b...,0.0,1.0,0.0,0.0,neutral
490,You Learn by Living: Eleven Keys for a More Fu...,Mrs. Roosevelt expresses her philosophy of lif...,1960,Eleanor Roosevelt,566,4084,211.0,Nonfiction,3.98,[],you learn by living eleven keys for a more ful...,mrs. roosevelt expresses her philosophy of lif...,False,roosevelt express philosophy life relate exper...,0.0,1.0,0.0,0.0,neutral
619,"Burning in Water, Drowning in Flame","Burning in Water, Drowning in Flame is poetry ...",1974,Charles Bukowski,482,8041,232.0,Poetry,4.08,[],"burning in water, drowning in flame","burning in water, drowning in flame is poetry ...",False,burn water drown flame poetry full gamble drin...,0.0,1.0,0.0,0.0,neutral
1318,H is for Hawk,"As a child, Helen Macdonald was determined to ...",2014,Helen Macdonald,9407,70020,300.0,Nonfiction,3.74,,h is for hawk,"as a child, helen macdonald was determined to ...",False,child helen macdonald determine become falcone...,0.127,0.76,0.113,-0.0,neutral


## cleaning, exploring`

In [9]:
# how many nan

df.isna().sum()

title                    0
summary                  0
year_published           0
author                   0
review_count             0
number_of_ratings        0
length                   0
genre                    0
rating                   0
reviews               1976
cleaned_title            0
cleaned_summary          0
successful               0
lemmatized_summary       0
neg                      0
neutral                  0
pos                      0
compound                 0
sentiment                0
dtype: int64

**No NaNs appear in important columns.**

## train—test split

In [10]:
def split(df):
    train, test = train_test_split(df, test_size = .2, random_state = 42, stratify = df.target)
    return train, test

In [11]:
train, test = split(df)
train.shape, test.shape

AttributeError: 'DataFrame' object has no attribute 'target'

#### We'll be doing k-folds, so no need to have a validate portion here

### Does the length of a book have a relationship to its success ?

In [None]:
train.sample()

In [None]:
# successful books

besties = train[train['target'] == 'best seller']

**129 bestseller books in train. Assigning to a variable in order to explore page length.**

In [None]:
# mean length of pages

besties['length'].mean()

In [None]:
# median length of pages

besties['length'].median()

**The mean length of best sellers is 474 pages, the median is 400 pages.**

In [None]:
# standard deviation

besties['length'].std()

**Standard deviation of about 208 pages. So, 68% of NYT best sellers have a length of 266 to 682 pages.**

In [None]:
long_books = besties[besties['length'] > 682]
long_books.shape

**19 books have more than 682 pages.** Now to compare with generalised / random books list.

In [None]:
# isolating unsuccessful books 

sadness = train[train['target'] == 'unsuccessful']

In [None]:
sadness['length'].max()

In [None]:
sadness[sadness['length'] > 3000]