In [1]:
# imports

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt


import unicodedata
import re

from bs4 import BeautifulSoup
import requests
import os
import json

import nltk
from nltk.corpus import stopwords

import prepare as prep

from sklearn.model_selection import train_test_split
import sklearn.model_selection

from scipy import stats
from scipy.stats import norm, binom

## import data

In [2]:
# original df

f = pd.read_csv('all_books.csv', index_col=0)

In [3]:
f.shape

Unnamed: 0,title,summary,year_published,author,review_count,number_of_ratings,length,genre,rating,reviews,book_tag
48,Missing in Death,"Aboard the Staten Island ferry, a tourist come...",2009,J.D. Robb,334,9875,77.0,Mystery,4.24,[],
53,The Last Boyfriend,"Owen is the organizer of the Montgomery clan, ...",2012,Nora Roberts,2545,47392,436.0,Romance,4.09,[],
205,Just Me in the Tub,Taking a bath is a big job. Mercer Mayer's fam...,1994,Gina Mayer,62,19212,24.0,Childrens,4.25,[],
104,Lucy in the Sky,Settling down for a 24-hour flight to Australi...,2007,Paige Toon,628,9524,390.0,Chick Lit,3.95,[],
334,The Rats in the Walls,"""The Rats in the Walls"" is a short story by H....",1924,H.P. Lovecraft,531,9155,25.0,Horror,4.01,[],


In [4]:
# nytbs df

b = pd.read_csv('books_feat_on_NYBS').drop(columns = 'Unnamed: 0')

In [5]:
b.shape

(1045, 4)

In [6]:
# looking at best sellers

b.isna().sum()

Date        0
Book        0
Author      0
Month     968
dtype: int64

In [7]:
# read in the dataframe of the books

df = prep.prep_data('all_books.csv')

In [8]:
# dropping 'Picture Books'

df = df[df['genre'] != 'Picture Books']

Unnamed: 0,title,summary,year_published,author,review_count,number_of_ratings,length,genre,rating,reviews,cleaned_title,cleaned_summary,successful,lemmatized_summary,neg,neutral,pos,compound,sentiment
64,Thing Explainer: Complicated Stuff in Simple W...,In Thing Explainer: Complicated Stuff in Simpl...,2015,Randall Munroe,939,10777,64.0,Science,4.14,[],thing explainer complicated stuff in simple words,in thing explainer complicated stuff in simple...,False,thing explainer complicate stuff simple word t...,0.0,1.0,0.0,0.0,neutral
96,First Impressions,The first biography of Monet written especiall...,1991,Ann Waldron,2,23,92.0,Nonfiction,3.57,,first impressions,the first biography of monet written especiall...,False,first biography monet write especially young r...,0.0,1.0,0.0,0.0,neutral
284,Nejma,all of the unsleeping. gold sweeping. poems. i...,2014,Nayyirah Waheed,533,5881,172.0,Poetry,4.02,,nejma,all of the unsleeping. gold sweeping. poems. i...,False,unsleeping gold sweeping poem hand,0.0,1.0,0.0,0.0,neutral
320,Redhead by the Side of the Road,Micah Mortimer is a creature of habit. A self-...,2020,Anne Tyler,5334,41838,178.0,Fiction,3.62,[],redhead by the side of the road,micah mortimer is a creature of habit. a selfe...,False,micah mortimer creature habit selfemployed tec...,0.088,0.823,0.088,0.0,neutral
374,King Arthur,A ruler said¬†to be the model of goodness over ...,1918,Andrew Lang,62,598,192.0,Fantasy,3.5,,king arthur,a ruler said to be the model of goodness over ...,False,ruler say model goodness evil formidable comra...,0.084,0.832,0.084,-0.0,neutral
420,The Present,Old Version,Inc.,Kenneth Thomas,79,2383,200.0,Science Fiction,3.59,,the present,old version,False,old version,0.0,1.0,0.0,0.0,neutral
487,The Mermaid's Voice Returns in This One,Goodreads Choice Award-winning poet and USA TO...,2019,Amanda Lovelace,2185,16801,210.0,Poetry,3.68,[],the mermaid's voice returns in this one,goodreads choice awardwinning poet and usa tod...,False,goodreads choice awardwinning poet usa today b...,0.0,1.0,0.0,0.0,neutral
490,You Learn by Living: Eleven Keys for a More Fu...,Mrs. Roosevelt expresses her philosophy of lif...,1960,Eleanor Roosevelt,566,4084,211.0,Nonfiction,3.98,[],you learn by living eleven keys for a more ful...,mrs. roosevelt expresses her philosophy of lif...,False,roosevelt express philosophy life relate exper...,0.0,1.0,0.0,0.0,neutral
619,"Burning in Water, Drowning in Flame","Burning in Water, Drowning in Flame is poetry ...",1974,Charles Bukowski,482,8041,232.0,Poetry,4.08,[],"burning in water, drowning in flame","burning in water, drowning in flame is poetry ...",False,burn water drown flame poetry full gamble drin...,0.0,1.0,0.0,0.0,neutral
1318,H is for Hawk,"As a child, Helen Macdonald was determined to ...",2014,Helen Macdonald,9407,70020,300.0,Nonfiction,3.74,,h is for hawk,"as a child, helen macdonald was determined to ...",False,child helen macdonald determine become falcone...,0.127,0.76,0.113,-0.0,neutral


In [None]:
# setting year to int

df.year_published = df.year_published.astype('int64')

In [None]:
# finding rows with non-years

[re.findall(r"[^0-9]", str(x)) for x in df.year_published]

# setting to a series and DF of non-years
empty = pd.DataFrame(pd.Series([re.findall(r"[^0-9]", str(x)) for x in df.year_published]))



## cleaning, exploring`

In [9]:
# how many nan

df.isna().sum()

title                    0
summary                  0
year_published           0
author                   0
review_count             0
number_of_ratings        0
length                   0
genre                    0
rating                   0
reviews               1976
cleaned_title            0
cleaned_summary          0
successful               0
lemmatized_summary       0
neg                      0
neutral                  0
pos                      0
compound                 0
sentiment                0
dtype: int64

**No NaNs appear in important columns.**

## train‚Äîtest split

In [10]:
def split(df):
    train, test = train_test_split(df, test_size = .2, random_state = 42, stratify = df.succesful)
    return train, test

**1 / True = successful (bestseller), 0 / False = not a bestseller**

In [11]:
train, test = split(df)
train.shape, test.shape

AttributeError: 'DataFrame' object has no attribute 'target'

#### We'll be doing k-folds, so no need to have a validate portion here

### Does the length of a book have a relationship to its success ?

In [None]:
train.sample()

In [None]:
# successful books

besties = train[train['target'] == 'best seller']

**127 bestseller books in train. Assigning to a variable in order to explore page length.**

In [None]:
# mean length of pages

besties['length'].mean()

In [None]:
# median length of pages

besties['length'].median()

**The mean length of best sellers is 477 pages, the median is 400 pages.**

In [None]:
# standard deviation

besties['length'].std()

**Standard deviation of about 205 pages. So, 68% of NYT bestsellers have a length of 272 to 682 pages.**

In [None]:
long_books = besties[besties['length'] > 682]
long_books.shape

**19 books have more than 682 pages.** Now to compare with generalised / random books list.

### Non-bestsellers

**18 books have more than 677 pages.** Now to compare with generalised / random books list.

In [None]:
# isolating unsuccessful books 

sadness = train[train['target'] == False]

In [None]:
sadness['length'].max(), sadness['length'].min()

In [None]:
sadness['length'].mean()

In [None]:
# standard deviation of non-bestsellers

sadness['length'].std()

**Standard deviation of about 175 pages. So, 68% of non-bestsellers have a length between 180 and 530 pages.**

### Exploring length and year published

H_O : There is no relationship between the length of a book and the year that it was published.

H_a : There is a relationship between the length of a book and the year that it was published.

In [None]:
plt.figure(figsize=(12, 8))

plt.title('Non-Bestseller Lengths, By Year Published')
sns.barplot(y = sadness['length'], x = sadness['year_published'])

plt.xticks(rotation = 45)

plt.show()


In [None]:
plt.figure(figsize=(12, 8))


plt.title('Bestseller Lengths, By Year Published')

sns.barplot(y = besties['length'], x = besties['year_published'])

plt.xticks(rotation = 45)

plt.show()

In [None]:
plt.figure(figsize=(20, 8))


plt.title('All Books Lengths, By Year')

sns.barplot(y = train['length'], x = train['year_published'])

plt.xticks(rotation = 45)

plt.show()

**The distribution for all there dataframes is relatively uniform; a chi-square test is appropriate here.**

In [None]:
# chi-square function

def chi_sq(a, b):
    '''
    This function will take in two arguments in the form of two discrete variables 
    and runs a chi^2 test to determine if the the two variables are independent of 
    each other and prints the results based on the findings.
    '''
    alpha = 0.05
    
    result = pd.crosstab(a, b)

    chi2, p, degf, expected = stats.chi2_contingency(result)

    print(f'Chi-square  : {chi2:.4f}') 
    print("")
    print(f'P-value : {p:.4f}')
    print("")
    if p / 2 > alpha:
        print("We fail to reject the null hypothesis.")
    else:
        print(f'We reject the null hypothesis ; there is a relationship between the target variable and the feature examined.')
        

In [None]:
# chi-square on train for length and year published

a = train['length']
b = train['year_published']

In [None]:
chi_sq(a, b)

In [None]:
# chi-square on besties  for length and year published

v = besties['length']
w = besties['year_published']

In [None]:
chi_sq(v, w)

In [None]:
# chi-square on sadness for length and year published

t = sadness['length']
u = sadness['year_published']

In [None]:
chi_sq(t, u)

**TAKEAWAYS: There is a relationship between the length of the book (positive correlation) and the year that it was published, particularly for books not on the NYT Best Seller list, and for the train dataset. The length of the book and the year that it was published did not have a relationship for NYT Best Sellers.**

## Exploring length and successs

H_O : There is no relationship between the length of a book and its landing on the NYT Best Seller list.

H_a : There is a relationship between the length of a book and its landing on the NYT Best Seller list.

In [None]:
# plotting all books

def book_len_success():
    
    '''
    this function uses the training dataset to plot 
    the target ('successful') against the length in 
    pages of each book. it puts out a barplot.
    '''
    plt.figure(figsize=(8, 5))

    plt.title('Success Of Book Based On Average Page Length')

    graphed = sns.barplot(x = train['successful'], y = train['length'], palette = 'CMRmap')

    # set xtick labels and properties
    plt.xticks([0, 1], 
               [ 'Not On List', 'Bestseller'],
               rotation = 25)

    # plt.legend([],[]) --this line unnecessary here
    plt.yticks(np.arange(0, 600, 100))

    # display y axis grids
    # graphed.yaxis.grid(True)

    plt.ylabel('Count')
    plt.xlabel('Appearance On NYT Best Seller List')

    plt.show()

In [None]:
book_len_success

**It appears that bestsellers have, on average, a longer average page count than books that are not NYT Best Sellers.**

In [None]:
# chi-square on train for length and success

r = train['length']
s = train['successful']

In [None]:
chi_sq(r, s)

**TAKEAWAYS: It appears, both from the bar plot and from the chi-square test, that there is a significant relationship between book length and its appearing on the NYT Best Seller list. Bestsellers, on average, have a longer page length than non-bestsellers. This discovery is also supported by the cumulative density function results.**

### Using .cdf on bestsellers and non-bestsellers

In [None]:
# bestsellers

# mean
m = 477

# standard deviation
s = 205

# Define the normal distribution
bestseller_len = stats.norm(m, s)

## Find the value where 95% of the values / variables are less than unknown-value-X : Use PPF.
best_cdf = bestseller_len.cdf(191)
best_cdf

**8pc chance of a successful book having a length of 191 pages or less.**

In [None]:
# mean of unsuccessful books
m = 355

# standard deviation
s = 175

nonbest_length = stats.norm(m, s)

nonbest_cdf = nonbest_length.cdf(191)
nonbest_cdf

**17.4pc chance of an unsuccessful book having 200 or less pages.**

### What about sentiment score distribution ?¬∂


In [None]:
# plotting bestseller books : length vs sentiment score


plt.figure(figsize=(8, 5))

plt.title('Sentiment Score Of Book Summary Based On Page Length : Bestsellers')

sns.barplot(x = besties['sentiment'], y = besties['length'])


plt.show()

In [None]:
# plotting unsuccessful books : length vs sentiment score


plt.figure(figsize=(8, 5))

plt.title('Sentiment Score Of Book Summary Based On Page Length : Non-Bestsellers')

sns.barplot(x = sadness['sentiment'], y = sadness['length'])

plt.show()


**TAKEAWAYS: Length of book does not seem to have much relationship to the book-summary sentiment score. There was one bestseller with a neutral score, which led to it not being able to have an average page length calculation.**

In [None]:
a = besties[besties['sentiment'] == 'very negative']
a['title'].value_counts().sum()

In [None]:
b = besties[besties['sentiment'] == 'negative']
b['title'].value_counts().sum()

In [None]:
e = besties[besties['sentiment'] == 'neutral']
e['title'].value_counts().sum()

In [None]:
d = besties[besties['sentiment'] == 'positive']
d['title'].value_counts().sum()

In [None]:
c = besties[besties['sentiment'] == 'very positive']
c['title'].value_counts().sum()

**Of the bestseller sentiment scores, 65 have very negative scores, 7 have negative, 1 has neutral, 11 have positive and 43 have very positive.**