## New prepare & explore using weekend_dataset.csv

In [1]:
# imports

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt


import unicodedata
import re

from bs4 import BeautifulSoup
import requests
import os
import json

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import prepare as prep

from sklearn.model_selection import train_test_split
import sklearn.model_selection

from scipy import stats
from scipy.stats import norm, binom



from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
# original df
df = pd.read_csv('weekend_dataset.csv')

# combining review cols
df['reviews'] = df['Review 1'] + ' ' + df['Review 2'] + ' ' + df['Review 3'] + ' ' + df['Review 4'] + ' ' +  df['Review 5']

# dropping cols
df.drop(['Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5'], axis = 1, inplace = True)

# renaming columns
df = df.rename(columns = {'Book Name': 'title','Synopsis':'summary', 'Link':'link', 'page_count':'length'})

In [3]:
df.head()

Unnamed: 0,title,summary,length,year,review_count,rating,rating_count,genre,author,publisher,link,scraped_at,reviews
0,Raintree County,"Throughout a single day in 1892, John Shawness...","1088 pages, Paperback","First published January 1, 1948",91,3.95,"1,035 ratings",Fiction,Ross Lockridge Jr.,"April 1, 1994 by Penguin Books",https://www.goodreads.com/book/show/257233.Rai...,2023-03-25 01:31:35.686587,"Did they stop to think, in the midst of their ..."
1,The Eliots of Damerosehay #2\nPilgrim's Inn,"After WW II, Lucilla Eliot's soldier son Georg...","352 pages, Hardcover","First published January 1, 1948",238,4.35,"1,895 ratings",Fiction,Elizabeth Goudge,"March 21, 2013 by Amereon Limited",https://www.goodreads.com/book/show/205621.Pil...,2023-03-25 01:31:52.336480,I think I will start a bookshelf containing bo...
2,"An A.J. Cronin Trilogy: The Northern Light, Th...",Omnibus edition of two unabridged novels: The ...,Hardcover,"Published January 1, 1958",0,4.5,2 ratings,,A.J. Cronin,"January 1, 1958 by Little, Brown and Company",https://www.goodreads.com/book/show/2398522.An...,2023-03-25 01:32:06.053123,no_review no_review no_review no_review no_review
3,The Naked and the Dead by Norman Mailer Lesson...,The Naked and the Dead lesson plan contains a ...,"328 pages, Kindle Edition","First published June 18, 2012",0,0.0,0 ratings,,BookRags,"June 18, 2012",https://www.goodreads.com/book/show/20497230-t...,2023-03-25 01:32:19.143691,no_review no_review no_review no_review no_review
4,The Big Fisherman,"It was a calm, early summer noon in the southe...","459 pages, Hardcover","First published January 1, 1936",130,4.08,"1,795 ratings",Historical Fiction,Lloyd C. Douglas,"January 1, 1954",https://www.goodreads.com/book/show/778667.The...,2023-03-25 01:32:32.567861,There was a thing in the first half of the 20t...


In [4]:
df.dtypes

title            object
summary          object
length           object
year             object
review_count      int64
rating          float64
rating_count     object
genre            object
author           object
publisher        object
link             object
scraped_at       object
reviews          object
dtype: object

In [5]:
df.isna().sum()

title             0
summary         121
length           18
year             32
review_count      0
rating            0
rating_count      0
genre           297
author            0
publisher        25
link              0
scraped_at        0
reviews         398
dtype: int64

In [6]:
# convert 'year' column to string type using .astype()
df['year'] = df['year'].astype(str)

# extract the year from the 'year' column using the lambda function
df['year'] = df['year'].apply(lambda x: x[-4:])

In [7]:
df.head(3)

Unnamed: 0,title,summary,length,year,review_count,rating,rating_count,genre,author,publisher,link,scraped_at,reviews
0,Raintree County,"Throughout a single day in 1892, John Shawness...","1088 pages, Paperback",1948,91,3.95,"1,035 ratings",Fiction,Ross Lockridge Jr.,"April 1, 1994 by Penguin Books",https://www.goodreads.com/book/show/257233.Rai...,2023-03-25 01:31:35.686587,"Did they stop to think, in the midst of their ..."
1,The Eliots of Damerosehay #2\nPilgrim's Inn,"After WW II, Lucilla Eliot's soldier son Georg...","352 pages, Hardcover",1948,238,4.35,"1,895 ratings",Fiction,Elizabeth Goudge,"March 21, 2013 by Amereon Limited",https://www.goodreads.com/book/show/205621.Pil...,2023-03-25 01:31:52.336480,I think I will start a bookshelf containing bo...
2,"An A.J. Cronin Trilogy: The Northern Light, Th...",Omnibus edition of two unabridged novels: The ...,Hardcover,1958,0,4.5,2 ratings,,A.J. Cronin,"January 1, 1958 by Little, Brown and Company",https://www.goodreads.com/book/show/2398522.An...,2023-03-25 01:32:06.053123,no_review no_review no_review no_review no_review


### this works if the page-count is 4 digits, but with books of less than 100 pages, it would return letters, too. Need to use RegEx

In [8]:
# isolating the page count with RegEx

df['length'] = [re.findall(r'\d', str(x)) for x in df['length']]


In [9]:
# joining the pages together

df["length"] = df["length"].str.join("")

In [10]:
df.sample()

Unnamed: 0,title,summary,length,year,review_count,rating,rating_count,genre,author,publisher,link,scraped_at,reviews
2432,Otherworld #11\nWaking the Witch,"At twenty-one, Savannah Levine-orphaned daught...",309,2010,998,4.17,"18,002 ratings",Urban Fantasy,Kelley Armstrong,"July 27, 2010 by Dutton Adult",https://www.goodreads.com/book/show/6725785-wa...,2023-03-25 18:48:18.604054,I loved Savannah finally getting her own story...


In [33]:
# looking at books with no publisher

df[df['genre'].isna()].sample()

Unnamed: 0,title,summary,length,year,review_count,rating,rating_count,genre,author,publisher,link,scraped_at,reviews
80,Tinker Tailor Soldier Spy by John le Carré (Bo...,Unlock the more straightforward side of Tinker...,0,2019,0,3.75,4 ratings,,Bright Summaries,BrightSummaries.com,https://www.goodreads.com/book/show/45183621-t...,2023-03-25 01:50:23.895206,no_review no_review no_review no_review no_review


In [12]:
# dropping duplicate title-author combos

df.drop_duplicates(subset = ['title', 'author'], inplace = True)

In [13]:
# clean up publishers

df.publisher = df.publisher.astype(str)

df.publisher = [s.split('by')[1].strip() if len(s.split('by')) > 1 else '' for s in df.publisher]

In [14]:
df.isna().sum()

title             0
summary         111
length            0
year              0
review_count      0
rating            0
rating_count      0
genre           282
author            0
publisher         0
link              0
scraped_at        0
reviews         364
dtype: int64

# Using prepare function prep_data(filename)

In [15]:
# ...or not



In [16]:
# replacing NaNs

df.replace(to_replace = '', value = 0, inplace = True)

In [38]:
df.replace(to_replace = 'NaN', value = 0, inplace = True)

In [24]:
df.isna().sum()

title             0
summary         111
length            0
year              0
review_count      0
rating            0
rating_count      0
genre           282
author            0
publisher         0
link              0
scraped_at        0
reviews         364
dtype: int64

In [25]:
# convert to int

df['length'] = df['length'].astype('int64')



In [36]:
df['year']['9, 2']

KeyError: '9, 2'

In [26]:
pd.to_numeric(df['year'])

ValueError: Unable to parse string "9, 2" at position 516

In [21]:
df.dtypes

title            object
summary          object
length            int64
year             object
review_count      int64
rating          float64
rating_count     object
genre            object
author           object
publisher        object
link             object
scraped_at       object
reviews          object
dtype: object

In [20]:

df['year'] = df['year'].astype('int64')

ValueError: invalid literal for int() with base 10: 'nan'

In [None]:
sns.barplot(y = df['length'], x = df['year'])