In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup
import requests
import unicodedata
import re
import os
import json

import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from xgboost import XGBClassifier as xgb

from scipy import stats

import prepare as prep
import explore as ex

seed = 42

import warnings
warnings.filterwarnings("ignore")

In [42]:
df = pd.read_csv('weekend_dataset.csv', index_col=0)

In [38]:
#df = df.rename(columns = {'Book Name': 'title','Synopsis':'summary', 'Link':'link', 'page_count':'length'})

In [39]:
df.to_csv('weekend_dataset.csv')

In [5]:
df['reviews'] = df['Review 1'] + ' ' + df['Review 2'] + ' ' + df['Review 3'] + ' ' + df['Review 4'] + ' ' + df['Review 5']

In [40]:
df.columns

Index(['title', 'summary', 'length', 'year', 'review_count', 'rating',
       'rating_count', 'genre', 'author', 'publisher', 'Review 1', 'Review 2',
       'Review 3', 'Review 4', 'Review 5', 'link', 'scraped_at'],
      dtype='object')

In [24]:
# original df
df = pd.read_csv('weekend_dataset.csv')

# combining review cols
df['reviews'] = df['Review 1'] + ' ' + df['Review 2'] + ' ' + df['Review 3'] + ' ' + df['Review 4'] + ' ' +  df['Review 5']

# dropping cols
df.drop(['Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5'], axis = 1, inplace = True)

#rename columns
df = df.rename(columns = {'Book Name': 'title','Synopsis':'summary', 'Link':'link', 'page_count':'length'})

In [25]:
# convert 'year' column to string type using .astype()
df['year'] = df['year'].astype(str)

# extract the year from the 'year' column using the lambda function
df['year'] = df['year'].apply(lambda x: x[-4:])

In [26]:
# use regex on col to isolate page length

df['length'] = [re.findall(r'\d', str(x)) for x in df['length']]

# join the page numbers together
df["length"]= df["length"].str.join("")

In [27]:
df.rating = df['year'].astype(str)

# remove the last 8 characters from each string and return the result
df.rating_count = [s[:-8] for s in df.rating_count]

In [28]:
# drop duplicate title-author combos

df.drop_duplicates(subset = ['title', 'author'], inplace = True)

In [29]:
df.publisher = df.publisher.astype(str)

df.publisher = [s.split('by')[1].strip() if len(s.split('by')) > 1 else '' for s in df.publisher]

In [21]:
def get_data(file):
    '''
    Will pull the current data from the 'almost_there' csv file, and prep it for deeper cleaning.
    '''
    # original df
    df = pd.read_csv('weekend_dataset.csv', index_col=0)    
    # combining review cols
    df['reviews'] = df['Review 1'] + ' ' + df['Review 2'] + ' ' + df['Review 3'] + ' ' + df['Review 4'] + ' ' +  df['Review 5']
    # dropping unneeded cols
    df.drop(['Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5'], axis = 1, inplace = True)
    #rename columns
    df = df.rename(columns={'Book Name':'title','Synopsis':'summary', 'Link':'link', 'page_count':'length'})
    
    df = df.reset_index()
                                               
    return df


In [26]:
df = get_data('weekend_dataset.csv')

In [27]:
df

Unnamed: 0,index,title,summary,length,year,review_count,rating,rating_count,genre,author,publisher,link,scraped_at,reviews
0,0,Raintree County,"Throughout a single day in 1892, John Shawness...","1088 pages, Paperback","First published January 1, 1948",91,3.95,"1,035 ratings",Fiction,Ross Lockridge Jr.,"April 1, 1994 by Penguin Books",https://www.goodreads.com/book/show/257233.Rai...,2023-03-25 01:31:35.686587,"Did they stop to think, in the midst of their ..."
1,1,The Eliots of Damerosehay #2\nPilgrim's Inn,"After WW II, Lucilla Eliot's soldier son Georg...","352 pages, Hardcover","First published January 1, 1948",238,4.35,"1,895 ratings",Fiction,Elizabeth Goudge,"March 21, 2013 by Amereon Limited",https://www.goodreads.com/book/show/205621.Pil...,2023-03-25 01:31:52.336480,I think I will start a bookshelf containing bo...
2,2,"An A.J. Cronin Trilogy: The Northern Light, Th...",Omnibus edition of two unabridged novels: The ...,Hardcover,"Published January 1, 1958",0,4.50,2 ratings,,A.J. Cronin,"January 1, 1958 by Little, Brown and Company",https://www.goodreads.com/book/show/2398522.An...,2023-03-25 01:32:06.053123,no_review no_review no_review no_review no_review
3,3,The Naked and the Dead by Norman Mailer Lesson...,The Naked and the Dead lesson plan contains a ...,"328 pages, Kindle Edition","First published June 18, 2012",0,0.00,0 ratings,,BookRags,"June 18, 2012",https://www.goodreads.com/book/show/20497230-t...,2023-03-25 01:32:19.143691,no_review no_review no_review no_review no_review
4,4,The Big Fisherman,"It was a calm, early summer noon in the southe...","459 pages, Hardcover","First published January 1, 1936",130,4.08,"1,795 ratings",Historical Fiction,Lloyd C. Douglas,"January 1, 1954",https://www.goodreads.com/book/show/778667.The...,2023-03-25 01:32:32.567861,There was a thing in the first half of the 20t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4886,4886,Ada Byron Lovelace and the Thinking Machine,"Ada Lovelace, the daughter of the famous roman...","40 pages, Hardcover","First published October 13, 2015",169,4.20,757 ratings,Picture Books,Laurie Wallmark,"October 13, 2015 by Creston Books",https://www.goodreads.com/book/show/24694140-a...,2023-03-26 02:56:50.121486,This woman is the daughter of Lord Byron - the...
4887,4887,Warriors of the Lathar #9\nRescued by her Alie...,"Prepared to put her life on the line, she didn...","192 pages, Kindle Edition","First published March 31, 2020",89,4.43,"1,011 ratings",Romance,Mina Carter,"April 28, 2020 by Mina Carter",https://www.goodreads.com/book/show/51286105-r...,2023-03-26 02:56:59.711610,
4888,4888,Saved By Hope,Jenny Lowe was 34 when she sought help from fe...,"256 pages, Paperback","Published September 1, 2022",27,4.68,38 ratings,Nonfiction,Jenny Lowe,"September 1, 2022 by Muse Literary",https://www.goodreads.com/book/show/62360044-s...,2023-03-26 02:57:09.210450,I finished reading this book last night only a...
4889,4889,Henderson Family Saga #1\nBlindsided by Love,"If confidence was a women’s size twenty, then ...",Kindle Edition,"Published September 19, 2019",153,4.71,881 ratings,Romance,Monica Walters,"September 19, 2019 by B. Love Publications",https://www.goodreads.com/book/show/51853748-b...,2023-03-26 02:57:18.599072,3.5 stars\nA country romance story with amazin...


In [45]:
def creat_tar(df, ser):
    target_list = []
    for index, row in df.iterrows():
        if row['cleaned_title'] in ser.tolist():
            target_list.append(1)
        else:
            target_list.append(0)

    # Add the 'Target' column to the dataframe
    df['successful'] = target_list
    df['successful'] = df['successful'].astype(bool)

    return df

In [46]:
df1 = pd.read_csv('fiction-and-non-fiction-top-best-sellers.csv', index_col=0)
    
clean_article(df1, 'Book')
ser = df1['cleaned_Book']

The Ten Commandments
No List Published
Maid in Waiting
The Harbourmaster
Mr. and Mrs. Pennington
The End of Desire
Mary's Neck
Magnolia Street
Bright Skin
A Modern Hero
The Good Earth
District Nurse
Faraway
Lark Ascending
The Fountain
A New York Tempest
The Sheltered Life
Sons
Invitation to the Waltz
Flowering Wilderness
The Last Adam
Ann Vickers
Rain in the Doorway
The Werewolf of Paris
The Store
As the Earth Turns
Little Man, What Now?
The Thin Man
Work of Art
The Oppermanns
Seven Gothic Tales
Tender is the Night
Five Silver Daughters
Lamb in His Bosom
I, Claudius
Anthony Adverse
So Red the Rose
Lost Horizon
The Forty Days of Musa Dagh
Heaven's My Destination
Come and Get It
Of Time and the River
Now in November
Young Renny
Paths of Glory
Green Light
Lucy Gayheart
Vein of Iron
Europa
It Can't Happen Here
If I Have Four Apples
The Last Puritan
Sparkenbroke
The Weather in the Streets
The Doctor
Sanfelice
Gone with the Wind
Drums Along the Mohawk
Theatre
The Outward Room
The Years
North

In [48]:
df = creat_tar(df, ser)

In [56]:
df['successful'].isna()

0       False
1       False
2       False
3       False
4       False
        ...  
4886    False
4887    False
4888    False
4889    False
4890    False
Name: successful, Length: 4891, dtype: bool

In [67]:
def prep_data(filename):
    
    df = get_data(filename)
    
    df.drop(columns='index')
    
    clean_article(df, 'title')
    clean_article(df, 'summary')
    clean_article(df, 'reviews')
    
    df1 = pd.read_csv('fiction-and-non-fiction-top-best-sellers.csv', index_col=0)
    
    clean_article(df1, 'Book')
    ser = df1['cleaned_Book']
    
    creat_tar(df, ser)
    
    # convert 'year' column to string type using .astype()
    df['year'] = df['year'].astype(str)
    # extract the year from the 'year' column using the lambda function
    df['year'] = df['year'].apply(lambda x: x[-4:])
    
    # use regex on col to isolate page length
    df['length'] = [re.findall(r'\d', str(x)) for x in df['length']]
    # join the page numbers together
    df["length"]= df["length"].str.join("")
    
    df.rating = df['year'].astype(str)
    # remove the last 8 characters from each string and return the result
    df.rating_count = [s[:-8] for s in df.rating_count]
    
    # drop duplicate title-author combos
    df.drop_duplicates(subset = ['title', 'author'], inplace = True)
    
    df.publisher = df.publisher.astype(str)
    df.publisher = [s.split('by')[1].strip() if len(s.split('by')) > 1 else '' for s in df.publisher]
    
    genre_counts = df['genre'].value_counts()
    genres_to_remove = genre_counts[genre_counts < 8].index
    # remove the rows with those genres "filtering"
    df = df[~df['genre'].isin(genres_to_remove)]
    
    df = df[df['genre'] != 'Picture Books']
    
    df['lemmatized_summary'] = df['cleaned_summary'].apply(lemmatize_text)
    df[['neg', 'neutral', 'pos', 'compound']] = df['summary'].apply(feat_sent)
    df['sentiment'] = df['compound'].apply(get_sentiment)
    
    return df

In [44]:
df.isna().sum()

index              0
title              0
summary          121
length            18
year              32
review_count       0
rating             0
rating_count       0
genre            297
author             0
publisher         25
link               0
scraped_at         0
reviews          398
cleaned_title      0
dtype: int64

In [2]:
prep.prep_data('weekend_dataset.csv')

TypeError: normalize() argument 2 must be str, not float

In [40]:
def clean_article(df, col_name):
    cleaned_summaries = []
    for summary in df[col_name]:
        # Normalize the summary text and convert to lowercase
        print(summary)
        cleaned_summary = unicodedata.normalize('NFKD', summary)\
            .encode('ascii', 'ignore')\
            .decode('utf-8', 'ignore')\
            .lower()
        cleaned_summary = re.sub(r"[^a-z0-9',\s.]", '', cleaned_summary)
        cleaned_summaries.append(cleaned_summary)
    df[f'cleaned_{col_name}'] = cleaned_summaries
    df[f'cleaned_{col_name}'].astype('string')

In [41]:
df.head()

Unnamed: 0,index,title,summary,length,year,review_count,rating,rating_count,genre,author,publisher,link,scraped_at,reviews,cleaned_title
0,0,Raintree County,"Throughout a single day in 1892, John Shawness...","1088 pages, Paperback","First published January 1, 1948",91,3.95,"1,035 ratings",Fiction,Ross Lockridge Jr.,"April 1, 1994 by Penguin Books",https://www.goodreads.com/book/show/257233.Rai...,2023-03-25 01:31:35.686587,"Did they stop to think, in the midst of their ...",raintree county
1,1,The Eliots of Damerosehay #2\nPilgrim's Inn,"After WW II, Lucilla Eliot's soldier son Georg...","352 pages, Hardcover","First published January 1, 1948",238,4.35,"1,895 ratings",Fiction,Elizabeth Goudge,"March 21, 2013 by Amereon Limited",https://www.goodreads.com/book/show/205621.Pil...,2023-03-25 01:31:52.336480,I think I will start a bookshelf containing bo...,the eliots of damerosehay 2\npilgrim's inn
2,2,"An A.J. Cronin Trilogy: The Northern Light, Th...",Omnibus edition of two unabridged novels: The ...,Hardcover,"Published January 1, 1958",0,4.5,2 ratings,,A.J. Cronin,"January 1, 1958 by Little, Brown and Company",https://www.goodreads.com/book/show/2398522.An...,2023-03-25 01:32:06.053123,no_review no_review no_review no_review no_review,"an a.j. cronin trilogy the northern light, the..."
3,3,The Naked and the Dead by Norman Mailer Lesson...,The Naked and the Dead lesson plan contains a ...,"328 pages, Kindle Edition","First published June 18, 2012",0,0.0,0 ratings,,BookRags,"June 18, 2012",https://www.goodreads.com/book/show/20497230-t...,2023-03-25 01:32:19.143691,no_review no_review no_review no_review no_review,the naked and the dead by norman mailer lesson...
4,4,The Big Fisherman,"It was a calm, early summer noon in the southe...","459 pages, Hardcover","First published January 1, 1936",130,4.08,"1,795 ratings",Historical Fiction,Lloyd C. Douglas,"January 1, 1954",https://www.goodreads.com/book/show/778667.The...,2023-03-25 01:32:32.567861,There was a thing in the first half of the 20t...,the big fisherman


In [36]:
df.dtypes

index              int64
title             object
summary           object
length            object
year              object
review_count       int64
rating           float64
rating_count      object
genre             object
author            object
publisher         object
link              object
scraped_at        object
reviews           object
cleaned_title     object
dtype: object

In [37]:
clean_article(df, 'title')

In [42]:
clean_article(df, 'summary')

Throughout a single day in 1892, John Shawnessy recalls the great moments of his lifefrom the love affairs of his youth in Indiana, to the battles of the Civil War, to the politics of the Gilded Age, to his homecoming as schoolteacher, husband, and father. Shawnessy is the epitome of the place and period in which he lives, a rural land of springlike women, shady gamblers, wandering vagabonds, and soapbox orators. Yet here on the banks of the Shawmucky River, which weaves its primitive course through Raintree County, Indiana, he also feels and obeys ancient rhythms. A number-one bestseller when it was first published in 1948, this powerful novel is a compelling vision of 19th-century America with timeless resonance.
After WW II, Lucilla Eliot's soldier son George and his beautiful wife Nadine lived with their five children. They acquired an ancient pilgrim's inn on the river, that touches not only its new owners but also those strangers who stop there for a rest on their pilgrimages.



TypeError: normalize() argument 2 must be str, not float

In [39]:
clean_article(df, 'reviews')

TypeError: normalize() argument 2 must be str, not float