# notebook of functions

In [1]:
# imports

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt


import unicodedata
import re

from bs4 import BeautifulSoup
import requests
import os
import json

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

import prepare as prep

from sklearn.model_selection import train_test_split
import sklearn.model_selection

from scipy import stats
from scipy.stats import norm, binom



from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the books data through the project's `prepare` module.
# `df` is the notebook-level dataframe that get_tf / get_idf read from.

df = prep.prep_data('all_books.csv')

## Get the TF-IDF scores for the entire df

In [15]:
def get_tf(column, data=None):

    '''
    this function fits a TfidfVectorizer on one text column
    and returns the resulting scores as a dataframe.

    note : despite the name, the values are TF-IDF weights
    (the TfidfVectorizer output), not raw term frequencies.

    parameters
    ----------
    column : str
        name of the text column to vectorize.
    data : pandas DataFrame, optional
        dataframe that holds `column`. When omitted, the
        notebook-level `df` is used, so existing calls such as
        get_tf('lemmatized_summary') keep working unchanged.

    returns
    -------
    pandas DataFrame with one row per document (default
    RangeIndex, in the row order of `data`) and one column
    per vocabulary term.
    '''

    # fall back to the notebook-level dataframe when none is passed in
    if data is None:
        data = df

    # obtain TF-IDF score
    tfidf = TfidfVectorizer()

    # fit-transform : sparse document-term matrix
    bag_o_words = tfidf.fit_transform(data[column])

    # create df of transformed & fit data : TF-IDF score
    # (.toarray() yields a plain ndarray; .todense() returns the
    # legacy np.matrix type)
    word_df = pd.DataFrame(bag_o_words.toarray(),
                        columns = tfidf.get_feature_names_out())

    print()
    print('- - - - - - - - - - - - - - -')
    print('Dataframe of Term Frequency scores :')

    # the footer prints that used to follow this return could never
    # run, so they were dropped
    return word_df

In [27]:
# TF-IDF score matrix for the lemmatized summaries; show one randomly sampled row
lem_sum_tf = get_tf('lemmatized_summary')
lem_sum_tf.sample()


- - - - - - - - - - - - - - -
Dataframe of Term Frequency scores :


Unnamed: 0,aa,aahz,aambc,aanen,aarav,aaron,aaronovitch,aaronsohn,ab,aba,...,zuckoff,zuckoffs,zula,zum,zumindest,zuni,zusak,zuversicht,zwanzig,zwischen
1518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Get the IDF for the entire df

In [17]:
# function to get the idf of a column

def get_idf(column, data=None):

    '''
    this function fits a TfidfVectorizer on one text column
    and returns the learned IDF weight of every vocabulary
    term. The series is also printed.

    parameters
    ----------
    column : str
        name of the text column to fit on.
    data : pandas DataFrame, optional
        dataframe that holds `column`. When omitted, the
        notebook-level `df` is used, so existing calls such as
        get_idf('lemmatized_summary') keep working unchanged.

    returns
    -------
    pandas Series indexed by term, holding that term's IDF.
    '''

    # fall back to the notebook-level dataframe when none is passed in
    if data is None:
        data = df

    tfidf = TfidfVectorizer()

    # only the fitted vocabulary and idf_ are needed here, so fit()
    # is enough; the transformed matrix was never used
    tfidf.fit(data[column])

    # term -> IDF, built directly from the two aligned arrays
    words_series = pd.Series(tfidf.idf_,
                            index = tfidf.get_feature_names_out())
    print()
    print('- - - - - - - - - - - - - - -')
    print('Key / value pairs of words and frequencies :')
    print(words_series)
    return words_series

In [18]:
# IDF of every term in the lemmatized summaries (get_idf also prints the series)

words_series = get_idf('lemmatized_summary')


- - - - - - - - - - - - - - -
Key / value pairs of words and frequencies :
aa            8.513709
aahz          8.513709
aambc         8.513709
aanen         8.513709
aarav         8.513709
                ...   
zuni          8.513709
zusak         8.108244
zuversicht    8.513709
zwanzig       8.513709
zwischen      7.820562
Length: 34611, dtype: float64


## Turn the IDF into a df and rename cols

In [19]:
def idf_to_df(column):

    '''
    this function wraps get_idf(column) : it takes the
    word -> IDF series that get_idf returns, reshapes it
    into a two-column dataframe ("word", "IDF"), prints
    the smallest and the largest IDF and returns the
    dataframe.
    '''

    # word -> IDF series (get_idf prints it as well)
    word_idf = get_idf(column)

    # move the words out of the index into their own column
    # and label both columns in one step
    idf_df = word_idf.rename_axis('word').reset_index(name = 'IDF')

    # range of the IDF values, reported below
    lowest = idf_df['IDF'].min()
    highest = idf_df['IDF'].max()

    print()
    print('Min IDF : ', lowest, ', max IDF : ', highest)
    return idf_df

In [20]:
# word / IDF dataframe for the lemmatized summaries
lem_sum_idf = idf_to_df('lemmatized_summary')


- - - - - - - - - - - - - - -
Key / value pairs of words and frequencies :
aa            8.513709
aahz          8.513709
aambc         8.513709
aanen         8.513709
aarav         8.513709
                ...   
zuni          8.513709
zusak         8.108244
zuversicht    8.513709
zwanzig       8.513709
zwischen      7.820562
Length: 34611, dtype: float64

Min IDF :  1.808070152979702 , max IDF :  8.513709247839705


## Choose the words to keep from the IDF list and select their columns from the TF df

In [21]:
def keep_words(lem_sum_idf, tf_df=None, min_idf=3.5, max_idf=7.5):

    '''
    this function takes in the df created in the function
    "idf_to_df", keeps only the words whose IDF lies strictly
    between min_idf and max_idf, and returns the matching
    columns of the term-score dataframe.

    parameters
    ----------
    lem_sum_idf : pandas DataFrame
        dataframe with a "word" column and an "IDF" column.
    tf_df : pandas DataFrame, optional
        term-score dataframe with one column per word. When
        omitted, the notebook-level `lem_sum_tf` is used, so
        existing calls keep_words(lem_sum_idf) work unchanged.
    min_idf : float, default 3.5
        words with an IDF at or below this value are dropped.
    max_idf : float, default 7.5
        words with an IDF at or above this value are dropped.

    returns
    -------
    pandas DataFrame : the columns of tf_df for the kept words,
    in the order the words appear in lem_sum_idf.
    '''

    # fall back to the notebook-level term-score dataframe
    if tf_df is None:
        tf_df = lem_sum_tf

    # keep only words whose IDF falls strictly inside (min_idf, max_idf)
    in_band = (lem_sum_idf['IDF'] > min_idf) & (lem_sum_idf['IDF'] < max_idf)

    # isolate the word col for the surviving rows and turn it to a list
    lem_sum_words_list = lem_sum_idf.loc[in_band, 'word'].to_list()

    # subset the term-score df to the columns named in the list
    lem_sum_keepers = tf_df[lem_sum_words_list]

    return lem_sum_keepers

In [22]:
# columns of lem_sum_tf whose words pass the IDF filter in keep_words
lem_sum_keepers = keep_words(lem_sum_idf)

In [26]:
# preview the first rows of the filtered term-score dataframe
lem_sum_keepers.head()

Unnamed: 0,aaron,abandon,abandonment,abbey,abby,abduct,abduction,abide,abigail,ability,...,youth,youthful,youtube,youve,zach,zen,zero,zoey,zombie,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## split data into train and test

In [24]:
def split(df, target = 'successful', test_size = .2, random_state = 42):

    '''
    this function splits a dataframe into train and test
    sets, stratified on the target column.

    parameters
    ----------
    df : pandas DataFrame
        data to split; must contain the `target` column.
    target : str, default 'successful'
        column to stratify on.
    test_size : float, default .2
        share of the rows that goes to the test set.
    random_state : int, default 42
        seed that makes the split reproducible.

    returns
    -------
    train, test : the two resulting dataframes.
    '''

    # bracket access works for any column name, unlike df.<name>
    train, test = train_test_split(df,
                                   test_size = test_size,
                                   random_state = random_state,
                                   stratify = df[target])
    return train, test

**Stratifying on 'successful' : 1 / True = successful (bestseller), 0 / False = not a bestseller.**

In [25]:
# stratified train / test split of the full books df; show the resulting shapes
train, test = split(df)
train.shape, test.shape

((2932, 19), (733, 19))