In [9]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [31]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from collections import Counter
from nltk.corpus import gutenberg
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
import nltk
import warnings

warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

In [2]:
# Utility function for standard text cleaning
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation that spaCy doesn't
    # recognize: the double dash '--'. Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

In [3]:
# Load and clean the data
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [4]:
# Parse the cleaned novels. This can take some time.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [27]:
# Group into sentences
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one DataFrame
orig_sentences = pd.DataFrame(
    alice_sents + persuasion_sents, columns=["text", "author"]
)
orig_sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


<IPython.core.display.Javascript object>

In [74]:
# Get rid of stop words and punctuation,
# and lemmatize the tokens
sentences = orig_sentences.copy()
for i, sentence in enumerate(sentences["text"]):
    sentences.loc[i, "text"] = " ".join(
        [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop]
    )

<IPython.core.display.Javascript object>

In [75]:
vectorizer = CountVectorizer(analyzer="word")
X = vectorizer.fit_transform(sentences["text"])
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
sentences = pd.concat([bow_df, sentences[["text", "author"]]], axis=1)

<IPython.core.display.Javascript object>

In [76]:
sentences.head()

Unnamed: 0,1st,29th,abbreviation,abdication,abide,ability,able,abode,abominable,abominate,...,younker,youth,youthful,zeal,zealand,zealous,zealously,zigzag,text,author
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Alice begin tired sit sister bank have twice p...,Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,remarkable Alice think way hear Rabbit,Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll


<IPython.core.display.Javascript object>

In [77]:
# Utility function to calculate how frequently words appear in the text
def pos_frequencies(text):

    # Build a list of words
    # Strip out punctuation
    parts = []
    for token in text:
        if not token.is_punct:
            parts.append(token.pos_)

    # Build and return a `Counter` object containing word counts
    return Counter(parts)

<IPython.core.display.Javascript object>

In [78]:
sentences["text"][0]

'Alice begin tired sit sister bank have twice peep book sister read picture conversation use book think Alice picture conversation'

<IPython.core.display.Javascript object>

In [79]:
orig_sentences

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll
...,...,...
5627,"(Her, spring, of, felicity, was, in, the, glow...",Austen
5628,"(Anne, was, tenderness, itself, ,, and, she, h...",Austen
5629,"(His, profession, was, all, that, could, ever,...",Austen
5630,"(She, gloried, in, being, a, sailor, 's, wife,...",Austen


<IPython.core.display.Javascript object>

In [80]:
pos_count = pos_frequencies(orig_sentences["text"][0])

<IPython.core.display.Javascript object>

In [81]:
pos_counts = []
for o_sent in orig_sentences["text"]:
    pos_count = pos_frequencies(o_sent)
    pos_counts.append(dict(pos_count))

pos_counts_df = pd.DataFrame(pos_counts).fillna(0)

<IPython.core.display.Javascript object>

In [82]:
pos_counts_df = pos_counts_df.astype(int)

<IPython.core.display.Javascript object>

In [83]:
sentences = pd.concat([pos_counts_df, sentences], 1)

<IPython.core.display.Javascript object>

In [127]:
all_sent_stats = []
all_unique_words = []
for o_sent in orig_sentences["text"]:
    puncts = []
    words = []
    for token in o_sent:
        if token.is_punct:
            puncts.append(token)
        else:
            words.append(token.string)
    unique_words = set(words)
    sent_stats = {"num_puncts": len(puncts), "num_words": len(words)}
    all_sent_stats.append(sent_stats)
    all_unique_words.append({"unique_words": unique_words})

<IPython.core.display.Javascript object>

In [128]:
all_unique_words

[{'unique_words': {'Alice ',
   'a ',
   'and ',
   'bank',
   'beginning ',
   'book',
   'book ',
   'but ',
   'by ',
   'conversation',
   'conversations ',
   'do',
   'get ',
   'had ',
   'having ',
   'her ',
   'in ',
   'into ',
   'is ',
   'it',
   'it ',
   'no ',
   'nothing ',
   'of ',
   'on ',
   'once ',
   'or ',
   'peeped ',
   'pictures ',
   'reading',
   'she ',
   'sister ',
   'sitting ',
   'the ',
   'thought ',
   'tired ',
   'to ',
   'twice ',
   'use ',
   'very ',
   'was ',
   'what ',
   'without '}},
 {'unique_words': {'Rabbit ',
   'So ',
   'White ',
   'a ',
   'and ',
   'as ',
   'be ',
   'by ',
   'chain ',
   'close ',
   'considering ',
   'could',
   'daisies',
   'daisy',
   'day ',
   'eyes ',
   'feel ',
   'for ',
   'getting ',
   'her',
   'her ',
   'hot ',
   'in ',
   'made ',
   'making ',
   'mind ',
   'of ',
   'own ',
   'picking ',
   'pink ',
   'pleasure ',
   'ran ',
   'she ',
   'sleepy ',
   'stupid',
   'suddenly ',


<IPython.core.display.Javascript object>

In [129]:
# words = []
# o_sent = orig_sentences["text"][0]
# for token in o_sent:
#     if token.is_punct:
#         puncts.append(token.string)
#     else:
#         words.append(token.string)
# set(words)

<IPython.core.display.Javascript object>

In [130]:
unique_words_df = pd.DataFrame(all_unique_words)

<IPython.core.display.Javascript object>

In [132]:
orig_sentences = pd.concat([unique_words_df, orig_sentences], 1)

<IPython.core.display.Javascript object>

In [138]:
orig_sentences

Unnamed: 0,unique_words,text,author
0,"{pictures , once , or , by , reading, is , tho...","(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"{mind , considering , by , pleasure , own , fo...","(So, she, was, considering, in, her, own, mind...",Carroll
2,"{out , Alice , much , nothing , There , hear ,...","(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"{Oh , dear}","(Oh, dear, !)",Carroll
4,"{Oh , dear}","(Oh, dear, !)",Carroll
...,...,...,...
5627,"{felicity , in , her , spirits, glow , heart, ...","(Her, spring, of, felicity, was, in, the, glow...",Austen
5628,"{in , itself, and , Wentworth, affection, it ,...","(Anne, was, tenderness, itself, ,, and, she, h...",Austen
5629,"{all , tenderness , His , that , profession , ...","(His, profession, was, all, that, could, ever,...",Austen
5630,"{more , wife, but , for , possible, that , imp...","(She, gloried, in, being, a, sailor, 's, wife,...",Austen


<IPython.core.display.Javascript object>

In [149]:
carroll_orig = orig_sentences[orig_sentences["author"] == "Carroll"]
austen_orig = orig_sentences[orig_sentences["author"] == "Austen"].reset_index()

<IPython.core.display.Javascript object>

In [159]:
carroll_orig.shape[0]

1807

<IPython.core.display.Javascript object>

In [160]:
repeated_words = [{"repeated_words": 0}]
for i in range(1, carroll_orig.shape[0]):
    print(repeated_words)
    prev = carroll_orig["unique_words"][i - 1]
    curr = carroll_orig["unique_words"][i]
    num_repeated = len(prev.intersection(curr))
    repeated_words.append({"repeated_words": num_repeated})
repeated_words.append({"repeated_words": 0})
for j in range(1, austen_orig.shape[0]):
    print(repeated_words)
    prev = austen_orig["unique_words"][j - 1]
    curr = austen_orig["unique_words"][j]
    num_repeated = len(prev.intersection(curr))
    repeated_words.append({"repeated_words": num_repeated})

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



<IPython.core.display.Javascript object>

In [153]:
austen_orig

Unnamed: 0,index,unique_words,text,author
0,1807,"{amusement, by , changed , contemplating , res...","(Sir, Walter, Elliot, ,, of, Kellynch, Hall, ,...",Austen
1,1808,"{This , at , ELLIOT , favourite , was , opened...","(This, was, the, page, at, which, the, favouri...",Austen
2,1809,"{Elliot, by , Stevenson, son, Park, still, Mar...","(Walter, Elliot, ,, born, March, ,, ,, married...",Austen
3,1810,"{Somerset, Uppercross, inserting , family, han...","("", Precisely, such, had, the, paragraph, orig...",Austen
4,1811,"{Somerset, Principal , Elliot, parliaments, fa...","(Then, followed, the, history, and, rise, of, ...",Austen
...,...,...,...,...
3820,5627,"{felicity , in , her , spirits, glow , heart, ...","(Her, spring, of, felicity, was, in, the, glow...",Austen
3821,5628,"{in , itself, and , Wentworth, affection, it ,...","(Anne, was, tenderness, itself, ,, and, she, h...",Austen
3822,5629,"{all , tenderness , His , that , profession , ...","(His, profession, was, all, that, could, ever,...",Austen
3823,5630,"{more , wife, but , for , possible, that , imp...","(She, gloried, in, being, a, sailor, 's, wife,...",Austen


<IPython.core.display.Javascript object>

In [162]:
len(repeated_words)

5632

<IPython.core.display.Javascript object>

In [163]:
repeated_df = pd.DataFrame(repeated_words)
repeated_df.shape

(5632, 1)

<IPython.core.display.Javascript object>

In [85]:
sentences = pd.concat([pd.DataFrame(all_sent_stats), sentences], 1)

<IPython.core.display.Javascript object>

In [164]:
sentences = pd.concat([repeated_df, sentences], 1)

<IPython.core.display.Javascript object>

In [91]:
a = set(orig_sentences["text"][0])
b = set(orig_sentences["text"][1])
a.intersection(b)

set()

<IPython.core.display.Javascript object>

In [165]:
sentences

Unnamed: 0,repeated_words,num_puncts,num_words,PROPN,AUX,VERB,PART,ADV,ADJ,ADP,...,younker,youth,youthful,zeal,zealand,zealous,zealously,zigzag,text,author
0,0,10,57,2,7,6,2,3,1,8,...,0,0,0,0,0,0,0,0,Alice begin tired sit sister bank have twice p...,Carroll
1,10,7,56,2,2,9,0,8,7,6,...,0,0,0,0,0,0,0,0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,5,3,27,2,2,3,1,5,1,3,...,0,0,0,0,0,0,0,0,remarkable Alice think way hear Rabbit,Carroll
3,0,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll
4,2,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5627,0,2,23,1,2,0,1,0,0,5,...,0,0,0,0,0,0,0,0,spring felicity glow spirit friend Anne warmth...,Austen
5628,5,2,17,3,2,0,1,0,1,2,...,0,0,0,0,0,0,0,0,Anne tenderness worth Captain Wentworth affection,Austen
5629,4,2,26,0,1,5,0,2,1,1,...,0,0,0,0,0,0,0,0,profession friend wish tenderness dread future...,Austen
5630,5,4,37,0,2,4,1,1,5,6,...,0,0,0,0,0,0,0,0,glory sailor wife pay tax quick alarm belong p...,Austen


<IPython.core.display.Javascript object>

In [166]:
Y = sentences["author"]
X = np.array(sentences.drop(["text", "author"], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=123
)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print("Training set score:", lr.score(X_train, y_train))
print("\nTest set score:", lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print("Training set score:", rfc.score(X_train, y_train))
print("\nTest set score:", rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print("Training set score:", gbc.score(X_train, y_train))
print("\nTest set score:", gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9269014501331755

Test set score: 0.8801597869507324
----------------------Random Forest Scores----------------------
Training set score: 0.9926013613495117

Test set score: 0.8615179760319573
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8617934300088783

Test set score: 0.8535286284953395


<IPython.core.display.Javascript object>

## Question 2


In [168]:
vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 2))
X = vectorizer.fit_transform(sentences["text"])
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
sentences = pd.concat([bow_df, sentences[["text", "author"]]], axis=1)
sentences.head()

Unnamed: 0,1st,29th,29th september,abbreviation,abbreviation living,abdication,abdication neighbour,abide,abide consequence,abide figure,...,zealand australia,zealous,zealous officer,zealous subject,zealously,zealously discharge,zigzag,zigzag go,text,author
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Alice begin tired sit sister bank have twice p...,Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,remarkable Alice think way hear Rabbit,Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll


<IPython.core.display.Javascript object>

In [169]:
Y = sentences["author"]
X = np.array(sentences.drop(["text", "author"], 1))

# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=123
)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print("Training set score:", lr.score(X_train, y_train))
print("\nTest set score:", lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print("Training set score:", rfc.score(X_train, y_train))
print("\nTest set score:", rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print("Training set score:", gbc.score(X_train, y_train))
print("\nTest set score:", gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.9556081680970702

Test set score: 0.875277407900577
----------------------Random Forest Scores----------------------
Training set score: 0.9795797573246523

Test set score: 0.8575233022636485
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8490677715300384

Test set score: 0.833555259653795


<IPython.core.display.Javascript object>