In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [18]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# nltk.download("gutenberg")
# !python -m spacy download en

<IPython.core.display.Javascript object>

In [3]:
# Utility function for standard text cleaning
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation that spaCy doesn't
    # recognize: the double dash --. Better get rid of it now!
    text = re.sub(r"--", " ", text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = " ".join(text.split())
    return text

<IPython.core.display.Javascript object>

In [4]:
# Load and clean the data
persuasion = gutenberg.raw("austen-persuasion.txt")
alice = gutenberg.raw("carroll-alice.txt")

# The chapter indicator is idiosyncratic
persuasion = re.sub(r"Chapter \d+", "", persuasion)
alice = re.sub(r"CHAPTER .*", "", alice)

alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

<IPython.core.display.Javascript object>

In [5]:
# Parse the cleaned novels. This can take some time.
nlp = spacy.load("en")
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

<IPython.core.display.Javascript object>

In [14]:
# Group into sentences
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one DataFrame
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns=["text", "author"])
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


<IPython.core.display.Javascript object>

In [15]:
# Get rid of stop words and punctuation,
# and lemmatize the tokens
for i, sentence in enumerate(sentences["text"]):
    sentences.loc[i, "text"] = " ".join(
        [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop]
    )

<IPython.core.display.Javascript object>

In [16]:
sentences.head()

Unnamed: 0,text,author
0,Alice begin tired sit sister bank have twice p...,Carroll
1,consider mind hot day feel sleepy stupid pleas...,Carroll
2,remarkable Alice think way hear Rabbit,Carroll
3,oh dear,Carroll
4,oh dear,Carroll


<IPython.core.display.Javascript object>

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_df=0.7, min_df=1, use_idf=True, norm=u"l2", smooth_idf=True
)


# Applying the vectorizer
X = vectorizer.fit_transform(sentences["text"][-3:])

tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
last_3_sentences = pd.concat([tfidf_df, sentences[["text", "author"]]], axis=1)

# Keep in mind that log base 2 of 1 is 0,
# so a TF-IDF score of 0 indicates that the word was present once in that sentence.
last_3_sentences.head()

Unnamed: 0,alarm,belong,dim,distinguished,domestic,dread,finis,friend,future,glory,...,sailor,sunshine,tax,tenderness,virtue,war,wife,wish,text,author
0,0.0,0.0,0.341426,0.0,0.0,0.341426,0.0,0.341426,0.341426,0.0,...,0.0,0.341426,0.0,0.341426,0.0,0.341426,0.0,0.341426,Alice begin tired sit sister bank have twice p...,Carroll
1,0.261906,0.261906,0.0,0.261906,0.261906,0.0,0.0,0.0,0.0,0.261906,...,0.261906,0.0,0.261906,0.0,0.261906,0.0,0.261906,0.0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,remarkable Alice think way hear Rabbit,Carroll
3,,,,,,,,,,,...,,,,,,,,,oh dear,Carroll
4,,,,,,,,,,,...,,,,,,,,,oh dear,Carroll


<IPython.core.display.Javascript object>

In [19]:
vectorizer = TfidfVectorizer(
    max_df=0.5, min_df=2, use_idf=True, norm=u"l2", smooth_idf=True, ngram_range=(1, 2)
)


# Applying the vectorizer
X = vectorizer.fit_transform(sentences["text"])

tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
sentences = pd.concat([tfidf_df, sentences[["text", "author"]]], axis=1)

# Keep in mind that log base 2 of 1 is 0,
# so a TF-IDF score of 0 indicates that the word was present once in that sentence.
sentences.head()

Unnamed: 0,abide,ability,able,able bear,able persuade,abominate,abroad,absence,absence home,absent,...,young people,young person,young sister,young woman,youth,youth say,zeal,zealous,text,author
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Alice begin tired sit sister bank have twice p...,Carroll
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,remarkable Alice think way hear Rabbit,Carroll
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,oh dear,Carroll
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,oh dear,Carroll


<IPython.core.display.Javascript object>

In [20]:
Y = sentences["author"]
X = np.array(sentences.drop(["text", "author"], 1))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=123
)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print("Training set score:", lr.score(X_train, y_train))
print("\nTest set score:", lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print("Training set score:", rfc.score(X_train, y_train))
print("\nTest set score:", rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print("Training set score:", gbc.score(X_train, y_train))
print("\nTest set score:", gbc.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.912696063924238

Test set score: 0.8717265867731913
----------------------Random Forest Scores----------------------
Training set score: 0.9786919206865937

Test set score: 0.873058144695961
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8490677715300384

Test set score: 0.831779849090102


<IPython.core.display.Javascript object>

In [None]:
# ----------------------Logistic Regression Scores----------------------
# Training set score: 0.8494825964252116

# Test set score: 0.7986829727187206
# ----------------------Random Forest Scores----------------------
# Training set score: 0.9357165255566008

# Test set score: 0.8419567262464722
# ----------------------Gradient Boosting Scores----------------------
# Training set score: 0.8143618689244277

# Test set score: 0.7958607714016933


Looks like having both is better on all fronts.