In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
import nltk
from nltk.corpus import gutenberg
import gensim
import warnings

warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

In [3]:
# Utility function for standard text cleaning
def text_cleaner(text):
    """Strip noise from raw novel text before it is handed to the parser.

    Steps, applied in order:
      1. Replace the double dash ``--`` with a space.
      2. Delete bracketed spans such as ``[Illustration]`` (shortest match,
         within a single line because ``.`` does not match newlines).
      3. Replace standalone integers and decimals, together with any
         whitespace and optional minus sign in front of them, with a space.
      4. Collapse every run of whitespace into a single space and trim the ends.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Visual inspection identifies a form of punctuation that spaCy doesn't
    # recognize: the double dash --. Better get rid of it now!
    text = re.sub(r"--", " ", text)
    # Raw string: in a plain literal "\[" and "\]" are invalid string escapes,
    # which newer Python versions warn about at compile time.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = " ".join(text.split())
    return text

<IPython.core.display.Javascript object>

In [4]:
# Load and clean the data.
# Each novel goes through the same pipeline: fetch the raw text from the NLTK
# Gutenberg corpus, blank out its chapter markers, then apply text_cleaner.
# The chapter indicator is idiosyncratic, hence one pattern per book:
# "Chapter <digits>" for Persuasion, "CHAPTER <rest of line>" for Alice.
persuasion = text_cleaner(
    re.sub(r"Chapter \d+", "", gutenberg.raw("austen-persuasion.txt"))
)
alice = text_cleaner(
    re.sub(r"CHAPTER .*", "", gutenberg.raw("carroll-alice.txt"))
)

<IPython.core.display.Javascript object>

In [5]:
# Parse the cleaned novels. This can take some time.
# Load the English pipeline by its package name. The bare "en" shortcut only
# resolves where a spaCy 2.x shortcut link was created, and spaCy 3 removed
# shortcut links altogether.
nlp = spacy.load("en_core_web_sm")
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

<IPython.core.display.Javascript object>

In [56]:
# Group into sentences
def _tag_sentences(doc, author):
    """Return one [sentence, author] row per sentence of a parsed document."""
    return [[sent, author] for sent in doc.sents]


alice_sents = _tag_sentences(alice_doc, "Carroll")
persuasion_sents = _tag_sentences(persuasion_doc, "Austen")

# Stack the rows of both novels (Carroll first, then Austen) into one frame
sentences = pd.DataFrame(
    data=alice_sents + persuasion_sents, columns=["text", "author"]
)
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


<IPython.core.display.Javascript object>

In [57]:
# Get rid of stop words and punctuation,
# and lemmatize the tokens
def lemmatize_sentence(sentence):
    """Return the lemmas of the tokens in `sentence` that are neither
    punctuation nor stop words, in their original order."""
    return [
        token.lemma_
        for token in sentence
        if not token.is_punct and not token.is_stop
    ]


# Build the whole replacement column first and assign it in one step, rather
# than writing list values cell by cell through .loc while iterating over the
# same column.
# NOTE: run once per freshly built `sentences` frame. Afterwards the column
# holds lists of lemma strings, not spaCy sentences, so a second pass fails.
sentences["text"] = [lemmatize_sentence(sentence) for sentence in sentences["text"]]

<IPython.core.display.Javascript object>

In [82]:
# Train word2vec on the sentences
samp_size = 150  # length of each word vector; reused below to size the feature matrix

# gensim 4.0 renamed the `size` keyword to `vector_size`. Pick whichever name
# the installed release accepts so this cell runs on both major versions.
_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

model = gensim.models.Word2Vec(
    sentences["text"],
    workers=6,
    min_count=1,  # keep every lemma, so each one has a vector when sentences are averaged
    window=20,
    sg=0,
    sample=1e-2,
    hs=1,
    **{_dim_kwarg: samp_size},
)

<IPython.core.display.Javascript object>

In [83]:
# Turn every sentence into one fixed-length feature row: the element-wise mean
# of the word vectors of its lemmas.
word2vec_arr = np.zeros((sentences.shape[0], samp_size))

for i, sentence in enumerate(sentences["text"]):
    if len(sentence) == 0:
        # Nothing survived the stop-word/punctuation filter. Mark the row as
        # missing explicitly instead of relying on np.mean([]) returning NaN
        # with a RuntimeWarning; dropna() below removes it.
        word2vec_arr[i, :] = np.nan
        continue
    # Word vectors live on model.wv; indexing the model object itself is
    # deprecated in gensim 3 and removed in gensim 4.
    word2vec_arr[i, :] = np.mean([model.wv[lemma] for lemma in sentence], axis=0)

word2vec_arr = pd.DataFrame(word2vec_arr)
# Attach the labels/text to the features and drop the all-NaN (empty) sentences.
vec_sentences = pd.concat(
    [sentences[["author", "text"]], word2vec_arr], axis=1
).dropna()

# sentences.head()

<IPython.core.display.Javascript object>

In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Target: the author label. Features: the averaged word-vector columns.
Y = vec_sentences["author"]
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# is deprecated and no longer accepted by pandas 2.x.
X = np.array(vec_sentences.drop(columns=["text", "author"]))

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=123
)

# Models
# The tree ensembles are randomized; seed them (same seed as the split) so the
# reported scores are repeatable for a given set of sentence vectors.
lr = LogisticRegression(n_jobs=-1)
rfc = RandomForestClassifier(n_jobs=-1, random_state=123)
gbc = GradientBoostingClassifier(random_state=123)

# Fit each model and report its accuracy on the training and test sets.
for name, clf in [
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
]:
    clf.fit(X_train, y_train)
    print(f"----------------------{name} Scores----------------------")
    print("Training set score:", clf.score(X_train, y_train))
    print("\nTest set score:", clf.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.8549287042777434

Test set score: 0.8554160855416085
----------------------Random Forest Scores----------------------
Training set score: 0.9950402975821451

Test set score: 0.8754067875406788
----------------------Gradient Boosting Scores----------------------
Training set score: 0.9296342219466832

Test set score: 0.8749418874941888


<IPython.core.display.Javascript object>

I was able to improve the scores by increasing the word2vec `window` and `sample` (frequent-word downsampling) settings. Changing the size of the word vectors seems to have minimal impact.

In [85]:
# ORIGINAL SCORES (baseline, kept for comparison with the run above)
# ----------------------Logistic Regression Scores----------------------
# Training set score: 0.8078115313081216

# Test set score: 0.8214783821478382
# ----------------------Random Forest Scores----------------------
# Training set score: 0.9950402975821451

# Test set score: 0.8554160855416085
# ----------------------Gradient Boosting Scores----------------------
# Training set score: 0.8961562306261625

# Test set score: 0.8465829846582985


<IPython.core.display.Javascript object>