In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import gutenberg, stopwords

In [2]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text


# Import all the Austen in the Project Gutenberg corpus.
austen = ''
for novel in ['persuasion','emma','sense']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work
    
chesterton=''
for novel in ['chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt']:
    work = gutenberg.raw(novel)
    chesterton += work

# Clean the data.
austen_clean = text_cleaner(austen[:1000000])
chesterton_clean = text_cleaner(chesterton[:1000000])

In [3]:
# Parse the data. This can take some time.
nlp = spacy.load('en')
austen_doc = nlp(austen_clean)
chesterton_doc=nlp(chesterton_clean)

In [4]:
# Organize the parsed doc into sentences, while filtering out punctuation
# and stop words, and converting words to lower case lemmas.

def doc2sentences(nlpdoc):
    sentences = []
    for sentence in nlpdoc.sents:
        sentence = [
            token.lemma_.lower()
            for token in sentence
            if not token.is_stop
            and not token.is_punct
        ]
        sentences.append(sentence)
    return sentences

sentences = doc2sentences(austen_doc)
chesterton_sentences = doc2sentences(chesterton_doc)


print(sentences[20])
print('Austen has {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))
print('Chesterton has {} sentences and {} tokens.'.format(len(chesterton_sentences), len(chesterton_clean)))

['for', 'daughter', 'eld', 'give', 'thing', 'tempt']
Austen has 8817 sentences and 990979 tokens.
Chesterton has 9835 sentences and 984704 tokens.


In [5]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.models import word2vec

vec_dim = 300
model = word2vec.Word2Vec(
    sentences + chesterton_sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=5,   # Minimum word count threshold.
    window=5,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=vec_dim,      # Word vector length.
    iter=6,
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [6]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['mr', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))


# One of these things is not like the other...
print(model.wv.doesnt_match("breakfast large dinner lunch".split()))

[('sherry', 0.7841025590896606), ('fallen', 0.7547245025634766), ('claude', 0.7529209852218628), ('wilson', 0.7263314127922058), ('seymour', 0.7219710946083069), ('walter', 0.7112399935722351), ('cowdray', 0.7021132707595825), ('reality', 0.6875311136245728), ('foam', 0.6855844259262085), ('anne', 0.6803672313690186)]
0.8873249348855472
0.6576083311205722
large


In [7]:
def sentence2vec(sentence):
    if any(word in sentence for word in vocab):
        return np.mean([model.wv.word_vec(word) for word in sentence if word in vocab], axis=0)
    else:
        return np.zeros(vec_dim)

In [8]:
# Group into sentences.
austen_rows = [[sent, 'austen'] for sent in sentences]
chesterton_rows = [[sent, 'chesterton'] for sent in chesterton_sentences]

#create dataframe for SVC
df = pd.DataFrame(austen_rows + chesterton_rows, columns=['sentence', 'author'])

#list of vectors to concat to data frame
vectors = []
for sentence in df.sentence:
    vectors.append(sentence2vec(sentence))

df2 = pd.DataFrame(vectors)

#concat setnences/author to vector dataframe
df = pd.concat([df, df2], axis=1)
df.head(2)

Unnamed: 0,sentence,author,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
0,"[sir, walter, elliot, kellynch, hall, somerset...",austen,0.1513,0.054095,-0.19563,0.066961,0.073186,0.024032,-0.21545,-0.15742,...,0.156527,-0.0371,-0.037329,-0.137503,0.181644,0.235,0.110988,0.086285,-0.09654,-0.112739
1,"[this, page, favourite, volume, open, elliot, ...",austen,0.184816,0.073297,-0.194881,0.035354,0.079643,0.041703,-0.207674,-0.067473,...,0.205798,-0.167607,-0.07321,-0.111982,0.182613,0.232798,0.09144,0.137984,-0.126915,-0.145583


In [9]:
X = df.drop(['author', 'sentence'], axis=1)
y = df.author

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)

In [10]:
from sklearn.svm import LinearSVC

svc = LinearSVC(C=29, random_state=42)
svc.fit(X_train, y_train)

print(svc.score(X_train, y_train))
print(svc.score(X_test, y_test))

0.8415492957746479
0.8378817413905133


In [12]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=2500, random_state=42)
lr.fit(X_train, y_train)

print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.854433418693982
0.8476283300844705


In [14]:
y_test.value_counts()

chesterton    3246
austen        2910
Name: author, dtype: int64