In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

### A. Bag of Words

#### 1. Data Pre-Processing

In [2]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    """Normalize raw text before it is parsed.

    Three passes are applied in order:

    1. every double dash (``--``) is replaced by a single space;
    2. bracketed spans (``[...]``) are removed, matched non-greedily and
       only within a single line because ``.`` does not match a newline;
    3. every run of whitespace (including newlines) collapses to one space
       and leading/trailing whitespace is dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, single-spaced string.
    """
    text = re.sub(r'--', ' ', text)
    # Raw string literal: in a plain string "\[" and "\]" are invalid escape
    # sequences, which current Python versions warn about at compile time.
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Fetch each novel, blank out its chapter headings (the rest of any line
# starting at "CHAPTER "), then run the shared cleaner over the result.
parents = text_cleaner(
    re.sub(r'CHAPTER .*', '', gutenberg.raw('edgeworth-parents.txt'))
)
ball = text_cleaner(
    re.sub(r'CHAPTER .*', '', gutenberg.raw('chesterton-ball.txt'))
)

# Parse both cleaned novels with the small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')
parents_doc, ball_doc = nlp(parents), nlp(ball)

# Pair every sentence with its author, then stack both authors into one
# frame: column 0 holds the sentence span, column 1 the author label.
parents_sents_bow = [[sentence, "Edgeworth"] for sentence in parents_doc.sents]
ball_sents_bow = [[sentence, "Chesterton"] for sentence in ball_doc.sents]
sentences_bow = pd.DataFrame([*parents_sents_bow, *ball_sents_bow])
sentences_bow.head()

Unnamed: 0,0,1
0,"(THE, ORPHANS, .)",Edgeworth
1,"(Near, the, ruins, of, the, castle, of, Rossmo...",Edgeworth
2,"(As, long, as, she, was, able, to, work, ,, sh...",Edgeworth
3,"(Mary, was, at, this, time, about, twelve, yea...",Edgeworth
4,"(One, evening, she, was, sitting, at, the, foo...",Edgeworth


In [3]:
# Utility function to create a list of the most common words.
def bag_of_words(text, n_words=2000):
    """Return the most frequent lemmas of a parsed text.

    Punctuation tokens and stop words are ignored; the remaining tokens are
    counted by lemma.

    Args:
        text: Iterable of tokens, each exposing ``lemma_``, ``is_punct`` and
            ``is_stop``.
        n_words: Maximum number of lemmas to return. Defaults to 2000.

    Returns:
        A list of lemmas ordered from most to least frequent, at most
        ``n_words`` long.
    """
    # Counting straight from a generator avoids materialising the full
    # lemma list for a whole novel.
    lemma_counts = Counter(
        token.lemma_
        for token in text
        if not token.is_punct and not token.is_stop
    )

    # Return the most common words.
    return [lemma for lemma, _ in lemma_counts.most_common(n_words)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    """Build a bag-of-words count table with one row per sentence.

    Args:
        sentences: DataFrame whose column ``0`` holds iterables of tokens
            (each exposing ``lemma_``, ``is_punct`` and ``is_stop``) and
            whose column ``1`` holds the source label of each sentence.
        common_words: Iterable of lemmas to use as feature columns (a set
            is accepted). Duplicates are collapsed to a single column.

    Returns:
        A DataFrame indexed like ``sentences`` with one integer count column
        per common word, followed by ``text_sentence`` and ``text_source``.
    """
    # Freeze the vocabulary into an ordered, duplicate-free list. pandas
    # needs list-like column labels (sets are unordered and are not accepted
    # as indexers by recent pandas releases), and a fixed order lets each
    # word map to one matrix column.
    vocab = list(dict.fromkeys(common_words))
    column_of = {word: col for col, word in enumerate(vocab)}

    # Accumulate counts in a NumPy matrix and build the frame once at the
    # end; updating a wide DataFrame cell by cell with .loc is far slower.
    sentence_tokens = list(sentences[0])
    counts = np.zeros((len(sentence_tokens), len(vocab)), dtype=int)

    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(sentence_tokens):
        for token in sentence:
            # Skip punctuation and stop words.
            if token.is_punct or token.is_stop:
                continue
            # Lemmas outside the vocabulary have no column and are ignored.
            col = column_of.get(token.lemma_)
            if col is not None:
                counts[i, col] += 1

        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))

    # Reuse the caller's index so rows stay aligned with `sentences` even
    # when it is not a default 0..n-1 range.
    df = pd.DataFrame(counts, columns=vocab, index=sentences.index)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    return df

# Extract the most frequent lemmas of each novel.
parentswords = bag_of_words(parents_doc)
ballwords = bag_of_words(ball_doc)

# Merge the two vocabularies; the set union keeps a single copy of every
# word the novels share.
common_words = set(parentswords).union(ballwords)

In [5]:
# Count the shared vocabulary in every sentence of both novels and preview
# the resulting feature table.
word_counts = bow_features(sentences=sentences_bow, common_words=common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000
Processing row 5500
Processing row 6000
Processing row 6500
Processing row 7000
Processing row 7500
Processing row 8000
Processing row 8500
Processing row 9000
Processing row 9500
Processing row 10000
Processing row 10500
Processing row 11000
Processing row 11500
Processing row 12000
Processing row 12500
Processing row 13000
Processing row 13500
Processing row 14000
Processing row 14500


Unnamed: 0,authority,shout,and,heath,seize,lustrous,imagination,inclined,candlestick,physical,...,family,miracle,coast,dunstable,utterly,quietly,leg,satisfied,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(THE, ORPHANS, .)",Edgeworth
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Near, the, ruins, of, the, castle, of, Rossmo...",Edgeworth
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(As, long, as, she, was, able, to, work, ,, sh...",Edgeworth
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Mary, was, at, this, time, about, twelve, yea...",Edgeworth
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(One, evening, she, was, sitting, at, the, foo...",Edgeworth


In [6]:
from sklearn.model_selection import train_test_split
# Set variables: the author label is the target, the word counts are the
# features.
Y_bow = word_counts['text_source']
# Use the `columns=` keyword: passing the axis positionally
# (`drop(labels, 1)`) is deprecated and rejected by pandas 2.x.
X_bow = np.array(word_counts.drop(columns=['text_sentence', 'text_source']))

# Train, test split.
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, Y_bow, test_size=0.3, random_state=123)

#### 1. Naive Bayes

In [7]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the bag-of-words counts.
# fit() hands back the estimator, so construction and training are chained.
mnb = MultinomialNB().fit(X_train_bow, y_train_bow)

# Mean accuracy on the training split and on the held-out split.
mnb_train_acc = mnb.score(X_train_bow, y_train_bow)
mnb_test_acc = mnb.score(X_test_bow, y_test_bow)
print('mnb train score:', mnb_train_acc)
print('mnb test score:', mnb_test_acc)

mnb train score: 0.915985773335
mnb test score: 0.906481273828


#### 2. Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
# Train a default-configured logistic regression on the bag-of-words counts.
# fit() hands back the estimator, so construction and training are chained.
lr = LogisticRegression().fit(X_train_bow, y_train_bow)

# Mean accuracy on the training split and on the held-out split.
lr_train_acc = lr.score(X_train_bow, y_train_bow)
lr_test_acc = lr.score(X_test_bow, y_test_bow)
print('lr train score:', lr_train_acc)
print('lr test score:', lr_test_acc)

lr train score: 0.934922618475
lr test score: 0.891455483292


#### 3. Test on a different corpus

In [9]:
# Load the Father Brown text, strip its "VOLUME <word>" and "CHAPTER <word>"
# headings, then apply the shared cleaner.
brown = gutenberg.raw('chesterton-brown.txt')
for heading_pattern in (r'VOLUME \w+', r'CHAPTER \w+'):
    brown = re.sub(heading_pattern, '', brown)
brown = text_cleaner(brown)

In [11]:
# Parse the cleaned text with the same spaCy pipeline used for training.
brown_doc = nlp(brown)

# Label every sentence as Chesterton's and keep at most as many sentences
# as the Edgeworth novel contributed to the training data.
max_brown_rows = len(parents_sents_bow)
brown_sents = [[sentence, "Chesterton"] for sentence in brown_doc.sents][:max_brown_rows]

# Count the training vocabulary in each retained sentence.
brown_sentences = pd.DataFrame(brown_sents)
brown_bow = bow_features(sentences=brown_sentences, common_words=common_words)

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500


In [12]:
# Set variables. Every Brown sentence carries the label "Chesterton", so the
# scores below are the share of sentences each model attributes to
# Chesterton in a book it was not trained on.
Y_brown = brown_bow['text_source']
# Use the `columns=` keyword: passing the axis positionally
# (`drop(labels, 1)`) is deprecated and rejected by pandas 2.x.
X_brown = np.array(brown_bow.drop(columns=['text_sentence', 'text_source']))

# Run the models and return scores.
print('lr score: ', lr.score(X_brown, Y_brown))
print('mnb score: ', mnb.score(X_brown, Y_brown))

lr score:  0.456404736276
mnb score:  0.567814854682


### B. Tfidf

#### 1. Data Pre-Processing

In [13]:
def _first_sentence_texts(paragraphs):
    """Turn corpus paragraphs into one plain string each.

    Only the first element of every paragraph (``paragraph[0]``, i.e. its
    first sentence in NLTK's ``paras()`` layout) is used. Double dashes are
    removed from each word token and the tokens are joined with single
    spaces.
    """
    return [
        ' '.join(re.sub(r'--', '', word) for word in paragraph[0])
        for paragraph in paragraphs
    ]


# Process paragraphs: one string per paragraph of each novel.
parents_p = gutenberg.paras('edgeworth-parents.txt')
parents_paras = _first_sentence_texts(parents_p)

ball_p = gutenberg.paras('chesterton-ball.txt')
ball_paras = _first_sentence_texts(ball_p)

# Attach the author label and stack both novels into one frame:
# column 0 holds the text, column 1 the author.
parents_sents_tfidf = [[text, "Edgeworth"] for text in parents_paras]
ball_sents_tfidf = [[text, "Chesterton"] for text in ball_paras]
sentences_tfidf = pd.DataFrame([*parents_sents_tfidf, *ball_sents_tfidf])
sentences_tfidf.head()

Unnamed: 0,0,1
0,"[ The Parent ' s Assistant , by Maria Edgeworth ]",Edgeworth
1,THE ORPHANS .,Edgeworth
2,"Near the ruins of the castle of Rossmore , in ...",Edgeworth
3,Mary was at this time about twelve years old .,Edgeworth
4,""" No need to stop the wheel , Mary , dear , fo...",Edgeworth


In [14]:
# Train, test split on the raw strings; vectorizing happens later inside the
# pipelines.
tfidf_texts, tfidf_labels = sentences_tfidf[0], sentences_tfidf[1]
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    tfidf_texts, tfidf_labels, test_size=0.3, random_state=123
)

# Set the tfidf vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_settings = dict(
    max_df=0.5,            # ignore terms found in more than half of the documents
    min_df=2,              # ignore terms found in fewer than two documents
    stop_words='english',  # drop scikit-learn's built-in English stop words
    lowercase=True,        # fold case so capitalised variants share one feature
    use_idf=True,          # weight term counts by inverse document frequency
    norm='l2',             # scale every document vector to unit Euclidean length
    smooth_idf=True,       # add one to every document frequency; avoids division by zero
)
vectorizer = TfidfVectorizer(**tfidf_settings)

#### 1. Naive Bayes

In [15]:
# Set the Naive Bayes pipeline to run on the train set and apply on the test set.
from sklearn.base import clone
from sklearn.pipeline import Pipeline
# Give the pipeline its own unfitted copies of the vectorizer and classifier.
# Passing `mnb` itself would refit the bag-of-words model in place when the
# pipeline is fitted, and any other pipeline built from the same objects
# would overwrite this one's fitted state.
mnb_pipe = Pipeline([('tfidf', clone(vectorizer)), ('clf', clone(mnb))])

# Run train and test scores.
mnb_pipe.fit(X_train_tfidf, y_train_tfidf)
print('mnb train score:', mnb_pipe.score(X_train_tfidf, y_train_tfidf))
print('mnb test score:', mnb_pipe.score(X_test_tfidf, y_test_tfidf))

mnb train score: 0.938638799571
mnb test score: 0.891875


#### 2. Logistic Regression

In [16]:
# Set the Logistic Regression pipeline to run on the train set and apply on the test set.
from sklearn.base import clone
# Give the pipeline its own unfitted copies of the vectorizer and classifier.
# Passing `vectorizer` and `lr` themselves would refit objects that other
# models in this notebook also hold.
lr_pipe = Pipeline([('tfidf', clone(vectorizer)), ('clf', clone(lr))])

# Run train and test scores (labelled "lr": these are the logistic
# regression results, not the Naive Bayes ones).
lr_pipe.fit(X_train_tfidf, y_train_tfidf)
print('lr train score:', lr_pipe.score(X_train_tfidf, y_train_tfidf))
print('lr test score:', lr_pipe.score(X_test_tfidf, y_test_tfidf))

mnb train score: 0.913183279743
mnb test score: 0.86375


#### 3. Test on a different corpus

In [17]:
# One string per Father Brown paragraph, built from the paragraph's first
# element (`paragraph[0]`): double dashes are removed from each word token
# and the tokens are joined with single spaces.
brown_p = gutenberg.paras('chesterton-brown.txt')
brown_paras = [
    ' '.join(re.sub(r'--', '', word) for word in paragraph[0])
    for paragraph in brown_p
]

# Label every string as Chesterton's: column 0 holds the text, column 1 the
# author.
brown_sents_tfidf = [[text, "Chesterton"] for text in brown_paras]
brown_tfidf = pd.DataFrame(brown_sents_tfidf)
brown_tfidf.head()

Unnamed: 0,0,1
0,[ The Wisdom of Father Brown by G . K . Cheste...,Chesterton
1,I .,Chesterton
2,"THE consulting - rooms of Dr Orion Hood , the ...",Chesterton
3,Dr Hood paced the length of his string of apar...,Chesterton
4,"Fate , being in a funny mood , pushed the door...",Chesterton


In [18]:
# Score both tf-idf pipelines on the Father Brown strings, all of which are
# labelled "Chesterton".
brown_texts, brown_labels = brown_tfidf[0], brown_tfidf[1]
print('mnb score:', mnb_pipe.score(brown_texts, brown_labels))
print('lr test score:', lr_pipe.score(brown_texts, brown_labels))

mnb score: 0.310938845823
lr test score: 0.161068044789


### C. Modify

In [19]:
from sklearn.base import clone
# Stricter document-frequency thresholds than the first vectorizer: ignore
# terms found in more than a quarter of the documents or in fewer than four.
vectorizer2 = TfidfVectorizer(max_df=0.25,
                              min_df=4,
                              stop_words='english',
                              lowercase=True,
                              use_idf=True,
                              norm='l2',
                              smooth_idf=True
                             )
# Use an unfitted copy of the classifier. Reusing `mnb` itself would refit
# the estimator that `mnb_pipe` also holds, leaving that pipeline with a
# classifier trained on this vectorizer's vocabulary.
mnb_pipe2 = Pipeline([('tfidf', vectorizer2), ('clf', clone(mnb))])
mnb_pipe2.fit(X_train_tfidf, y_train_tfidf)
print('mnb train score:', mnb_pipe2.score(X_train_tfidf, y_train_tfidf))
print('mnb test score:', mnb_pipe2.score(X_test_tfidf, y_test_tfidf))

mnb train score: 0.924973204716
mnb test score: 0.9025


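The bag-of-words Naive Bayes model achieved the highest test-set accuracy, at about 90.6%. Tightening the tf-idf vectorizer's document-frequency thresholds (max_df from 0.5 to 0.25, min_df from 2 to 4) improved the tf-idf Naive Bayes test accuracy only marginally, from about 89.2% to 90.25%, which is still slightly below the bag-of-words model.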