In [390]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import string
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn import ensemble
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

nltk.download('gutenberg')
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Blaine\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Blaine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [418]:
class NLPModel():
    
    nlp = None
    
    df = None
    
    df_tfidf = None
        
    def __init__(self):
        self.nlp = spacy.load('en')

    def run_bow(self, corpus1, corpus1_auth, corpus2, corpus2_auth):
        print("Using Bag of Words Analysis:")
        corpus1 = self.get_corpus(corpus1)
        corpus2 = self.get_corpus(corpus2)

        corpus1 = self.text_cleaner(corpus1)
        corpus2 = self.text_cleaner(corpus2)
        
        #corpus1 = self.text_cleaner(corpus1[:int(len(corpus1)/10)])
        #corpus2 = self.text_cleaner(corpus2[:int(len(corpus2)/10)])
        
        corpus1_doc = self.tokenize(corpus1)
        corpus2_doc = self.tokenize(corpus2)
        
        corpus1_sents = self.sentences(corpus1_doc, corpus1_auth)
        corpus2_sents = self.sentences(corpus2_doc, corpus2_auth)
                
        sentences = pd.DataFrame(corpus1_sents + corpus2_sents)
                
        corpus1_bow = self.bag_of_words(corpus1_doc)
        corpus2_bow = self.bag_of_words(corpus2_doc)
        
        common_words = self.set_common_words(corpus1_bow, corpus2_bow)
        
        word_counts = self.bow_features(sentences, common_words)
        
        Y = word_counts['text_source']
        X = np.array(word_counts.drop(['text_sentence','text_source'], 1))
        
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
        
        self.try_rfc(X_train, X_test, y_train, y_test)
        self.try_lr(X_train, X_test, y_train, y_test)
        self.try_clf(X_train, X_test, y_train, y_test)
        self.try_svm(X_train, X_test, y_train, y_test)
        
    def get_corpus(self, corpus):
        print('  Getting Novel')
        corpus = gutenberg.raw(corpus)
        return corpus
    
    def text_cleaner(self, text, verbose = True):
        if verbose:
            print('  Running Text Cleaner')
        text = text.lower()
        text = re.sub(r'--',' ',text)
        text = re.sub(r'\*', '', text)
        text = re.sub(r"[\[].*?[\]]", "", text)
        text = re.sub(r'Chapter \d+', '', text)
        text = re.sub(r'CHAPTER .*', '', text)
        text = re.sub(r'chapter .*', '', text)
        text = re.sub(r'VOLUME \w+', '', text)
        text = re.sub(r'CHAPTER \w+', '', text)
        text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
        #text = re.sub(r'\w*\d\w*', '', text)
        text = ' '.join(text.split())
        return text
    
    def tokenize(self, corpus, verbose = True):
        if verbose:
            print('  Tokenizing')
        return self.nlp(corpus)
    
    def sentences(self, corpus, auth):
        print('  Getting Sentences')
        return [[sent, auth] for sent in corpus.sents]
    
    def bag_of_words(self, text):
        print('  Running Bag of Words')
        allwords = [token.lemma_
                    for token in text
                    if not token.is_punct
                    and not token.is_stop]
    
        return [item[0] for item in Counter(allwords).most_common(2000)]
    
    def set_common_words(self, corpus1, corpus2):
        print('  Setting Common Words')
        common_words = set(corpus1 + corpus2)
        return common_words
    
    def bow_features(self, sentences, common_words):
        print('  Running BoW Features')
        df = pd.DataFrame(columns=common_words)
        df['text_sentence'] = sentences[0]
        df['text_source'] = sentences[1]
        df.loc[:, common_words] = 0
    
        for i, sentence in enumerate(df['text_sentence']):
        
            words = [token.lemma_
                     for token in sentence
                     if (
                         not token.is_punct
                         and not token.is_stop
                         and token.lemma_ in common_words
                     )]
        
            for word in words:
                df.loc[i, word] += 1
        
            if i % 50 == 0:
                print("    Processing row {}".format(i))
            
        return df
    
    def try_rfc(self, X_train, X_test, y_train, y_test):
        print('  Training Random Forest Classifier')
        rfc = ensemble.RandomForestClassifier()
        
        train = rfc.fit(X_train, y_train)
        
        print('    Training set score: {},\n    Test set score: {},\n    Cross Validation: {}'.format(
            rfc.score(X_train, y_train), rfc.score(X_test, y_test), cross_val_score(rfc, X_train, y_train, cv=10)))

    
    def try_lr(self, X_train, X_test, y_train, y_test):
        print('  Training Logistic Regression')
        lr = LogisticRegression()
        
        train = lr.fit(X_train, y_train)
        
        print('    Training set score: {},\n    Test set score: {},\n    Cross Validation: {}'.format(
            lr.score(X_train, y_train), lr.score(X_test, y_test), cross_val_score(lr, X_train, y_train, cv=10)))

    
    def try_clf(self, X_train, X_test, y_train, y_test):
        print('  Training Boosted Classifier')
        clf = ensemble.GradientBoostingClassifier()
        
        train = clf.fit(X_train, y_train)
        
        print('    Training set score: {},\n    Test set score: {},\n    Cross Validation: {}'.format(
            clf.score(X_train, y_train), clf.score(X_test, y_test), cross_val_score(clf, X_train, y_train, cv=10)))
    
    def try_svm(self, X_train, X_test, y_train, y_test):
        print('  Training Support Vector Machine')
        svm = SVC(kernel='linear')
        
        train = svm.fit(X_train, y_train)
        
        print('    Training set score: {},\n    Test set score: {},\n    Cross Validation: {}'.format(
            svm.score(X_train, y_train), svm.score(X_test, y_test), cross_val_score(svm, X_train, y_train, cv=10)))
        
    def run_tfidf(self, corpus1, corpus1_auth, corpus2, corpus2_auth):
        print('Running tf-idf Analysis')
        corpus1 = self.get_paragraphs(corpus1)
        corpus2 = self.get_paragraphs(corpus2)
        
        corpus1 = self.clean_paragraph(corpus1)
        corpus2 = self.clean_paragraph(corpus2)
        
        corpus1_df = self.build_dataframe(corpus1, corpus1_auth)
        corpus2_df = self.build_dataframe(corpus2, corpus2_auth)
        
        self.df = corpus1_df.append(corpus2_df, ignore_index=True)
                                                                 
        print('  Creating Vectorizer (Unigram, and Bigram)')
        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, ngram_range=(1,2), use_idf=True,norm=u'l2', smooth_idf=True)
        
        print('  Vectorizing Paragraphs')
        self.df_tfidf = vectorizer.fit_transform(self.df['Text'])
        print('  Number of features: %d' % self.df_tfidf.get_shape()[1])
        
        self.df['TF-IDF'] = self.df_tfidf
        
        X = self.df_tfidf
        Y = self.df['Author']
        
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
        
        print('  Number of features used: 900')
        svd= TruncatedSVD(900)
        lsa = make_pipeline(svd, Normalizer(copy=False))

        X_train_lsa = lsa.fit_transform(X_train)

        variance_explained=svd.explained_variance_ratio_
        total_variance = variance_explained.sum()
        
        print("  Percent variance captured by all components:",total_variance*100)
        
        self.try_rfc(X_train, X_test, y_train, y_test)
        self.try_lr(X_train, X_test, y_train, y_test)
        self.try_clf(X_train, X_test, y_train, y_test)
        self.try_svm(X_train, X_test, y_train, y_test)

    def get_paragraphs(self, corpus):
        print('  Getting Novel (Paragraphs)')
        corpus = gutenberg.paras(corpus)
        corpus_paras=[]
        for paragraph in corpus:
            para=paragraph[0]
            corpus_paras.append(' '.join(para))

        return corpus_paras
    
    def clean_paragraph(self, corpus):
        print('  Cleaning Paragraphs')
        cleaned = [self.text_cleaner(para, False) for para in corpus]
        reduced = [x for x in cleaned if x is not '']
            
        return reduced
    
    def build_dataframe(self, corpus, auth):
        return pd.DataFrame([[para, auth] for para in corpus], columns=['Text', 'Author'])
    
    def get_tfidf(self, corpus):
        print('  Getting TF-IDF for corpus')
        

In [419]:
nlpmodel = NLPModel()
nlpmodel.run_bow('carroll-alice.txt', 'Carroll', 'austen-persuasion.txt', 'Austen')
nlpmodel.run_tfidf('carroll-alice.txt', 'Carroll', 'austen-persuasion.txt', 'Austen')

Using Bag of Words Analysis:
  Getting Novel
  Getting Novel
  Running Text Cleaner
  Running Text Cleaner
  Tokenizing
  Tokenizing
  Getting Sentences
  Getting Sentences
  Running Bag of Words
  Running Bag of Words
  Setting Common Words
  Running BoW Features
    Processing row 0
    Processing row 50
    Processing row 100
    Processing row 150
    Processing row 200
    Processing row 250
    Processing row 300
    Processing row 350
    Processing row 400
    Processing row 450
    Processing row 500
    Processing row 550
    Processing row 600
    Processing row 650
    Processing row 700
    Processing row 750
    Processing row 800
    Processing row 850
    Processing row 900
    Processing row 950
    Processing row 1000
    Processing row 1050
    Processing row 1100
    Processing row 1150
    Processing row 1200
    Processing row 1250
    Processing row 1300
    Processing row 1350
    Processing row 1400
    Processing row 1450
    Processing row 1500
    Processing

# Results

## Bag of Words Approach

#### Random Forest Classifier
Train Set: 98.6%

Test Set: 87.4%

Cross Validation Average: 88.5%

#### Logistic Regression
Train Set: 96.0%

Test Set: 90.1%

Cross Validation Average: 90.5%

#### Boosted Classifier
Train Set: 89.5%

Test Set: 86.7%

Cross Validation Average: 87.0%

#### Support Vector Machine
Train Set: 97.9%

Test Set: 89.5%

Cross Validation Average: 90.2%

## TF-IDF Approach

#### Random Forest Classifier
Train Set: 98.0%

Test Set: 85.4%

Cross Validation Average: 83.5%

#### Logistic Regression
Train Set: 96.7%

Test Set: 84.6%

Cross Validation Average: 83.5%

#### Boosted Classifier
Train Set: 89.7%

Test Set: 82.4%

Cross Validation Average: 81.7%

#### Support Vector Machine
Train Set: 97.1%

Test Set: 87.8%

Cross Validation Average: 88.3%

# Conclusion

Using a bag of words approach, we gather the 2000 most common per author (4000 total). Prediction on which author wrote the test documents is done against this large dataset. Logistic regression presents us with the best output overall for this approach. 

Using a TF-IDF approach, which incorporates vectorizing single words and bigrams, gives us a term frequency for each author. Prediction on which author wrote the test documents is done against this dataset. A support vector machine presents us with the best output for this approach.

Logistic Regression with a bag of words approach seems to give us our best overall performance. This is supported with a difference of 5.9% between the training set score and test set score, as well as the highest cross validation score that almost matches the test set score.

We can conclude that for this experiment, prediction using the most common words of an author is greater than using the words that are unique to the author.