In [240]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk

from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Blaine\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Blaine\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [281]:
class NLPModel():
    
    nlp = None
    
    def __init__(self):
        self.nlp = spacy.load('en')

    def run_bow(self, corpus1, corpus1_auth, corpus2, corpus2_auth):
        print("Using Bag of Words Analysis:")
        corpus1 = self.get_corpus(corpus1)
        corpus2 = self.get_corpus(corpus2)

        corpus1 = self.text_cleaner(corpus1)
        corpus2 = self.text_cleaner(corpus2)
        
        corpus1_doc = self.tokenize(corpus1)
        corpus2_doc = self.tokenize(corpus2)
        
        corpus1_sents = self.sentences(corpus1_doc, corpus1_auth)
        corpus2_sents = self.sentences(corpus2_doc, corpus2_auth)
        
        sentences = pd.DataFrame(corpus1_sents + corpus2_sents)
        
        corpus1_bow = self.bag_of_words(corpus1_doc)
        corpus2_bow = self.bag_of_words(corpus2_doc)
        
        common_words = self.set_common_words(corpus1_bow, corpus2_bow)
        
        word_counts = self.bow_features(sentences, common_words)
        
        Y = word_counts['text_source']
        X = np.array(word_counts.drop(['text_sentence','text_source'], 1))
        
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
        
        print(self.try_rfc(X_train, X_test, y_train, y_test))
        print(self.try_lr(X_train, X_test, y_train, y_test))
        print(self.try_clf(X_train, X_test, y_train, y_test))
        
    def get_corpus(self, corpus):
        print('  Getting Novel')
        corpus = gutenberg.raw(corpus)
        return corpus
    
    def text_cleaner(self, text, verbose = True):
        if verbose:
            print('  Running Text Cleaner')
        text = re.sub(r'--',' ',text)
        text = re.sub("[\[].*?[\]]", "", text)
        text = re.sub(r'Chapter \d+', '', text)
        text = re.sub(r'CHAPTER .*', '', text)
        text = re.sub(r'VOLUME \w+', '', text)
        text = re.sub(r'CHAPTER \w+', '', text)
        text = ' '.join(text.split())
        return text
    
    def tokenize(self, corpus, verbose = True):
        if verbose:
            print('  Tokenizing')
        return self.nlp(corpus)
    
    def sentences(self, corpus, auth):
        print('  Getting Sentences')
        return [[sent, auth] for sent in corpus.sents]
    
    def bag_of_words(self, text):
        print('  Running Bag of Words')
        allwords = [token.lemma_
                    for token in text
                    if not token.is_punct
                    and not token.is_stop]
    
        return [item[0] for item in Counter(allwords).most_common(2000)]
    
    def bow_features(self, sentences, common_words):
        print('  Running BoW Features')
        df = pd.DataFrame(columns=common_words)
        df['text_sentence'] = sentences[0]
        df['text_source'] = sentences[1]
        df.loc[:, common_words] = 0
    
        for i, sentence in enumerate(df['text_sentence']):
        
            words = [token.lemma_
                     for token in sentence
                     if (
                         not token.is_punct
                         and not token.is_stop
                         and token.lemma_ in common_words
                     )]
        
            for word in words:
                df.loc[i, word] += 1
        
            if i % 50 == 0:
                print("    Processing row {}".format(i))
            
        return df
    
    def set_common_words(self, corpus1, corpus2):
        print('  Setting Common Words')
        common_words = set(corpus1 + corpus2)
        return common_words
    
    def try_rfc(self, X_train, X_test, y_train, y_test):
        print('  Training Random Forest Classifier')
        rfc = ensemble.RandomForestClassifier()
        
        train = rfc.fit(X_train, y_train)
        
        results = '    Training set score:', rfc.score(X_train, y_train)
        results += 'Test set score:', rfc.score(X_test, y_test)
        
        return results
    
    def try_lr(self, X_train, X_test, y_train, y_test):
        print('  Training Logistic Regression')
        lr = LogisticRegression()
        
        train = lr.fit(X_train, y_train)
        
        results = '    Training set score:', lr.score(X_train, y_train)
        results += 'Test set score:', lr.score(X_train, y_train)
        
        return results
    
    def try_clf(self, X_train, X_test, y_train, y_test):
        print('  Training Boosted Classifier')
        clf = ensemble.GradientBoostingClassifier()
        
        train = clf.fit(X_train, y_train)
        
        results = '    Training set score:', clf.score(X_train, y_train)
        results += 'Test set score:', clf.score(X_test, y_test)
        
        return results
    
    def run_tfidf(self, corpus):
        print('Running tf-idf Analysis')
        corpus_paras = self.get_paragraphs(corpus)
        cleaned_paras = self.clean_paragraphs(corpus_paras)
        token_paras = self.tokenize_paragraphs(cleaned_paras)

        X_train, X_test = train_test_split(cleaned_paras, test_size=0.4, random_state=0)
        
        vectorizer = TfidfVectorizer(max_df=0.5,
                                    min_df=2,
                                    stop_words='english',
                                    lowercase=True,
                                    use_idf=True,
                                    norm=u'l2',
                                    smooth_idf=True)
        print("  Vectorizer Built")
        
        corpus_paras_tfidf=vectorizer.fit_transform(token_paras)
        print("  Number of features: %d" % corpus_paras_tfidf.get_shape()[1])
        
        X_train_tfidf, X_test_tfidf= train_test_split(corpus_paras_tfidf, test_size=0.4, random_state=0)
        
        X_train_tfidf_csr = X_train_tfidf.tocsr()

        n = X_train_tfidf_csr.shape[0]

        tfidf_bypara = [{} for _ in range(0,n)]

        terms = vectorizer.get_feature_names()

        for i, j in zip(*X_train_tfidf_csr.nonzero()):
            tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

        print('  Original sentence:', X_train[5])
        print('  Tf_idf vector:', tfidf_bypara[5]) 
        
        svd= TruncatedSVD(700)
        lsa = make_pipeline(svd, Normalizer(copy=False))

        X_train_lsa = lsa.fit_transform(X_train_tfidf)
        
        variance_explained=svd.explained_variance_ratio_

        total_variance = variance_explained.sum()

        print("  Percent variance captured by all components:",total_variance*100)
        
        paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
        for i in range(5):
            print('  Component {}:'.format(i))
            print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:10])

    def get_paragraphs(self, corpus):
        print('  Getting Novel paragraphs')
        para = gutenberg.paras(corpus)
        return para
    
    def clean_paragraphs(self, paras):
        print('  Cleaning Paragraphs')
        clean_paras = []
        for paragraph in paras:
            para = paragraph[0]
            para = ' '.join(para)
            para = nlpmodel.text_cleaner(para, False)
            clean_paras.append(para)
        
        return clean_paras
    
    def tokenize_paragraphs(self, paras):
        print('  Tokenizing Paragraphs')
        token_paras = []
        for para in paras:
            token_paras.append(str(self.tokenize(para, False)))

        return token_paras

In [282]:
nlpmodel = NLPModel()
nlpmodel.run_bow('carroll-alice.txt', 'Carroll', 'austen-persuasion.txt', 'Austen')
nlpmodel.run_tfidf('austen-emma.txt')

Using Bag of Words Analysis:
  Getting Novel
  Getting Novel
  Running Text Cleaner
  Running Text Cleaner
  Tokenizing
  Tokenizing
  Getting Sentences
  Getting Sentences
  Running Bag of Words
  Running Bag of Words
  Setting Common Words
  Running BoW Features
    Processing row 0
    Processing row 50
    Processing row 100
    Processing row 150
    Processing row 200
    Processing row 250
    Processing row 300
    Processing row 350
    Processing row 400
    Processing row 450
    Processing row 500
    Processing row 550
    Processing row 600
    Processing row 650
    Processing row 700
    Processing row 750
    Processing row 800
    Processing row 850
    Processing row 900
    Processing row 950
    Processing row 1000
    Processing row 1050
    Processing row 1100
    Processing row 1150
    Processing row 1200
    Processing row 1250
    Processing row 1300
    Processing row 1350
    Processing row 1400
    Processing row 1450
    Processing row 1500
    Processing



('    Training set score:', 0.9862068965517241, 'Test set score:', 0.8952067669172933)
  Training Logistic Regression




('    Training set score:', 0.9579937304075236, 'Test set score:', 0.9579937304075236)
  Training Boosted Classifier
('    Training set score:', 0.886833855799373, 'Test set score:', 0.8735902255639098)
Running tf-idf Analysis
  Getting Novel paragraphs
  Cleaning Paragraphs
  Tokenizing Paragraphs
  Vectorizer Built
  Number of features: 1931
  Original sentence: A very few minutes more , however , completed the present trial .
  Tf_idf vector: {'minutes': 0.7127450310382584, 'present': 0.701423210857947}
  Percent variance captured by all components: 92.93742258829678
  Component 0:
" Oh !    0.999171
" Oh !    0.999171
" Oh !    0.999171
" Oh !    0.999171
" Oh !    0.999171
" Oh !    0.999171
" Oh !    0.999171
" Oh !    0.999171
" Oh !    0.999171
" Oh !    0.999171
Name: 0, dtype: float64
  Component 1:
" There were misunderstandings between them , Emma ; he said so expressly .                                                                                                 0.51762