### NLP SUPERVISED MODEL

In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
%matplotlib inline

In [2]:
import nltk
from nltk.corpus import gutenberg
import spacy
import re
from sklearn.model_selection import train_test_split

In [3]:
# Grab and process the raw data.
print(gutenberg.fileids())
bible = gutenberg.raw('bible-kjv.txt')
paradise = gutenberg.raw('milton-paradise.txt')
print(bible[0:500])
print(paradise[0:100])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
[The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters.

1:3 And God said, Let there be light: and there was light.

1:4 And God saw the light, that it was good: and God divided the light
from the darkness.

1:5 And God called the light Da
[Paradise Lost by John Milton 1667] 
 
 
Book I 
 
 
Of Man's first disobedienc

In [4]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

# The Chapter indicator is idiosyncratic
bible = re.sub(r'\d:*', '', bible)
bible = re.sub(r',:', '', bible)
paradise = re.sub(r'Book .*', '', paradise)

paradise = text_cleaner(paradise[:int(len(paradise)/10)])
bible = text_cleaner(bible[:int(len(bible)/50)])

In [5]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en_core_web_sm')
paradise_doc = nlp(paradise)
bible_doc = nlp(bible)

In [6]:
# Group into sentences.
paradise_sents = [[sent, "Milton"] for sent in paradise_doc.sents]
bible_sents = [[sent, "bible"] for sent in bible_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(paradise_sents + bible_sents)
sentences.tail()

Unnamed: 0,0,1
1033,"(And, Isaac, went, out, to, meditate, in, the,...",bible
1034,"(And, Rebekah, lifted, up, her, eyes, ,, and, ...",bible
1035,"(For, she, had, said, unto, the, servant, ,, W...",bible
1036,"(And, the, servant, had, said, ,)",bible
1037,"(It, is, my, master, :, t)",bible


In [7]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
paradisewords = bag_of_words(paradise_doc)
biblewords = bag_of_words(bible_doc)

# Combine bags to create a set of unique words.
common_words = set(paradisewords + biblewords)

#### Feature Creation using the BoW Method
In This method features are common words on the texts.

In [8]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600
Processing row 650
Processing row 700
Processing row 750
Processing row 800
Processing row 850
Processing row 900
Processing row 950
Processing row 1000


Unnamed: 0,Accept,Rehoboth,wile,battlement,wilt,fight,obey,renown,suggestion,frequent,...,abroad,whale,hurl,grave,aileth,expatiate,pregnant,Danaw,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Of, Man, 's, first, disobedience, ,, and, the...",Milton
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(,, I, thence, Invoke, thy, aid, to, my, adven...",Milton
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, chiefly, thou, ,, O, Spirit, ,, that, do...",Milton
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(outspread, ,, Dove, -, like, sat'st, brooding...",Milton
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,"(it, pregnant, :, what, in, me, is, dark, Illu...",Milton


In [40]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

y = word_counts['text_source']
x = word_counts.drop(['text_source', 'text_sentence'], axis = 1)
svd= TruncatedSVD(20)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
x_lsa = lsa.fit_transform(x)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)


Percent variance captured by all components: 29.309532241506115


In [41]:
x_train, x_test, y_train, y_test = train_test_split(x_lsa, y, test_size=0.4, random_state=0)

In [42]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
train = lr.fit(x_train, y_train)
print(x_train.shape, y_train.shape)
print('Training set score:', lr.score(x_train, y_train))
print('\nTest set score:', lr.score(x_test, y_test))

(622, 20) (622,)
Training set score: 0.9453376205787781

Test set score: 0.9302884615384616




#### Feature Creation using the tf-idf Method.
In this method however the features are the sentences in the texts.

In [9]:
#reading in the data, this time in the form of paragraphs
paradise_para = gutenberg.paras('milton-paradise.txt')
bible_para=gutenberg.paras('bible-kjv.txt')

paradise_paras=[]
for paragraph in paradise_para:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    para = [re.sub(r'\d*','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    paradise_paras.append(' '.join(para))
print(paradise_paras[0:4])

['[ Paradise Lost by John Milton  ]', 'Book I', "Of Man ' s first disobedience , and the fruit Of that forbidden tree whose mortal taste Brought death into the World , and all our woe , With loss of Eden , till one greater Man Restore us , and regain the blissful seat , Sing , Heavenly Muse , that , on the secret top Of Oreb , or of Sinai , didst inspire That shepherd who first taught the chosen seed In the beginning how the heavens and earth Rose out of Chaos : or , if Sion hill Delight thee more , and Siloa ' s brook that flowed Fast by the oracle of God , I thence Invoke thy aid to my adventurous song , That with no middle flight intends to soar Above th ' Aonian mount , while it pursues Things unattempted yet in prose or rhyme .", 'Book II']


In [10]:
bible_paras=[]
for paragraph in bible_para:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    para = [re.sub(r'\d*','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    bible_paras.append(' '.join(para))

print(bible_paras[0:4])

['[ The King James Bible ]', 'The Old Testament of the King James Bible', 'The First Book of Moses : Called Genesis', ' :  In the beginning God created the heaven and the earth .']


In [11]:
#importing the TfidfVectorizer which creates vectors of terms for each document.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
paradise_paras_tfidf=vectorizer.fit_transform(paradise_paras)
paradise_features = vectorizer.get_feature_names()
print("Number of features: {}".format(paradise_paras_tfidf.get_shape()[1]))

Number of features: 43


In [12]:
#Applying the vectorizer
bible_paras_tfidf=vectorizer.fit_transform(bible_paras)
bible_features = vectorizer.get_feature_names()
print("Number of features: {}".format(bible_paras_tfidf.get_shape()[1]))

Number of features: 7880


In [13]:
#Change the term frequency matrix data to pandas data frame.
paradise_paras_tfidf_df = pd.DataFrame(paradise_paras_tfidf.todense())
paradise_paras_tfidf_df = paradise_paras_tfidf_df.round(1)
#print the features as column names of the data frame.
paradise_paras_tfidf_df.columns = paradise_features
#create the labels
paradise_label = pd.Series(data = 29*['Milton'])
paradise_label = pd.DataFrame(paradise_label, columns = ['authors'])

# The dimension of the paradise data frame
print('Number of rows and colummns in paradise dataframe: {}'.format(paradise_paras_tfidf_df.shape))

# #Lets concatnate the data column with the label
paradise_paras_tfidf_df = pd.concat([paradise_paras_tfidf_df, paradise_label], axis = 1)
paradise_paras_tfidf_df.head()


Number of rows and colummns in paradise dataframe: (29, 43)


Unnamed: 0,adam,angel,beginning,book,celestial,earth,eve,fixed,flight,fruit,...,thee,thou,thy,till,unblam,voice,waked,woe,world,authors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Milton
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Milton
2,0.0,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.2,0.2,...,0.2,0.0,0.2,0.2,0.0,0.0,0.0,0.2,0.2,Milton
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Milton
4,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Milton


In [14]:
# Lets follow the same procedure as the above paradise data and create our bible pandas dataframe.

bible_paras_tfidf_df = pd.DataFrame(bible_paras_tfidf.todense())
print('Number of rows and colummns in bible dataframe: {}'.format(bible_paras_tfidf_df.shape))
bible_paras_tfidf_df.columns = bible_features
bible_label = pd.Series(data = 24608 *['kjv'])
bible_label = pd.DataFrame(bible_label, columns = ['authors'])
bible_paras_tfidf_df = pd.concat([bible_paras_tfidf_df, bible_label], axis = 1)
bible_paras_tfidf_df.head()

Number of rows and colummns in bible dataframe: (24608, 7880)


Unnamed: 0,aaron,aaronites,abarim,abase,abased,abated,abba,abda,abdi,abdon,...,zohar,zophah,zophar,zorah,zorobabel,zuar,zuph,zur,zurishaddai,authors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,kjv
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,kjv
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,kjv
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,kjv
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,kjv


In [15]:
new_data = pd.concat([bible_paras_tfidf_df,paradise_paras_tfidf_df],ignore_index=True, sort = False)

In [16]:
new_data = new_data.fillna(0)

In [17]:
new_data.head()

Unnamed: 0,aaron,aaronites,abarim,abase,abased,abated,abba,abda,abdi,abdon,...,zur,zurishaddai,authors,celestial,gorgeous,morn,rosy,soar,unblam,waked
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,kjv,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,kjv,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,kjv,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,kjv,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,kjv,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
Y = new_data['authors']
X = new_data.drop('authors', axis = 1)

**Feature Extraction and Dimension reduction**

Lets use SVD for exracting only important features from the high dimension data.

In [45]:
#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_lsa = lsa.fit_transform(X)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)


Percent variance captured by all components: 23.08566422332158


The SVD model with that amount of total components explains close to 66% of the total variance in the data.So lets go with it and move to supervised learning classification approach to identify if a paragraph is from the 'bible' text or the 'paradise' text.

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X_lsa, Y, test_size=0.4, random_state=0)

In [47]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
Train = lr.fit(X_train, Y_train)
print(X_train.shape, Y_train.shape)
print('Training set score:', lr.score(X_train, Y_train))
print('\nTest set score:', lr.score(X_test, Y_test))

(14782, 100) (14782,)




Training set score: 0.9991205520227303

Test set score: 0.9983764586504312
