In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import spacy
from nltk.corpus import state_union, stopwords
from collections import Counter

# 4.4.5 Challenge - Build Your Own NLP Model

For this challenge, you will need to choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:

1. Data cleaning / processing / language parsing
2. Create features using two different NLP methods: For example, BoW vs tf-idf.
3. Use the features to fit supervised learning models for each feature set to predict the category outcomes.  Use cross-validation and determine whether one model performed better.
5. Pick one of the models and try to increase accuracy by at least 5 percentage points.

## Data Processing

In [2]:
#See Corpus
import os
import nltk.corpus
print(os.listdir(nltk.data.find("corpora")))

['abc', 'abc.zip', 'alpino', 'alpino.zip', 'biocreative_ppi', 'biocreative_ppi.zip', 'brown', 'brown.zip', 'brown_tei', 'brown_tei.zip', 'cess_cat', 'cess_cat.zip', 'cess_esp', 'cess_esp.zip', 'chat80', 'chat80.zip', 'city_database', 'city_database.zip', 'cmudict', 'cmudict.zip', 'comparative_sentences', 'comparative_sentences.zip', 'comtrans.zip', 'conll2000', 'conll2000.zip', 'conll2002', 'conll2002.zip', 'conll2007.zip', 'crubadan', 'crubadan.zip', 'dependency_treebank', 'dependency_treebank.zip', 'dolch', 'dolch.zip', 'europarl_raw', 'europarl_raw.zip', 'floresta', 'floresta.zip', 'framenet_v15', 'framenet_v15.zip', 'framenet_v17', 'framenet_v17.zip', 'gazetteers', 'gazetteers.zip', 'genesis', 'genesis.zip', 'gutenberg', 'gutenberg.zip', 'ieer', 'ieer.zip', 'inaugural', 'inaugural.zip', 'indian', 'indian.zip', 'jeita.zip', 'kimmo', 'kimmo.zip', 'knbc.zip', 'lin_thesaurus', 'lin_thesaurus.zip', 'machado.zip', 'mac_morpho', 'mac_morpho.zip', 'masc_tagged.zip', 'movie_reviews', 'movie

In [3]:
# Import Presidential State of the Unions file ids
state_union.fileids()

['1945-Truman.txt',
 '1946-Truman.txt',
 '1947-Truman.txt',
 '1948-Truman.txt',
 '1949-Truman.txt',
 '1950-Truman.txt',
 '1951-Truman.txt',
 '1953-Eisenhower.txt',
 '1954-Eisenhower.txt',
 '1955-Eisenhower.txt',
 '1956-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1958-Eisenhower.txt',
 '1959-Eisenhower.txt',
 '1960-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1962-Kennedy.txt',
 '1963-Johnson.txt',
 '1963-Kennedy.txt',
 '1964-Johnson.txt',
 '1965-Johnson-1.txt',
 '1965-Johnson-2.txt',
 '1966-Johnson.txt',
 '1967-Johnson.txt',
 '1968-Johnson.txt',
 '1969-Johnson.txt',
 '1970-Nixon.txt',
 '1971-Nixon.txt',
 '1972-Nixon.txt',
 '1973-Nixon.txt',
 '1974-Nixon.txt',
 '1975-Ford.txt',
 '1976-Ford.txt',
 '1977-Ford.txt',
 '1978-Carter.txt',
 '1979-Carter.txt',
 '1980-Carter.txt',
 '1981-Reagan.txt',
 '1982-Reagan.txt',
 '1983-Reagan.txt',
 '1984-Reagan.txt',
 '1985-Reagan.txt',
 '1986-Reagan.txt',
 '1987-Reagan.txt',
 '1988-Reagan.txt',
 '1989-Bush.txt',
 '1990-Bush.txt',
 '1991-Bush-1.txt',
 '1991-B

In [4]:
# See the first speech of Truman and Eisenhower
truman = state_union.raw('1945-Truman.txt')
eisenhower = state_union.raw('1953-Eisenhower.txt')

In [5]:
# Parse using Spacy
nlp = spacy.load('en_core_web_sm')
truman_doc = nlp(truman)
eisenhower_doc = nlp(eisenhower)

In [6]:
# Let look at excerpts from each speech
print(truman_doc[:150])
print('\nTruman speech length:', len(truman_doc))

print('\n', eisenhower_doc[:150])
print('\neisenhower_doc speech length:', len(eisenhower_doc))

PRESIDENT HARRY S. TRUMAN'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS
 
April 16, 1945

Mr. Speaker, Mr. President, Members of the Congress:
It is with a heavy heart that I stand before you, my friends and colleagues, in the Congress of the United States.
Only yesterday, we laid to rest the mortal remains of our beloved President, Franklin Delano Roosevelt. At a time like this, words are inadequate. The most eloquent tribute would be a reverent silence.
Yet, in this decisive hour, when world events are moving so rapidly, our silence might be misunderstood and might give comfort to our enemies.
In His infinite wisdom, Almighty God has seen fit to take from us a great man who loved, and was beloved by

Truman speech length: 2194

 PRESIDENT DWIGHT D. EISENHOWER'S ANNUAL MESSAGE TO THE CONGRESS ON THE STATE OF THE UNION
 
February 2, 1953

Mr. President, Mr. Speaker, Members of the Eighty-third Congress:
I welcome the honor of appearing before you to deliver my first message to the C

In [7]:
#Group into sentences
truman_sentence = [[sent, 'Truman']for sent in truman_doc.sents]
eisenhower_sentence = [[sent, 'Eisenhower'] for sent in eisenhower_doc.sents ]

#Combine the sentences
sentences = pd.DataFrame(truman_sentence + eisenhower_sentence)
sentences

Unnamed: 0,0,1
0,"(PRESIDENT, HARRY, S., TRUMAN, 'S, ADDRESS, BE...",Truman
1,"(It, is, with, a, heavy, heart, that, I, stand...",Truman
2,"(Only, yesterday, ,, we, laid, to, rest, the, ...",Truman
3,"(At, a, time, like, this, ,, words, are, inade...",Truman
4,"(The, most, eloquent, tribute, would, be, a, r...",Truman
5,"(Yet, ,, in, this, decisive, hour, ,, when, wo...",Truman
6,"(In, His, infinite, wisdom, ,, Almighty, God, ...",Truman
7,"(No, man, could, possibly, fill, the, tremendo...",Truman
8,"(No, words, can, ease, the, aching, hearts, of...",Truman
9,"(The, world, knows, it, has, lost, a, heroic, ...",Truman


## Bag of Words(BoW) Features

In [8]:
# Create bag of words function for each text
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(500)]

# Get bags 
truman_words = bag_of_words(truman_doc)
eisenhower_words = bag_of_words(eisenhower_doc)

# Combine bags to create common set of unique words
common_words = set(truman_words + eisenhower_words)

In [9]:
print('\nLength of Common_words: ', len(common_words))
print('\n', common_words)



Length of Common_words:  861

 {'Admiral', 'unpunished', 'pay', 'equipment', 'hope', 'attack', 'embrace', 'committee', 'yesterday', 'wish', 'function', 'noble', 'opportunity', 'industrial', 'direction', 'alike', 'political', 'safety', 'resolute', 'good', 'democratic', 'devoid', 'Western', 'future', 'waste', 'frequently', 'call', 'enlighten', 'conservation', 'strength', 'calculated', 'sufficient', 'price', 'administrative', 'State', 'require', 'judge', 'dominate', 'fill', 'understanding', 'settlement', 'search', 'defeatism', 'employment', '30', 'American', 'unconscious', 'wisdom', 'worker', 'agricultural', 'shortage', 'humanity', 'match', 'cover', 'dark', 'expire', 'instruct', 'land', 'partial', 'million', 'staggering', 'destroy', 'nery', 'proper', 'vital', 'exchange', 'Mr.', 'beat', 'bitter', 'high', 'local', 'last', 'grow', 'colleague', 'barrier', 'proportion', 'today', 'want', 'population', 'wage', 'improve', 'offer', 'summon', 'join', 'conspiracy', 'insist', 'school', 'problem', 'v

In [10]:
# Create bag of words data frame using combined common words and sentences
def bow_features(sentences, common_words):
    
    # Build data frame
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentences in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentences
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
    
    return df

In [11]:
# Create bow features 
bow = bow_features(sentences, common_words)
bow.head()

Unnamed: 0,Admiral,unpunished,pay,equipment,hope,attack,embrace,committee,yesterday,wish,...,Hitler,income,accept,void,determine,year,situation,\n \n,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,"(PRESIDENT, HARRY, S., TRUMAN, 'S, ADDRESS, BE...",Truman
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(It, is, with, a, heavy, heart, that, I, stand...",Truman
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,"(Only, yesterday, ,, we, laid, to, rest, the, ...",Truman
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(At, a, time, like, this, ,, words, are, inade...",Truman
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, most, eloquent, tribute, would, be, a, r...",Truman


## TF-Features
### tdidf (term frequency–inverse document frequency)

In [12]:
# Grab sentence level documents in NLTK
truman = state_union.sents('1945-Truman.txt')
eisenhower = state_union.sents('1953-Eisenhower.txt')

In [13]:
# Create list of text
truman_list = ["".join(sent) for sent in truman]
eisenhower_list = [" ".join(sent) for sent in eisenhower]
joined = truman_list + eisenhower_list


In [14]:
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, 
                             min_df=2, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )

tfidf = vectorizer.fit_transform(joined).tocsr()

In [15]:
tfidf

<465x605 sparse matrix of type '<class 'numpy.float64'>'
	with 2333 stored elements in Compressed Sparse Row format>

## Supervised Learning Models
Evaluate each feature set using cross validation.  Models tested: Logistic Regression, Random Forest, & Gradient Boosting.

In [16]:
from sklearn.model_selection import cross_val_score

# Specify model inputs for each feature set

# BoW
X_bow = bow.drop(['text_sentence', 'text_source'], 1)
Y_bow = bow['text_source']

# Tfidf
X_tfidf = tfidf
Y_tfidf = ['Truman']*len(truman_list) + ['eisenhower']*len(eisenhower_list)

In [17]:
Y_tfidf

['Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',
 'Truman',

In [18]:
X_bow.head()

Unnamed: 0,Admiral,unpunished,pay,equipment,hope,attack,embrace,committee,yesterday,wish,...,determination,yearn,Hitler,income,accept,void,determine,year,situation,\n \n
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

#BoW
lr = LogisticRegression()
lr_bow = lr.fit(X_bow, Y_bow)
print('BoW LogReg Scores: ', cross_val_score(lr_bow, X_bow, Y_bow, cv = 5))
print('Avg Score: ', np.mean(cross_val_score(lr_bow, X_bow, Y_bow, cv = 5)))

#Tfidf
lr = LogisticRegression()
lr_tfidf = lr.fit(X_bow, Y_bow)
print('\nTfidf LogReg Scores:', cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5)))



BoW LogReg Scores:  [0.77319588 0.8125     0.79166667 0.83333333 0.78947368]




Avg Score:  0.8000339120998372





Tfidf LogReg Scores: [0.74468085 0.75268817 0.75268817 0.75268817 0.75      ]




Avg Score: 0.7505490734385724


### Random Forest

In [20]:
from sklearn import ensemble

# BoW
rfc = ensemble.RandomForestClassifier()
rfc_bow = rfc.fit(X_bow, Y_bow)
print('BoW Random Forest Scores: ', cross_val_score(rfc_bow, X_bow, Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(rfc_bow, X_bow, Y_bow, cv=5)))

# Tfidf
rfc = ensemble.RandomForestClassifier()
rfc_tfidf = rfc.fit(X_tfidf, Y_tfidf)
print('\nTfidf Random Forest Scores:', cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv=5)))



BoW Random Forest Scores:  [0.77319588 0.77083333 0.78125    0.78125    0.81052632]
Avg Score: 0.7876851600651112





Tfidf Random Forest Scores: [0.71276596 0.65591398 0.76344086 0.68817204 0.72826087]
Avg Score: 0.7182203853461052


### Gradient Boosting

In [21]:
# BoW
clf = ensemble.GradientBoostingClassifier()
clf_bow = clf.fit(X_bow, Y_bow)
print('Bow Gradient Boosting Scores:', cross_val_score(clf_bow, X_bow,Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(clf_bow, X_bow, Y_bow, cv=5)))

# Tfidf
clf = ensemble.GradientBoostingClassifier()
clf_tfidf = clf.fit(X_tfidf, Y_tfidf)
print('\nTfidf Random Forest Scores:', cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv=5)))

Bow Gradient Boosting Scores: [0.71134021 0.80208333 0.77083333 0.75       0.77894737]
Avg Score: 0.7668294447458853

Tfidf Random Forest Scores: [0.75531915 0.76344086 0.75268817 0.75268817 0.75      ]
Avg Score: 0.7548272706474493


## Try to increase Accuracy by 5%

### Use BoW to increase

In [22]:
# Create bag of words function for each text
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(1000)]

# Get bags 
truman_words = bag_of_words(truman_doc)
eisenhower_words = bag_of_words(eisenhower_doc)

# Combine bags to create common set of unique words
common_words = set(truman_words + eisenhower_words)

In [23]:
# Create bow features 
new_bow = bow_features(sentences, common_words)
new_bow.head()

Unnamed: 0,Admiral,wish,size,alike,safety,devoid,democratic,Western,frequently,conservation,...,have,backward,contingency,secret,November,persist,determine,situation,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(PRESIDENT, HARRY, S., TRUMAN, 'S, ADDRESS, BE...",Truman
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(It, is, with, a, heavy, heart, that, I, stand...",Truman
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Only, yesterday, ,, we, laid, to, rest, the, ...",Truman
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(At, a, time, like, this, ,, words, are, inade...",Truman
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, most, eloquent, tribute, would, be, a, r...",Truman


In [24]:
# Make new X and Y inputs
X_new_bow = new_bow.drop(['text_sentence', 'text_source'], 1)
Y_new_bow = new_bow['text_source']

# Rerun BoW
lr = LogisticRegression()
lr_new_bow = lr.fit(X_new_bow, Y_new_bow)
print('New_BoW (big) Logistic Regression Scores: ', cross_val_score(lr_new_bow, X_new_bow, Y_new_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_new_bow, X_new_bow, Y_new_bow, cv=5)))

#Tfidf
lr = LogisticRegression()
lr_tfidf = lr.fit(X_new_bow, Y_new_bow)
print('\nTfidf LogReg Scores:', cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5)))



New_BoW (big) Logistic Regression Scores:  [0.77319588 0.8125     0.79166667 0.83333333 0.78947368]




Avg. Score  0.8000339120998372





Tfidf LogReg Scores: [0.74468085 0.75268817 0.75268817 0.75268817 0.75      ]




Avg Score: 0.7505490734385724


####  Increase common word to 2000 and include punctuation and stop word

In [25]:
# Include punctuation
# Create bag of words function for each text
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(2000)]

# Get bags 
truman_words = bag_of_words(truman_doc)
eisenhower_words = bag_of_words(eisenhower_doc)

# Combine bags to create common set of unique words
common_words = set(truman_words + eisenhower_words)

In [26]:
# Create bow features 
new_bow_1 = bow_features(sentences, common_words)


KeyboardInterrupt: 

In [None]:
# Make new X and Y inputs
X_new_bow = new_bow_1.drop(['text_sentence', 'text_source'], 1)
Y_new_bow = new_bow_1['text_source']

# Rerun BoW
lr = LogisticRegression()
lr_new_bow = lr.fit(X_new_bow, Y_new_bow)
print('New_BoW (big) Logistic Regression Scores: ', cross_val_score(lr_new_bow, X_new_bow, Y_new_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_new_bow, X_new_bow, Y_new_bow, cv=5)))

## Use tfidf to increase accuracy

In [None]:
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.80, 
                             min_df=4, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )

tfidf = vectorizer.fit_transform(joined).tocsr()

In [None]:
from sklearn.linear_model import LogisticRegression

#Tfidf
lr = LogisticRegression()
lr_tfidf = lr.fit(X_bow, Y_bow)
print('\nTfidf LogReg Scores:', cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5)))

### Pick A Model and Try to Increase Accuracy by 5%

__Model: Logistic Regression Using BoW Feature Set__

In [27]:
# Create bow features 
big_bow = bow_features(sentences, common_words)

KeyboardInterrupt: 

In [None]:
big_bow.head()

In [None]:
# Make new X and Y inputs
X_big_bow = big_bow.drop(['text_sentence', 'text_source'], 1)
Y_big_bow = big_bow['text_source']

# Rerun BoW
lr = LogisticRegression()
lr_big_bow = lr.fit(X_big_bow, Y_big_bow)
print('BoW (big) Logistic Regression Scores: ', cross_val_score(lr_big_bow, X_big_bow, Y_big_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_big_bow, X_big_bow, Y_big_bow, cv=5)))

Using a bigger bag of words actually made the average score get worse by about 1%.  Try out another method - include punctuation in BoW.

In [None]:
# Update function, go back to 500 most common words and add in punctuation
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_stop]
                   
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(500)]

# Get bags 
bush_words = bag_of_words(bush_doc)
clinton_words = bag_of_words(clinton_doc)

# Combine bags to create common set of unique words
common_words = set(bush_words + clinton_words)

In [None]:
# Create bow features 
bow = bow_features(sentences, common_words)

In [None]:
# Regenerate model features
X_bow = bow.drop(['text_sentence', 'text_source'], 1)
Y_bow = bow['text_source']

In [None]:
# Rerun model
lr = LogisticRegression(
    )
lr_bow = lr.fit(X_bow, Y_bow)
print('BoW #3 - Logistic Regression Scores: ', cross_val_score(lr_bow, X_bow, Y_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_bow, X_bow, Y_bow, cv=5)))