In [4]:
import pandas as pd

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text

In [6]:
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

In [7]:
import re
import string

In [8]:
# Colab-only step: ask the user to upload a file into the runtime's working
# directory. The cell output shows inaug_speeches.csv being saved, which is
# the file the next cell reads.
from google.colab import files
uploaded = files.upload()

Saving inaug_speeches.csv to inaug_speeches.csv


In [15]:
# Show up to 150 characters per cell so the speech excerpts are readable.
# The fully qualified option name is used on purpose: pandas resolves
# shorthand names such as 'max_colwidth' by pattern matching, which can break
# if a future version adds another option with a similar name.
pd.set_option('display.max_colwidth', 150)
# Load the speeches with the pure-Python parser engine.
df = pd.read_csv('inaug_speeches.csv', engine='python')
# Visually inspect
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Inaugural Address,Date,text
0,4,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House of Representatives: ��AMONG the vicissitudes incident to life no event could have fille...
1,5,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: ��I AM again called upon by the voice of my country to execute the functions of its Chief Magistrate. When the occas...
2,6,John Adams,Inaugural Address,"Saturday, March 4, 1797","��WHEN it was first perceived, in early times, that no middle course for America remained between unlimited submission to a foreign le..."
3,7,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801","Friends and Fellow-Citizens: ��CALLED upon to undertake the duties of the first executive office of our country, I avail myself of th..."
4,8,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","��PROCEEDING, fellow-citizens, to that qualification which the Constitution requires before my entrance on the charge again conferred ..."


In [16]:
# Keep one row per president (the first one listed for each name, i.e. the
# first-term address), then reduce the frame to the speech text keyed by the
# president's name.
df = (
    df.drop_duplicates(subset=['Name'], keep='first')
      .reset_index()
      .loc[:, ['Name', 'text']]
      .set_index('Name')
)
# Visually inspect
df.head()

Unnamed: 0_level_0,text
Name,Unnamed: 1_level_1
George Washington,Fellow-Citizens of the Senate and of the House of Representatives: ��AMONG the vicissitudes incident to life no event could have fille...
John Adams,"��WHEN it was first perceived, in early times, that no middle course for America remained between unlimited submission to a foreign le..."
Thomas Jefferson,"Friends and Fellow-Citizens: ��CALLED upon to undertake the duties of the first executive office of our country, I avail myself of th..."
James Madison,"��UNWILLING to depart from examples of the most revered authority, I avail myself of the occasion now presented to express the profoun..."
James Monroe,��I SHOULD be destitute of feeling if I was not deeply affected by the strong proof which my fellow-citizens have given me of their co...


In [17]:
def clean_text_round1(text):
    '''Normalize raw speech text ahead of tokenization.

    Steps, applied in this order:
      1. lowercase the whole string;
      2. replace square-bracketed spans (e.g. "[applause]") with a space;
      3. replace every ASCII punctuation character with a space;
      4. replace every token that contains a digit with a space;
      5. replace the Unicode replacement character (left behind by read
         errors) with a space.

    Each substitution inserts a space instead of deleting, so neighbouring
    words are never glued together; runs of spaces are left as they are.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lowercase text.
    '''
    text = text.lower()
    # Raw strings keep the regex backslashes literal; the non-raw forms are
    # invalid string escapes and emit a SyntaxWarning on recent Python.
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)
    text = re.sub('�', ' ', text)
    return text
def round1(x):
    '''First cleaning pass for a single speech; defers to clean_text_round1.'''
    return clean_text_round1(x)

# Run the first cleaning pass over every speech, overwriting the raw text
df["text"] = df["text"].apply(round1)
# Visually inspect
df.head()

Unnamed: 0_level_0,text
Name,Unnamed: 1_level_1
George Washington,fellow citizens of the senate and of the house of representatives among the vicissitudes incident to life no event could have fille...
John Adams,when it was first perceived in early times that no middle course for america remained between unlimited submission to a foreign le...
Thomas Jefferson,friends and fellow citizens called upon to undertake the duties of the first executive office of our country i avail myself of th...
James Madison,unwilling to depart from examples of the most revered authority i avail myself of the occasion now presented to express the profoun...
James Monroe,i should be destitute of feeling if i was not deeply affected by the strong proof which my fellow citizens have given me of their co...


In [27]:

import nltk

# Data packages needed by word_tokenize, pos_tag and WordNetLemmatizer.
# Older NLTK releases look up 'punkt' and 'averaged_perceptron_tagger';
# newer releases (3.9+) look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead. Both sets are requested so the
# notebook runs on either.
# NOTE(review): a name unknown to the installed NLTK's index is expected to
# only print an error and return False rather than raise - confirm.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [28]:
# Noun extraction + lemmatization helper
def nouns(text):
    '''Return the nouns found in ``text`` as one space-separated string.

    The text is tokenized and part-of-speech tagged; every token whose tag
    begins with "NN" is kept and lemmatized with WordNet before the results
    are joined back together in their original order.
    '''
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = pos_tag(word_tokenize(text))

    kept = []
    for token, tag in tagged_tokens:
        # Noun tags (NN, NNS, NNP, NNPS) all share the "NN" prefix.
        if tag.startswith('NN'):
            kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)
# Build a one-column frame (same index as df) that holds only the
# lemmatized nouns of each speech
data_nouns = df['text'].apply(nouns).to_frame()
# Visually inspect
data_nouns.head()


Unnamed: 0_level_0,text
Name,Unnamed: 1_level_1
George Washington,citizen senate house representative vicissitude incident life event anxiety notification order day month hand i country voice i veneration love re...
John Adams,time course america submission legislature independence claim men reflection danger power fleet army contest dissension form government part count...
Thomas Jefferson,friend citizen duty executive office country i myself presence portion fellow citizen thanks favor consciousness task talent presentiment greatnes...
James Madison,example authority i myself occasion profound impression call country station duty i sanction mark confidence proceeding deliberate suffrage nation...
James Monroe,i destitute i proof fellow citizen confidence office function expression opinion conduct service gratification sensibility estimate importance tru...


In [29]:
# Extra stop words layered on top of scikit-learn's built-in English list
stop_noun = ["america", 'today', 'thing']
stop_words_noun_agg = text.ENGLISH_STOP_WORDS.union(stop_noun)

# TF-IDF vectorizer over single words (unigrams). Terms appearing in more
# than 80% or in fewer than 1% of the documents are dropped.
# Recent scikit-learn versions validate `stop_words` and accept only
# 'english', a list or None, so the frozenset is handed over as a sorted list.
tv_noun = TfidfVectorizer(
    stop_words=sorted(stop_words_noun_agg),
    ngram_range=(1, 1),
    max_df=.8,
    min_df=.01,
)
# Fit on the noun-only speeches and transform them to a TF-IDF doc-term matrix
data_tv_noun = tv_noun.fit_transform(data_nouns.text)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(tv_noun, 'get_feature_names_out'):
    noun_feature_names = tv_noun.get_feature_names_out()
else:
    noun_feature_names = tv_noun.get_feature_names()

# Dense doc-term frame: one row per president, one column per noun.
# Rows come from data_nouns, so its index supplies the row labels.
data_dtm_noun = pd.DataFrame(
    data_tv_noun.toarray(),
    columns=noun_feature_names,
    index=data_nouns.index,
)
# Visually inspect Document Term Matrix
data_dtm_noun.head()


Unnamed: 0_level_0,abandonment,abeyance,ability,abode,abraham,absence,absent,absolute,abstraction,abundance,abuse,academy,accept,acceptance,access,accession,accident,accommodation,accomplishment,accord,accordance,account,accountability,accumulation,achievement,acknowledgment,acquiescence,acquisition,act,action,activism,activity,addiction,addition,address,adequate,adherence,adheres,adjunct,adjustment,...,winter,wisdom,wise,wiser,wish,wit,withal,withdrawal,witness,woman,womanhood,wonder,woodsman,word,work,worker,working,workingman,workshop,worm,worry,worship,worthy,wound,wreckage,wretchedness,writ,writer,writing,wrong,wrongdoing,wrought,year,yes,yesterday,york,yorktown,youth,zeal,zone
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
George Washington,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039793,0.0,0.0,0.0,0.0,0.068135,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.038597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027063,0.0,0.0,0.0,0.0,0.0,0.0,0.0
John Adams,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03629,0.058196,0.0,0.0,0.0,0.0,0.058196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.025407,0.0,0.069511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089073,0.0,0.0,0.0,0.0,0.0,0.038005,0.0
Thomas Jefferson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058536,0.058536,0.0,0.025789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.062514,0.033159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049602,0.0
James Madison,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
James Monroe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017314,0.0,0.033302,0.0,0.0,0.0,0.050994,0.0,0.0,0.0,0.0,...,0.0,0.0,0.022263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03122,0.0,0.0,0.0,0.0,0.0,0.033302,0.050994


In [32]:
def display_topics(model, feature_names, num_top_words, topic_names=None):
    '''Print a heading and the highest-weighted terms for each topic.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``; each row is iterated as one
        topic and must support ``argsort()``.
    feature_names : sequence
        Term labels, indexed by the positions found in each topic row.
    num_top_words : int
        How many terms to print per topic, strongest first.
    topic_names : sequence, optional
        Labels for the topics. When omitted, or when the entry for a topic is
        empty, the topic's index is printed instead.
    '''
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)
        # argsort is ascending, so walk backwards from the end to take the
        # num_top_words largest weights in descending order.
        top_positions = weights.argsort()[:-num_top_words - 1:-1]
        print(", ".join(feature_names[pos] for pos in top_positions))

In [35]:
# Baseline: factorize the noun document-term matrix 'V' into 2 topics.
# random_state is fixed so that re-running the cell reproduces the same topics.
nmf_model = NMF(n_components=2, random_state=42)
# Learn the NMF model and extract the document-topic matrix 'W'
doc_topic = nmf_model.fit_transform(data_dtm_noun)
# Show the top 5 words of each topic from the topic-term matrix 'H'
# (nmf_model.components_). The column labels of data_dtm_noun are the
# vectorizer's feature names, so they are used to label the terms.
display_topics(nmf_model, data_dtm_noun.columns, 5)

In [36]:
# Refit with 8 topics; random_state is fixed so the topics are reproducible
# across re-runs.
nmf_model = NMF(n_components=8, random_state=42)
# Document-topic matrix 'W' for the 8-topic model
doc_topic = nmf_model.fit_transform(data_dtm_noun)
# Top 5 words per topic. The column labels of data_dtm_noun are the
# vectorizer's feature names, which avoids calling get_feature_names()
# (removed in scikit-learn 1.2).
display_topics(nmf_model, data_dtm_noun.columns, 5)


Topic  0
union, constitution, territory, question, opinion

Topic  1
man, american, dream, earth, president

Topic  2
business, congress, party, policy, justice

Topic  3
principle, confidence, institution, error, difficulty

Topic  4
constitution, measure, executive, care, legislature

Topic  5
freedom, strength, peace, faith, dream

Topic  6
dollar, debt, payment, question, ability

Topic  7
story, american, child, generation, friend
