In [4]:
# import necessary libraries
import pandas as pd
import numpy as np

# data visualization libraries
# from mpl_toolkits.mplot3d import Axes3D
# import matplotlib.pyplot as plt
# %matplotlib inline
from sklearn.manifold import TSNE
import nltk
# nltk.download('punkt') #uncomment to run first time only
from nltk.tokenize import word_tokenize
np.random.seed(0)


In [5]:
# import the state of the union addresses
from nltk.corpus import state_union

In [6]:
# store state of the union address as a list
speech_list = list(state_union.fileids())


# import a single speech to see what our text looks like so that we can make sure we clean and tokenize it correctly.
with open('data/1945-Truman.txt') as f:
    test_speech = f.readlines()
    print(test_speech)

["PRESIDENT HARRY S. TRUMAN'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS\n", ' \n', 'April 16, 1945\n', '\n', 'Mr. Speaker, Mr. President, Members of the Congress:\n', 'It is with a heavy heart that I stand before you, my friends and colleagues, in the Congress of the United States.\n', 'Only yesterday, we laid to rest the mortal remains of our beloved President, Franklin Delano Roosevelt. At a time like this, words are inadequate. The most eloquent tribute would be a reverent silence.\n', 'Yet, in this decisive hour, when world events are moving so rapidly, our silence might be misunderstood and might give comfort to our enemies.\n', 'In His infinite wisdom, Almighty God has seen fit to take from us a great man who loved, and was beloved by, all humanity.\n', 'No man could possibly fill the tremendous void left by the passing of that noble soul. No words can ease the aching hearts of untold millions of every race, creed and color. The world knows it has lost a heroic champion of j

In [7]:
filenames = [
    '1945-Truman.txt', '1966-Johnson.txt', '1988-Reagan.txt',
    '1946-Truman.txt', '1967-Johnson.txt', '1989-Bush.txt', 
    '1947-Truman.txt', '1968-Johnson.txt', '1990-Bush.txt',
    '1948-Truman.txt', '1969-Johnson.txt', '1949-Truman.txt',
    '1991-Bush-1.txt', '1970-Nixon.txt', '1991-Bush-2.txt',
    '1950-Truman.txt', '1971-Nixon.txt', '1992-Bush.txt',
    '1951-Truman.txt', '1972-Nixon.txt', '1993-Clinton.txt',
    '1953-Eisenhower.txt',  '1973-Nixon.txt', '1994-Clinton.txt',
    '1954-Eisenhower.txt',  '1974-Nixon.txt', '1995-Clinton.txt',
    '1955-Eisenhower.txt', '1975-Ford.txt', '1996-Clinton.txt',
    '1956-Eisenhower.txt', '1976-Ford.txt', '1997-Clinton.txt',
    '1957-Eisenhower.txt', '1977-Ford.txt', '1998-Clinton.txt',
    '1958-Eisenhower.txt', '1978-Carter.txt', '1999-Clinton.txt',
    '1959-Eisenhower.txt', '1979-Carter.txt', '2000-Clinton.txt',
    '1960-Eisenhower.txt', '1980-Carter.txt', '2001-GWBush-1.txt',
    '1961-Kennedy.txt', '1981-Reagan.txt', '2001-GWBush-2.txt',
    '1962-Kennedy.txt', '1982-Reagan.txt', '2002-GWBush.txt',
    '1963-Johnson.txt', '1983-Reagan.txt', '2003-GWBush.txt',
    '1963-Kennedy.txt', '1984-Reagan.txt', '2004-GWBush.txt',
    '1964-Johnson.txt', '1985-Reagan.txt', '2005-GWBush.txt',
    '1965-Johnson-1.txt', '1986-Reagan.txt', '2006-GWBush.txt',
    "1965-Johnson-2.txt", '1987-Reagan.txt'
]

In [8]:
# make every word lowercase, remove newline characters \n, and any of the following punctuation marks: ",.'?!"
# remove -- [laughter]-- or other audience reactions
def clean_speech(speech):
    cleaned_speech = []
    for line in speech:
        if not '[' in line and  not ']' in line:
            for symbol in "-,.?!''\n":
                line = line.replace(symbol, '').lower()
            cleaned_speech.append(line)

    return cleaned_speech

scrubbed_speech = clean_speech(test_speech)
# Sanity-Check
# scrubbed_speech

In [9]:
def tokenize(speech):
    """
    :param speech: 
    joins all words as single string 
    
    :return: 
    fully tokenized version of the speech
    """
    joined_speech = ' '.join(speech)
    tokenized_speech = word_tokenize(joined_speech)
    
    return tokenized_speech

tokenized_test_speech = tokenize(scrubbed_speech)
tokenized_test_speech[:10]

['president',
 'harry',
 's',
 'trumans',
 'address',
 'before',
 'a',
 'joint',
 'session',
 'of']

In [10]:
# convert the entire text into a vector, where each element in the vector represents a different word. The vector is the 
# length of the entire vocabulary--usually, every word that occurs in the English language, or at least every word that appears in our corpus. Any given sentence can then be represented as a vector where all the vector is 1 (or some other value) for each time that word appears in the sentence.

def count_vectorize(speech, vocab=None):
    """
    
    :param speech: 
    :param vocab: 
    takes in a tokenized, cleaned song
    
    :return: 
    returns a Count Vectorized representation of speech as a python dictionary
    """
    
    if vocab:
        unique_words = vocab
    else:
        unique_words = list(set(speech))
    
    speech_dict = {i:0 for i in unique_words}
    
    for word in speech:
        speech_dict[word] += 1
    
    return speech_dict

test_vectorized = count_vectorize(tokenized_test_speech)
print(test_vectorized)

{'preserve': 1, 'unnecessary': 1, 'leader': 1, 'lot': 1, 'duty': 2, 'faith': 5, 'suffering': 2, 'out': 1, 'was': 3, 'fathers': 1, 'doomed': 1, 'human': 2, 'great': 6, 'lost': 1, 'dmiral': 1, 'aggressors': 3, 'me': 2, 'dark': 1, 'tribute': 1, 'entire': 5, 'require': 1, 'certain': 2, 'tragic': 2, 'people': 10, 'men': 2, 'direction': 2, 'find': 1, 'retreat': 1, 'preserved': 1, 'looked': 2, 'something': 1, 'made': 2, 'prevented': 1, 'achieved': 1, 'protected': 1, 'barriers': 1, 'some': 1, 'glorious': 1, 'man': 5, 'am': 1, 'noble': 1, 'women': 2, 'secured': 1, 'humbly': 1, 'partial': 1, 'guidance': 1, 'indicates': 1, 'conflict': 1, 'and': 52, 'tokyo': 1, 'continued': 1, 'yield': 1, 'support': 3, 'making': 1, 'as': 5, 'almighty': 2, 'obligations': 1, 'pilot': 1, 'befor': 1, 'standards': 1, 'powers': 1, 'unhampered': 1, 'bring': 1, 'lord': 1, 'faced': 2, 'sibility': 1, 'has': 10, 'earned': 1, 'between': 1, 'can': 12, 'presidentwe': 1, 'therefore': 1, 'showed': 1, 'champion': 1, 'make': 1, 'fi

In [11]:
# calculate the term frequency

def term_frequency(BoW_dict):
    total_word_count = sum(BoW_dict.values())
    
    for ind, val in BoW_dict.items():
        BoW_dict[ind] = val/ total_word_count
    
    return BoW_dict

test = term_frequency(test_vectorized)
print(list(test)[10:20])

['doomed', 'human', 'great', 'lost', 'dmiral', 'aggressors', 'me', 'dark', 'tribute', 'entire']


In [12]:
# calculate Inverse Document Frequency
def inverse_document_frequency(list_of_dicts):
    vocab_set = set()
    # Iterate through list of dfs and add index to vocab_set
    for d in list_of_dicts:
        for word in d.keys():
            vocab_set.add(word)
    
    # Once vocab set is complete, create an empty dictionary with a key for each word and value of 0.
    full_vocab_dict = {i:0 for i in vocab_set}
    
    # Loop through each word in full_vocab_dict
    for word, val in full_vocab_dict.items():
        docs = 0
        
        # Loop through list of dicts.  Each time a dictionary contains the word, increment docs by 1
        for d in list_of_dicts:
            if word in d:
                docs += 1
        
        # Now that we know denominator for equation, compute and set IDF value for word
        
        full_vocab_dict[word] = np.log((len(list_of_dicts)/ float(docs)))
    
    return full_vocab_dict



In [13]:
def tf_idf(list_of_dicts):
    # Create empty dictionary containing full vocabulary of entire corpus
    doc_tf_idf = {}
    idf = inverse_document_frequency(list_of_dicts)
    full_vocab_list = {i:0 for i in list(idf.keys())}
    
    # Create tf-idf list of dictionaries, containing a dictionary that will be updated for each document
    tf_idf_list_of_dicts = []
    
    # Now, compute tf and then use this to compute and set tf-idf values for each document
    for doc in list_of_dicts:
        doc_tf = term_frequency(doc)
        for word in doc_tf:
            doc_tf_idf[word] = doc_tf[word] * idf[word]
        tf_idf_list_of_dicts.append(doc_tf_idf)
    
    return tf_idf_list_of_dicts