In [27]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.manifold import TSNE
import nltk
nltk.download('punkt') 
from nltk.tokenize import word_tokenize
np.random.seed(0)


[nltk_data] Downloading package punkt to /Users/db/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
from nltk.corpus import state_union

In [17]:
# store state of the union address as a list
speech_list = list(state_union.fileids())

In [34]:
# speech_list
# import a single speech to see what our text looks like so that we can make sure we clean and tokenize it correctly.
test_speech = with open('state_of_union_speeches/state_union/2000-Clinton.txt') as f: test_speech= f.readlines()

SyntaxError: invalid syntax (<ipython-input-34-0693ec737b82>, line 3)

In [25]:
# make every word lowercase, remove newline characters \n, and any of the following punctuation marks: ",.'?!"
# remove -- [laughter]-- or other audience reactions
def clean_speech(speech):
    cleaned_speech = []
    for line in speech:
        if not '[' in line and  not ']' in line:
            for symbol in "-,.?!''\n":
                line = line.replace(symbol, '').lower()
            cleaned_speech.append(line)

    return cleaned_speech

scrubbed_speech = clean_speech(test_speech)
scrubbed_speech

['president bill clintons address before a joint session of the congress on the state of the union',
 ' ',
 'january 27 2000 ',
 '',
 'mr speaker mr vice president members of congress honored guests my fellow americans:',
 'we are fortunate to be alive at this moment in history never before has our nation enjoyed at once so much prosperity and social progress with so little internal crisis and so few external threats never before have we had such a blessed opportunity and therefore such a profound obligation to build the more perfect union of our founders dreams',
 'we begin the new century with over 20 million new jobs; the fastest economic growth in more than 30 years; the lowest unemployment rates in 30 years; the lowest poverty rates in 20 years; the lowest african american and hispanic unemployment rates on record; the first backto back surpluses in 42 years; and next month america will achieve the longest period of economic growth in our entire history we have built a new economy

In [28]:
def tokenize(speech):
    """
    :param song: 
    :return: 
    """
    joined_speech = ' '.join(speech)
    tokenized_speech = word_tokenize(joined_speech)
    
    return tokenized_speech

tokenized_test_speech = tokenize(scrubbed_speech)
tokenized_test_speech[:10]

['president',
 'bill',
 'clintons',
 'address',
 'before',
 'a',
 'joint',
 'session',
 'of',
 'the']

In [29]:
def count_vectorize(speech, vocab=None):
    if vocab:
        unique_words = vocab
    else:
        unique_words = list(set(speech))
    
    speech_dict = {i:0 for i in unique_words}
    
    for word in speech:
        speech_dict[word] += 1
    
    return speech_dict

test_vectorized = count_vectorize(tokenized_test_speech)
print(test_vectorized)

{'shoulder': 1, 'rulebased': 1, 'proposing': 1, 'media': 1, 'female': 1, 'unemployment': 3, 'roosevelt': 1, 'tie': 1, 'with': 49, 'wisely': 1, 'columbine': 1, 'cancer': 2, 'conflict': 1, 'got': 11, 'encourage': 1, 'adults': 2, 'force': 1, '150': 1, 'destruction': 1, 'reasons': 1, 'fathers': 1, 'like': 6, 'growing': 2, 'lack': 1, 'come': 1, 'triple': 1, 'different': 2, 'kind': 1, 'lot': 4, 'prevailed': 1, 'thought': 1, 'implore': 1, 'bravery': 1, 'grander': 1, 'effective': 2, 'economics': 1, 'personal': 1, 'designing': 1, 'grades': 1, 'work': 26, 'judged': 1, 'strain': 1, 'will': 50, 'embracing': 1, 'session': 1, 'reading': 1, 'crucial': 1, 'stay': 1, 'all': 58, 'once': 1, '2': 4, 'priorities': 1, 'judicial': 1, 'begin': 5, 'lands': 2, '6000': 1, 'predict': 1, '1989': 1, 'downpayment': 2, 'theirs': 1, 'spend': 1, 'directly': 1, 'again': 11, 'clintons': 1, 'acts': 1, 'wise': 1, 'student': 2, 'families': 14, 'increasing': 2, 'andlisten': 1, 'rights': 3, 'reconciliation': 2, 'eager': 1, 'e

In [30]:
# calculate the term frequency

def term_frequency(BoW_dict):
    total_word_count = sum(BoW_dict.values())
    
    for ind, val in BoW_dict.items():
        BoW_dict[ind] = val/ total_word_count
    
    return BoW_dict

test = term_frequency(test_vectorized)
print(list(test)[10:20])

['columbine', 'cancer', 'conflict', 'got', 'encourage', 'adults', 'force', '150', 'destruction', 'reasons']


In [31]:
# calculate Inverse Document Frequency
def inverse_document_frequency(list_of_dicts):
    vocab_set = set()
    # Iterate through list of dfs and add index to vocab_set
    for d in list_of_dicts:
        for word in d.keys():
            vocab_set.add(word)
    
    # Once vocab set is complete, create an empty dictionary with a key for each word and value of 0.
    full_vocab_dict = {i:0 for i in vocab_set}
    
    # Loop through each word in full_vocab_dict
    for word, val in full_vocab_dict.items():
        docs = 0
        
        # Loop through list of dicts.  Each time a dictionary contains the word, increment docs by 1
        for d in list_of_dicts:
            if word in d:
                docs += 1
        
        # Now that we know denominator for equation, compute and set IDF value for word
        
        full_vocab_dict[word] = np.log((len(list_of_dicts)/ float(docs)))
    
    return full_vocab_dict



In [32]:
def tf_idf(list_of_dicts):
    # Create empty dictionary containing full vocabulary of entire corpus
    doc_tf_idf = {}
    idf = inverse_document_frequency(list_of_dicts)
    full_vocab_list = {i:0 for i in list(idf.keys())}
    
    # Create tf-idf list of dictionaries, containing a dictionary that will be updated for each document
    tf_idf_list_of_dicts = []
    
    # Now, compute tf and then use this to compute and set tf-idf values for each document
    for doc in list_of_dicts:
        doc_tf = term_frequency(doc)
        for word in doc_tf:
            doc_tf_idf[word] = doc_tf[word] * idf[word]
        tf_idf_list_of_dicts.append(doc_tf_idf)
    
    return tf_idf_list_of_dicts

