In [1]:
import numpy as np
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

"""
This is a class sherlock. 
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""
class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, speech_text):
        """
        The __init__ method is called everytime an object is instantiated.
        This is where you will define all the properties of the object that it must have
        when it is `born`.
        """
        
        #These are data members
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
        
        
        
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
        
    def demo_self():
        print 'this will error out'

In [2]:
import numpy as np
import codecs
import nltk
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
import math
import heapq


class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        

        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
        #create document_term_matrix
        #self.document_term_matrix()
        
        #create tf_idf scores
        #self.tf_idf()
        
        #create dict_rank
        #self.dict_rank(rep_tokens)
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 
        
        
    ###### 1.1 doc_term ######   
    def document_term_matrix(self):
        """
        description: create a D by V array of frequency counts.
        """
        def term_count(doc):
            
            #initialize an array with size V
            term_count = [0]*len(self.token_set)
            for token in doc.tokens:
                if token in self.token_set:
                    term_count[list(self.token_set).index(token)] += 1 
            return term_count
                   
        self.doc_term_matrix = [term_count(doc) for doc in self.docs]
        
        
   
    ###### 1.2 tf_idf ######
    
    def tf_idf(self):
        """
        description: create a D by V array of tf-idf scores
        """
        
        
        def tf(doc):
            #initialize an array of size V
            term_count = [0]*len(self.token_set)
            for token in doc.tokens:
                if token in self.token_set:
                    term_count[list(self.token_set).index(token)] += 1 
            return term_count
        
        idf = [0]*len(self.token_set)
        for token in self.token_set:
            freq = 0
            for doc in self.docs:
                freq += (token in doc.tokens)
            idf[list(self.token_set).index(token)] = math.log(self.N/freq)
            
        def get_score(doc):
            t_f = tf(doc)
        
            for i,term in enumerate(t_f):
                if term != 0:
                    t_f[i] = (1 + math.log(term)) * idf[i]
            return t_f
            
        self.tf_idf = [(doc, get_score(doc)) for doc in self.docs]    
        
    
    ###### 1.3 dict_rank ######
    def dict_rank(self, n, dictionary, rep_tokens):
        """
        return top N documents given a representation of tokens
        """
        
        if rep_tokens == "doc_term":
            self.document_term_matrix()
            scores = self.doc_term_matrix
        elif rep_tokens == "tf_idf":
            self.tf_idf()
            scores = self.tf_idf
        else: print("dude, what's wrong?")

        weights = [0] * self.N
        
        for i, doc in enumerate(self.docs):
            for j, token in enumerate(self.token_set):
                if token in dictionary:
                    weights[i] += scores[i][j]
                    
        which_max = heapq.nlargest(n, range(len(weights)), weights.__getitem__)             
        self.dict_ranking = [self.docs[i] for i in which_max]
                    

In [3]:
import numpy as np
import codecs
import nltk
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

def parse_text(textraw, regex): # So this is a necessary process to perform in order to convert into our class
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)

text = open('./data/pres_speech/sou_all.txt', 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)

corpus = Corpus(pres_speech_list, './data/stopwords/stopwords.txt', 2)

corpus.tf_idf()
tfidf = corpus.tf_idf

In [4]:
################# WEEK 2 ###################

pres_list = [pair[0].pres for pair in tfidf]
scores = [pair[1] for pair in tfidf]

In [6]:
X = np.array(scores)
P, D, Q = np.linalg.svd(X, full_matrices=False)
n_sv = 40 #reasonable number of singular values
X_hat = np.dot(np.dot(P[:,0:n_sv], np.diag(D[0:n_sv])), Q[0:n_sv,:])

print(np.std(X), np.std(X_hat), np.std(X - X_hat))

(0.6246904024043467, 0.44365755967938536, 0.43980631946061771)


In [19]:
import scipy.spatial as sp

sim_X = 1 - sp.distance.cdist(X, X, 'cosine')
sim_X_hat = 1 - sp.distance.cdist(X_hat, X_hat, 'cosine')

In [10]:
# source: https://en.wikipedia.org/wiki/List_of_Presidents_of_the_United_States_by_political_affiliation

democrats = ["Obama", "Carter", "Clinton", "Johnson", "Kennedy", "Truman", "Roosevelt", 
              "Wilson", "Cleveland", "Jackson", "Buren", "Polk", "Pierce"]

republicans = ["Bush", "Reagan","Taft", "Harding", "Coolidge", "Hoover", 
               "Lincoln","Arthur", "Harrison", "McKinley",  "Grant", "Nixon", "Ford", "Hayes", "Eisenhower"]

rep_ind, dem_ind = [], []

for i in range(len(pres_list)):
    if pres_list[i] in democrats:
        dem_ind.append(i)
    elif pres_list[i] in republicans:
        rep_ind.append(i)


In [33]:
dd_sim = 1 - sp.distance.cdist(X[dem_ind], X[dem_ind], 'cosine')
dr_sim = 1 - sp.distance.cdist(X[dem_ind], X[rep_ind], 'cosine')
rr_sim = 1 - sp.distance.cdist(X[rep_ind], X[rep_ind], 'cosine')

print "#####  TF_IDF Similarities #####"
print "democrat vs democrat:", np.mean(dd_sim)
print "democrat vs republican:", np.mean(dr_sim)
print "republican vs republican:", np.mean(rr_sim)

dd_sim_h = 1 - sp.distance.cdist(X_hat[dem_ind], X_hat[dem_ind], 'cosine')
dr_sim_h = 1 - sp.distance.cdist(X_hat[dem_ind], X_hat[rep_ind], 'cosine')
rr_sim_h = 1 - sp.distance.cdist(X_hat[rep_ind], X_hat[rep_ind], 'cosine')

print "\n"
print "#####  SVD Similarities #####"
print "democrat vs democrat:", np.mean(dd_sim_h)
print "democrat vs republican:", np.mean(dr_sim_h)
print "republican vs republican:", np.mean(rr_sim_h)


#####  TF_IDF Similarities #####
democrat vs democrat: 0.102051234999
democrat vs republican: 0.0892043781676
republican vs republican: 0.107142964485


#####  SVD Similarities #####
democrat vs democrat: 0.378334054824
democrat vs republican: 0.367961605615
republican vs republican: 0.416229294744
