### Exercise 1


In [31]:
import numpy as np
import codecs
import nltk
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

"""
This is a class sherlock. 
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""

class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, party, speech_text, stopwords, clean_length):
        self.year = speech_year
        self.pres = speech_pres
        self.party = party
        self.text = speech_text.lower()
        self.word_list(clean_length, stopwords)
        self.tokens = np.array(wordpunct_tokenize(self.text))
    
    def word_list(self, clean_length, stopwords):
        """
        description: define the word_list attribute (i.e. without stemming)
        """
        self.word_list = np.array(wordpunct_tokenize(self.text))
        self.word_list = np.array([t for t in self.word_list if (t.isalpha() and len(t) > clean_length)])        
        self.word_list = np.array([t for t in self.word_list if t not in stopwords])

    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
        


In [57]:
import re
import math
import numpy as np
from collections import Counter
from collections import defaultdict

class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    # We added in the initialization the dictionary with respect to which you want to compute the ranking: 
    # numb is for the number of top documents you want to consider, while metric == "tfidf" or "doc-term"
    def __init__(self, doc_data, stopword_file, clean_length, pres_parties):
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
                
        #Initialise documents by invoking the appropriate class
        self.docs = []
        for doc in doc_data:
            pres_last_name = doc[1]
            party = pres_parties[pres_last_name] if pres_last_name in pres_parties.keys() else None
            new_doc = Document(doc[0], pres_last_name, party, doc[2], self.stopwords, clean_length)
            self.docs.append(new_doc)
            
        # sort docs by political party
        groups = defaultdict(list)
        for obj in self.docs:
            groups[obj.party].append(obj)
        new_list = groups.values()
        self.docs = [item for sublist in new_list for item in sublist]
                
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length < 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 

    # Q1 part 1: document_term_matrix - which returns a D by V array of frequency counts.            
    def generate_document_term_matrix(self):
        """
        description: create the document_term_matrix
        """
        dimD = self.N
        # total number of columns
        dimV = len(self.token_set)
        terms_list = list(self.token_set)
        # initialize the matrix
        document_term_matrix = np.zeros((dimD, dimV))        
        for i in range(dimD):
            # count the terms for each document
            document = self.docs[i]
            if i%25==0: print 'counting terms for doc: ' + str(i)
            word_counts = Counter(document.tokens)
            for word_count_pair in word_counts.most_common():
                # split in word and count
                word = word_count_pair[0]
                count = word_count_pair[1]
                # save the term index
                term_idx = terms_list.index(word)
                doc_term_tuple = (i, term_idx)
                document_term_matrix.itemset(doc_term_tuple, count)
        # update the doc_term_matrix attribute        
        self.document_term_matrix = document_term_matrix

    def generate_idfv(self):
        """
        computes the inverse document frequency of each term v
        """
        D = self.N
        # idf_{v} = log(D/d_fv)
        terms_list = list(self.token_set)
        self.idfv = dict.fromkeys(terms_list, 0)
        # creates a hash {'term':0,'term2':0,...}
        for v in self.token_set:
            term_idx = terms_list.index(v)
            d_fv = np.sum(self.document_term_matrix[:,term_idx] > 0)
            # apply the formula
            c = math.log10(D/d_fv)
            self.idfv[v] = c

    # Q1 part 2: tf_idf - returns a D by V array of tf-idf scores            
    def generate_tf_idf(self):
        D = self.N
        terms_list = list(self.token_set)
        # initialize the matrix
        tf_idf = np.zeros(self.document_term_matrix.shape)
        for doc_i in range(D):
            if doc_i%25==0: print 'counting terms for doc: ' + str(doc_i)
            for v_i in range(len(terms_list)):
                doc_term_tuple = (doc_i, v_i)
                xdv_score = self.document_term_matrix.item(doc_term_tuple)
                if xdv_score > 0:
                    idf_score = self.idfv[terms_list[v_i]]
                    if idf_score > 0:
                        # apply the formula seen in class
                        tf_idf_score = (1 + np.log(xdv_score))*idf_score
                        tf_idf.itemset(doc_term_tuple, tf_idf_score)
        self.tf_idf = tf_idf

    # Q1 part 3: dict_rank
    def dict_rank(self, numb, dictionary, metric):
        """
        computes the dictionary rank of the top numb documents, based on the dictionary and metric method 
        (either doc-term or tfidf)
        """
        terms_list = list(self.token_set)
        # get the indices of occurences of the terms in the dictionary
        idcs = [terms_list.index(item) for item in dictionary]
        order = []
        if metric == 'doc-term':
            # get the needed columns
            cols = tuple([list(self.document_term_matrix[:,i]) for i in idcs])
            # sort and update order list
            order = list(np.lexsort(cols))
            order.reverse()
        elif metric == 'tf_idf':
            cols = tuple([list(self.tf_idf[:,i]) for i in idcs])
            order = list(np.lexsort(cols))
            order.reverse()
        # return the top numb documents    
        return [self.docs[i] for i in order[0:numb]]


In [3]:
def parse_text(textraw, regex):
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)

In [4]:
text = open('../Week1HW/sou_all.txt', 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)

In [61]:
from numpy import genfromtxt
pres_metadata = genfromtxt('pres_metadata.tsv', delimiter='\t', names = True, dtype= None)

pres_parties = {}
for i in range(len(pres_metadata)):
    last_name = pres_metadata[i]['Last']
    if not last_name in pres_parties.keys():
        pres_parties[last_name] = pres_metadata[i]['Party']

print pres_parties
print len(pres_parties.keys())

{'Wilson': 'Democratic', 'Jackson': 'Democratic', 'Buren': 'Democratic', 'Reagan': 'Republican', 'Pierce': 'Democratic', 'Bush': 'Republican', 'Tyler': 'Whig', 'Coolidge': 'Republican', 'Hoover': 'Republican', 'Harding': 'Republican', 'Grant': 'Republican', 'Ford': 'Republican', 'Eisenhower': 'Republican', 'Obama': 'Democratic', 'Lincoln': 'Republican/National Union', 'Adams': 'Federalist', 'Johnson': 'Democratic/National Union', 'Kennedy': 'Democratic', 'Roosevelt': 'Republican', 'Hayes': 'Republican', 'Arthur': 'Republican', 'Taft': 'Republican', 'Henry': 'Whig', 'Clinton': 'Democratic', 'Nixon': 'Republican', 'Madison': 'Democratic-Republican', 'Taylor': 'Whig', 'Fillmore': 'Whig', 'Carter': 'Democratic', 'Buchanan': 'Democratic', 'Washington': 'Independent', 'Garfield': 'Republican', 'Jefferson': 'Democratic-Republican', 'Harrison': 'Republican', 'Cleveland': 'Democratic', 'McKinley': 'Republican', 'Truman': 'Democratic', 'Polk': 'Democratic', 'Monroe': 'Democratic-Republican'}
39


In [64]:
#Instantite the corpus class
corpus = Corpus(pres_speech_list, '../Week1HW/stopwords.txt', 3, pres_parties)
print corpus.docs[0].text[0:25]

# CHECK OUR WORK
# for i in range(len(corpus.docs)):
#     print corpus.docs[i].pres + ' is affliated with the ' + str(corpus.docs[i].party) + ' party.'

 fellow-citizens of the s


In [65]:
# Q2 part 1: Use the two methods above to score each document in your data.
import time
t0 = time.time()
corpus.generate_document_term_matrix()
corpus.document_term_matrix[:,0]
t1 = time.time()
print 'time spent computing document_term_matrix: ' + str(t1 - t0)

corpus.generate_idfv()
print len(corpus.idfv) # == len(corpus.token_set)
corpus.idfv[corpus.idfv.keys()[9]]

corpus.generate_tf_idf()
print corpus.tf_idf.shape

counting terms for doc: 0
counting terms for doc: 25
counting terms for doc: 50
counting terms for doc: 75
counting terms for doc: 100
counting terms for doc: 125
counting terms for doc: 150
counting terms for doc: 175
counting terms for doc: 200
counting terms for doc: 225
time spent computing document_term_matrix: 207.569051981
13588
counting terms for doc: 0
counting terms for doc: 25
counting terms for doc: 50
counting terms for doc: 75
counting terms for doc: 100
counting terms for doc: 125
counting terms for doc: 150
counting terms for doc: 175
counting terms for doc: 200
counting terms for doc: 225
(236, 13588)


In [96]:
import scipy

def gen_cosine_matrix(corpus, doc_term_matrix):
    cosine_matrix = np.zeros((corpus.N, corpus.N))
    for i in range(corpus.N):
        for j in range(corpus.N):
            if i == j:
                cosine_matrix[i,j] = 1
            elif j < i:
                cosine_matrix[i,j] = cosine_matrix[j,i]
            else:
                cosine_matrix[i,j] = 1-scipy.spatial.distance.cosine(doc_term_matrix[i,:], doc_term_matrix[j,:])
    return cosine_matrix

cosine_matrix = gen_cosine_matrix(corpus, corpus.tf_idf)
# test
print cosine_matrix.item((1,2)) == cosine_matrix.item((2,1))
print cosine_matrix.item((2,2)) == 1
print cosine_matrix.item((2,1)) != cosine_matrix.item((2,3))

print cosine_matrix[0,:]

True
True
True
[ 1.          0.1233673   0.18749003  0.14247549  0.15269546  0.09685395
  0.14338882  0.06745857  0.09074177  0.08064543  0.07242745  0.09601532
  0.06775577  0.05627785  0.06394946  0.10218228  0.05368801  0.07093068
  0.11474534  0.09960175  0.09058679  0.09385505  0.09817714  0.07407106
  0.0764705   0.05144827  0.05447246  0.05345622  0.05936839  0.07613458
  0.05887399  0.09685397  0.10188679  0.06026158  0.12372114  0.08063384
  0.09591824  0.06774948  0.04540258  0.06054393  0.06423356  0.04044441
  0.06296118  0.06406253  0.0229444   0.03158374  0.03619887  0.02405588
  0.02494243  0.01328416  0.07125239  0.06149412  0.082294    0.08113806
  0.04858756  0.08631996  0.08308064  0.07267689  0.05697599  0.05620938
  0.05867019  0.05545621  0.04412697  0.06630099  0.05566819  0.05044021
  0.07832019  0.06203478  0.07824163  0.06831105  0.06636182  0.04313458
  0.0470978   0.05258295  0.05894993  0.04512392  0.04886972  0.04499137
  0.04542274  0.05776664  0.05979084

In [108]:
import plotly.plotly as py
import plotly.graph_objs as go
py.sign_in('aimeeb', '***')
data = [
    go.Heatmap(
        z=cosine_matrix
    )
]
plot_url = py.plot(data, filename='basic-sou-heatmap')

In [118]:
from IPython.core.display import display, HTML
display(HTML('<div><a href="https://plot.ly/~aimeeb/4/" target="_blank" title="" style="display: block; text-align: center;"><img src="https://plot.ly/~aimeeb/4.png" alt="" style="max-width: 100%;width: 600px;"  width="600" onerror="this.onerror=null;this.src=\'https://plot.ly/404.png\';" /></a><script data-plotly="aimeeb:4"  src="https://plot.ly/embed.js" async></script></div>'))

In [74]:
#svd_sou = np.linalg.svd(corpus.tf_idf)
a = svd_sou[0]
s = svd_sou[1]
b = svd_sou[2]

print a.shape
print np.diag(s).shape
print b.T.shape

(236, 236)
(236, 236)
(13588, 13588)


In [90]:
# Use svd
p = 0.7
sum_of_all_eigens = np.sum(s)
k = 0
for i in range(len(s)):
    res = np.sum(s[0:i])/sum_of_all_eigens
    if res > p:
        k = i
        break
        
print 'least eigens required: ' + str(k)
tf_idf_pca = np.dot(np.dot(a[:,0:k], np.diag(s[0:k])), b.T[0:k,:])

print tf_idf_pca.shape

least eigens required: 129
(236, 13588)


In [104]:
cosine_similarity_pca = gen_cosine_matrix(corpus, tf_idf_pca)
cosine_similarity_pca[0,:]

array([ 1.        ,  0.79340641,  0.80664352,  0.79139459,  0.82479415,
        0.73520092,  0.84537699,  0.79806626,  0.74929426,  0.73928675,
        0.75063101,  0.75335834,  0.72889676,  0.70036866,  0.76342647,
        0.78453582,  0.76789793,  0.73918473,  0.75268488,  0.72559368,
        0.73405246,  0.6762715 ,  0.80700697,  0.74008296,  0.64674075,
        0.56623183,  0.62150008,  0.61402679,  0.53434132,  0.63229825,
        0.56780259,  0.5991937 ,  0.72195463,  0.71136501,  0.74318345,
        0.77931075,  0.43790163,  0.42123541,  0.34678048,  0.174538  ,
        0.28017565,  0.43519217,  0.24312688,  0.25066939,  0.13837608,
        0.25970026,  0.20192327,  0.15869352,  0.16777973,  0.16485001,
        0.26827476,  0.28999707,  0.46450658,  0.34189316,  0.19094515,
        0.4479368 ,  0.35950945,  0.30673148,  0.46142122,  0.22582278,
        0.40947371,  0.22477763,  0.19357067,  0.2381464 ,  0.2126672 ,
        0.31159697,  0.40365334,  0.41443143,  0.32278155,  0.28

In [107]:
import plotly.plotly as py
import plotly.graph_objs as go

py.sign_in('aimeeb', '***')
data = [
    go.Heatmap(
        z=cosine_similarity_pca
    )
]
plot_url = py.plot(data, filename='pca-heatmap')

Plots urls:
* [basic cosine](https://plot.ly/~aimeeb/4)
* [pca cosine](https://plot.ly/~aimeeb/2)

In [113]:
from IPython.core.display import display, HTML
display(HTML('<div><a href="https://plot.ly/~aimeeb/2/" target="_blank" title="" style="display: block; text-align: center;"><img src="https://plot.ly/~aimeeb/2.png" alt="" style="max-width: 100%;width: 600px;"  width="600" onerror="this.onerror=null;this.src=\'https://plot.ly/404.png\';" /></a><script data-plotly="aimeeb:2"  src="https://plot.ly/embed.js" async></script></div>'))