### Exercise 1


In [4]:
import numpy as np
import codecs
import nltk
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

"""
This cell defines the Document class, which holds a single speech.
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""

class Document(object):

    """ The Document class represents a single document (one speech).

    Attributes set by the constructor:
        year, pres: stored exactly as passed in
        text: the speech text, lower-cased
        word_list: cleaned but unstemmed words (built by the word_list method)
        tokens: every token of `text`; it is only cleaned once token_clean,
                stopword_remove and stem are called on the document
    """

    def __init__(self, speech_year, speech_pres, speech_text, stopwords, clean_length):
        """
        description: store the metadata and tokenise the lower-cased text
        input: speech_year, speech_pres: stored as-is
               speech_text: raw text of the speech
               stopwords: iterable of words to exclude from word_list
               clean_length: words of this length or shorter are excluded
                             from word_list
        """
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        # NOTE(review): this call rebinds `self.word_list` from the method to
        # the resulting array, so the method can run only once per instance.
        # The name is kept because `doc.word_list` is the public attribute.
        self.word_list(clean_length, stopwords)
        self.tokens = np.array(wordpunct_tokenize(self.text))

    def word_list(self, clean_length, stopwords):
        """
        description: define the word_list attribute (i.e. without stemming):
        alphabetic tokens longer than 'clean_length' that are not stopwords
        input: clean_length: cut off length (tokens must be strictly longer)
               stopwords: iterable of words to drop
        """
        # A set gives O(1) membership tests instead of scanning the whole
        # stopword array once per token.
        stopword_set = set(stopwords)
        self.word_list = np.array([t for t in wordpunct_tokenize(self.text)
                                   if t.isalpha()
                                   and len(t) > clean_length
                                   and t not in stopword_set])

    def token_clean(self, length):

        """
        description: keep only alphabetic tokens strictly longer than 'length'
        (i.e. strip out non-alpha tokens and tokens of length <= 'length')
        input: length: cut off length
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list (or array) of stopwords
        """

        # Build the lookup set once rather than scanning `stopwords` per token.
        stopword_set = set(stopwords)
        self.tokens = np.array([t for t in self.tokens if t not in stopword_set])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """

        # One stemmer instance is enough; creating one per token is wasted work.
        stemmer = PorterStemmer()
        self.tokens = np.array([stemmer.stem(t) for t in self.tokens])
        


In [6]:
import re
import math
import numpy as np
from collections import Counter

class Corpus(object):

    """
    The Corpus class represents a document collection

    After construction the corpus holds the cleaned documents (`docs`), their
    number (`N`), the stopwords (`stopwords`) and the vocabulary (`token_set`).
    The score matrices are built on demand, in this order:
    generate_document_term_matrix -> generate_idfv -> generate_tf_idf.
    Columns of both matrices follow the order of `list(self.token_set)`.
    """

    def __init__(self, doc_data, stopword_file, clean_length):
        """
        description: build the documents, clean them and create the vocabulary
        input: doc_data: iterable of records; items 0, 1 and 2 of each record
                         are passed to Document as year, president and text
               stopword_file: path of the stopword file (one word per line)
               clean_length: cutoff length used for the stopwords and handed
                             to every Document
        """

        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)

        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2], self.stopwords, clean_length) for doc in doc_data]

        self.N = len(self.docs)
        self.clean_length = clean_length

        #stopword removal, token cleaning and stemming to docs
        # NOTE(review): the cutoff here is hard-coded to 2 and ignores
        # `clean_length` -- confirm whether that is intended.
        self.clean_docs(2)

        #create vocabulary
        self.corpus_tokens()


    def clean_docs(self, length):
        """
        Applies token cleaning, stopword removal and stemming to docs
        (in that order)
        """
        # NOTE(review): self.stopwords are stemmed but are compared with the
        # still-unstemmed tokens here, so a stopword whose stem differs from
        # the word itself is not removed -- confirm whether that is intended.
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()

    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stopwords, keeps only the words strictly
        longer than 'length' and stems them
        input: length: cutoff length for words
               stopword_file: stopwords file to parse (utf-8, one word per line)
        """

        with codecs.open(stopword_file, 'r', 'utf-8') as f:
            raw = f.read()

        # A single stemmer instance is reused for every word.
        stemmer = PorterStemmer()
        self.stopwords = np.array([stemmer.stem(word)
                                   for word in raw.splitlines() if len(word) > length])


    def corpus_tokens(self):
        """
        description: create a set of all tokens or in other words a vocabulary
        """

        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set.update(doc.tokens)

    # Q1 part 1: document_term_matrix - which returns a D by V array of frequency counts.
    def generate_document_term_matrix(self):
        """
        description: create the document_term_matrix attribute, a D by V array
        whose entry (d, v) is the number of times term v occurs in document d
        raises: KeyError if a document holds a token missing from token_set
        """
        terms_list = list(self.token_set)
        # term -> column lookup built once; list.index() inside the loop
        # would rescan the whole vocabulary for every distinct word.
        term_index = dict((term, idx) for idx, term in enumerate(terms_list))
        # initialize the matrix
        document_term_matrix = np.zeros((self.N, len(terms_list)))
        for i, document in enumerate(self.docs):
            if i % 25 == 0:
                print('counting terms for doc: ' + str(i))
            # count the terms for each document
            for word, count in Counter(document.tokens).items():
                # plain indexing: ndarray.itemset() no longer exists in NumPy 2
                document_term_matrix[i, term_index[word]] = count
        # update the doc_term_matrix attribute
        self.document_term_matrix = document_term_matrix

    def generate_idfv(self):
        """
        computes the inverse document frequency of each term v and stores it
        in the idfv attribute, a dict {'term': idf, ...}
        requires generate_document_term_matrix to have been called
        """
        terms_list = list(self.token_set)
        # d_fv: number of documents in which each term occurs at least once
        doc_freq = np.sum(self.document_term_matrix > 0, axis=0)
        # float() forces true division; under Python 2 the integer quotient
        # D/d_fv was floored before taking the logarithm.
        D = float(self.N)
        self.idfv = dict.fromkeys(terms_list, 0)
        for term_idx, v in enumerate(terms_list):
            # idf_{v} = log(D/d_fv)
            self.idfv[v] = math.log10(D / doc_freq[term_idx])

    # Q1 part 2: tf_idf - returns a D by V array of tf-idf scores
    def generate_tf_idf(self):
        """
        computes the tf_idf attribute, a D by V array with entries
        (1 + ln(x_dv)) * idf_v where x_dv > 0 and idf_v > 0, and 0 elsewhere
        requires generate_document_term_matrix and generate_idfv
        """
        terms_list = list(self.token_set)
        # idf values laid out in column order
        idf = np.array([self.idfv[term] for term in terms_list])
        # initialize the matrix
        tf_idf = np.zeros(self.document_term_matrix.shape)
        for doc_i in range(self.N):
            if doc_i % 25 == 0:
                print('counting terms for doc: ' + str(doc_i))
            counts = self.document_term_matrix[doc_i, :]
            # only cells with a positive count and a positive idf get a score
            scored = (counts > 0) & (idf > 0)
            # apply the formula seen in class
            tf_idf[doc_i, scored] = (1 + np.log(counts[scored])) * idf[scored]
        self.tf_idf = tf_idf

    # Q1 part 3: dict_rank
    def dict_rank(self, numb, dictionary, metric):
        """
        returns the top numb documents, ranked on the dictionary terms
        input: numb: number of top documents to return
               dictionary: iterable of terms; they are matched against the
                           vocabulary as-is (presumably they must already be
                           stemmed -- verify against caller); terms missing
                           from the vocabulary are ignored
               metric: 'doc-term' for raw counts, 'tf_idf' (or 'tfidf') for
                       tf-idf scores
        raises: ValueError if metric is not one of the values above
        """
        if metric == 'doc-term':
            scores = self.document_term_matrix
        elif metric in ('tf_idf', 'tfidf'):
            # both spellings are accepted: the notebook text says "tfidf"
            # while the code originally only recognised 'tf_idf'
            scores = self.tf_idf
        else:
            raise ValueError("metric must be 'doc-term' or 'tf_idf', got %r" % (metric,))

        term_index = dict((term, idx) for idx, term in enumerate(self.token_set))
        # a term absent from the vocabulary has a zero score in every
        # document, so dropping it leaves the ordering unchanged
        keys = [scores[:, term_index[item]] for item in dictionary if item in term_index]
        if not keys:
            # no usable term: every document ties
            keys = [np.zeros(self.N)]
        # NOTE(review): lexsort treats the LAST key as the primary one, so
        # documents are ordered by the last dictionary term first and earlier
        # terms only break ties -- confirm this is the intended ranking.
        order = list(np.lexsort(keys))
        order.reverse()
        # return the top numb documents
        return [self.docs[i] for i in order[0:numb]]


In [5]:
def parse_text(textraw, regex):
    """Extract the speech records matched by `regex` from a raw string.

    The pattern is compiled with MULTILINE and DOTALL. Every match becomes a
    list of its captured groups; all newline characters are removed from the
    item at index 2 (the speech text for the pattern used in this notebook).
    The records are returned sorted.
    """
    pattern = re.compile(regex, re.MULTILINE|re.DOTALL)

    records = []
    # findall yields one immutable tuple of captured groups per match
    for groups in pattern.findall(textraw):
        # a list copy is needed so the text field can be rewritten
        fields = list(groups)
        fields[2] = fields[2].replace('\n', '')
        records.append(fields)

    # lists compare item by item, so this orders by the first group first
    records.sort()

    return records

In [7]:
# Read the whole file inside a `with` block so the handle is closed right away.
with open('sou_all.txt', 'r') as speech_file:
    text = speech_file.read()
# Raw string: the pattern is character-for-character the same as before, but
# the regex escapes (\d, \*, \n) no longer depend on Python's string escaping.
# The three groups capture the year, the president's name and the speech text.
regex = r"_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\n{2}.*?)\n{3}"
pres_speech_list = parse_text(text, regex)

In [9]:
# Instantiate the Corpus class (stopword file 'stopwords.txt', clean_length 3)
corpus = Corpus(pres_speech_list, 'stopwords.txt', 3)
# Single-argument print(...) gives the same output under Python 2 and 3.
print(corpus.docs[0].text[0:25])

 fellow-citizens of the s


In [10]:
# Q2 part 1: Use the two methods above to score each document in your data.
import time
t0 = time.time()
corpus.generate_document_term_matrix()
# peek at the first column (no effect unless it is the last line of a cell)
corpus.document_term_matrix[:,0]
t1 = time.time()
print('time spent computing document_term_matrix: ' + str(t1 - t0))

corpus.generate_idfv()
print(len(corpus.idfv)) # == len(corpus.token_set)
# list(...) is needed because dict.keys() cannot be indexed under Python 3
corpus.idfv[list(corpus.idfv)[9]]

corpus.generate_tf_idf()
print(corpus.tf_idf.shape)

counting terms for doc: 0
time spent computing document_term_matrix: 1.49046897888
2746
counting terms for doc: 0
(20, 2746)


In [23]:
import scipy
# `import scipy` alone does not guarantee the submodule is loaded.
import scipy.spatial.distance

# Pairwise cosine SIMILARITY between the tf-idf rows of the documents.
cosine_matrix = np.zeros((corpus.N, corpus.N))
print(cosine_matrix.shape)

print(corpus.N)
for i in range(corpus.N):
    # a document is identical to itself
    cosine_matrix[i, i] = 1
    for j in range(i + 1, corpus.N):
        # scipy returns the cosine DISTANCE (1 - similarity); convert it so
        # the off-diagonal entries are on the same scale as the diagonal.
        similarity = 1 - scipy.spatial.distance.cosine(corpus.tf_idf[i, :], corpus.tf_idf[j, :])
        # the measure is symmetric, so fill both triangles at once
        cosine_matrix[i, j] = similarity
        cosine_matrix[j, i] = similarity

(20, 20)
20


True