### Exercise 1


In [1]:
import numpy as np
import codecs
import nltk
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

"""
This is a class sherlock. 
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""

class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, speech_text):
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
    
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
        


In [2]:
import re
import math
import numpy as np
from collections import Counter

class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    # I added in the initialization the dictionary with respect to which you want to compute the ranking 
    # (and numb for the number of top documents you want to consider), and method == "tdidf" or "doc-term"
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        
        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length < 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 

                
    def generate_document_term_matrix(self):
        """
        description: create the document_term_matrix
        """
        dimD = self.N
        dimV = len(self.token_set)
        terms_list = list(self.token_set)
        document_term_matrix = np.zeros((dimD, dimV))        
        for i in range(dimD):
            document = self.docs[i]
            if i%25==0: print 'counting terms for doc: ' + str(i)
            word_counts = Counter(document.tokens)
            for word_count_pair in word_counts.most_common():
                word = word_count_pair[0]
                count = word_count_pair[1]
                term_idx = terms_list.index(word)
                doc_term_tuple = (i, term_idx)
                document_term_matrix.itemset(doc_term_tuple, count)
        self.document_term_matrix = document_term_matrix

    # exercise 1
    def generate_idfv(self):
        """
        computes the inverse document frequency of each term v
        """
        D = self.N
        # idf_{v} = log(D/d_fv)
        terms_list = list(self.token_set)
        self.idfv = dict.fromkeys(terms_list, 0)
        # creates a hash {'term':0,'term2':0,...}
        for v in self.token_set:
            term_idx = terms_list.index(v)
            d_fv = np.sum(self.document_term_matrix[:,term_idx] > 0)
            c = math.log10(D/d_fv)
            self.idfv[v] = c

    def generate_tf_idf(self):
        D = self.N
        terms_list = list(self.token_set)
        tf_idf = np.zeros(self.document_term_matrix.shape)
        for doc_i in range(D):
            if doc_i%25==0: print 'counting terms for doc: ' + str(doc_i)
            for v_i in range(len(terms_list)):
                doc_term_tuple = (doc_i, v_i)
                xdv_score = self.document_term_matrix.item(doc_term_tuple)
                if xdv_score > 0:
                    idf_score = self.idfv[terms_list[v_i]]
                    if idf_score > 0:
                        tf_idf_score = 1 + np.log(xdv_score*idf_score)
                        tf_idf.itemset(doc_term_tuple, tf_idf_score)
        self.tf_idf = tf_idf

    # FIXME: return top n documents
    def dict_rank(self, numb, dictionary, metric):
        """
        computes the dictionary rank of the top numb documents, based on the dictionary and method_matrix 
        (either doc-term or tdidf)
        """
        terms_list = list(self.token_set)
        idcs = [terms_list.index(item) for item in dictionary]
        order = []
        if metric == 'doc-term':
            cols = tuple([list(self.document_term_matrix[:,i]) for i in idcs])
            order = list(np.lexsort(cols))
            order.reverse()
        elif metric == 'tf_idf':
            cols = tuple([list(self.tf_idf[:,i]) for i in idcs])
            order = list(np.lexsort(cols))
            order.reverse()
        return order

In [3]:
def parse_text(textraw, regex):
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)

In [4]:
text = open('sou_all.txt', 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)

In [5]:
#Instantite the corpus class
# REVERT: only a subset for faster iterations
corpus = Corpus(pres_speech_list, 'stopwords.txt', 3)
print corpus.docs[0].text[0:25]

 fellow-citizens of the s


In [6]:
# DEMO
corpus.generate_document_term_matrix()
corpus.document_term_matrix[:,0]

corpus.generate_idfv()
print len(corpus.idfv) # == len(corpus.token_set)
corpus.idfv[corpus.idfv.keys()[9]]

corpus.generate_tf_idf()
print corpus.tf_idf.shape

counting terms for doc: 0
counting terms for doc: 25
counting terms for doc: 50
counting terms for doc: 75
counting terms for doc: 100
counting terms for doc: 125
counting terms for doc: 150
counting terms for doc: 175
counting terms for doc: 200
counting terms for doc: 225
13588
counting terms for doc: 0
counting terms for doc: 25
counting terms for doc: 50
counting terms for doc: 75
counting terms for doc: 100
counting terms for doc: 125
counting terms for doc: 150
counting terms for doc: 175
counting terms for doc: 200
counting terms for doc: 225
(236, 13588)


In [7]:
dictionary = ["all","children","forget"]
print corpus.dict_rank(10, dictionary, 'doc-term')[0:10]
print corpus.dict_rank(10, dictionary, 'tf_idf')[0:10]

[155, 201, 227, 73, 163, 215, 102, 107, 172, 45]
[155, 227, 201, 163, 73, 215, 172, 153, 129, 107]


### Exercise 2

In [8]:
# Read two texts with positive and negative words
text2 = open('positive.txt', 'r').read()
dictionary_pos = text2.splitlines()
dictionary_pos = filter(None, dictionary_pos) # define the positive words dictionary
text3 = open('negative.txt', 'r').read()
dictionary_neg = text3.splitlines()
dictionary_neg = filter(None, dictionary_neg) # define the negative words dictionary
dictionary_neg = [re.sub(r'\W+', '', string) for string in dictionary_neg]


In [9]:
pos_tokens = set(dictionary_pos).intersection(corpus.token_set)
tdidf_pos = corpus.dict_rank(10, pos_tokens, "tf_idf")
docterm_pos = corpus.dict_rank(10, pos_tokens, "doc-term")

neg_tokens = set(dictionary_neg).intersection(corpus.token_set)
tdidf_neg = corpus.dict_rank(10, neg_tokens, "tf_idf")
docterm_neg = corpus.dict_rank(10, neg_tokens, "doc-term")


In [11]:
# 100 most positive speeches by td_idf score
most_positive_speeches = [pres_speech_list[i][1] for i in tdidf_pos[0:100]]
counts = Counter(most_positive_speeches)
print counts

print ''
most_positive_speeches = [pres_speech_list[i][1] for i in docterm_pos[0:100]]
counts = Counter(most_positive_speeches)
print counts

Counter({'Roosevelt': 11, 'Nixon': 7, 'Clinton': 7, 'Eisenhower': 6, 'Obama': 6, 'Wilson': 5, 'Bush': 4, 'Taft': 4, 'Carter': 4, 'Cleveland': 4, 'Reagan': 3, 'Lincoln': 3, 'Johnson': 3, 'Fillmore': 3, 'Madison': 3, 'Jackson': 2, 'Buren': 2, 'Coolidge': 2, 'Kennedy': 2, 'Hayes': 2, 'Arthur': 2, 'Harrison': 2, 'McKinley': 2, 'Polk': 2, 'Tyler': 1, 'Grant': 1, 'Ford': 1, 'Pierce': 1, 'Adams': 1, 'Buchanan': 1, 'Washington': 1, 'Truman': 1, 'Monroe': 1})

Counter({'Roosevelt': 9, 'Clinton': 7, 'Bush': 6, 'Obama': 6, 'Johnson': 6, 'Cleveland': 6, 'Jackson': 5, 'Eisenhower': 5, 'Hayes': 4, 'Carter': 4, 'Buren': 3, 'Reagan': 3, 'Grant': 3, 'Taft': 3, 'Polk': 3, 'Coolidge': 2, 'Pierce': 2, 'Adams': 2, 'Fillmore': 2, 'Nixon': 2, 'Madison': 2, 'Washington': 2, 'Jefferson': 2, 'McKinley': 2, 'Truman': 2, 'Tyler': 1, 'Ford': 1, 'Kennedy': 1, 'Arthur': 1, 'Buchanan': 1, 'Harrison': 1, 'Monroe': 1})


#### Comments



### Exercise 3

In [29]:
# To create the dictionary
def valence_scores():
    afinn = dict(map(lambda (k,v): (k,int(v)), 
                         [ line.split('\t') for line in open("AFINN-111.txt") ]))
    return afinn

scores = valence_scores()
print [(k, scores[k]) for k in scores.keys()[0:10]]

[('limited', -1), ('suicidal', -2), ('pardon', 2), ('desirable', 2), ('protest', -2), ('lurking', -1), ('controversial', -2), ('hating', -3), ('ridiculous', -3), ('hate', -3)]


### Exercise 4