### Exercise 1


In [228]:
import numpy as np
import codecs
import nltk
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

"""
This is a class sherlock. 
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""

class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, speech_text):
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
        
        
        
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
        


In [408]:
import re
import math
import numpy as np

class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    # I added in the initialization the dictionary with respect to which you want to compute the ranking 
    # (and numb for the number of top documents you want to consider), and method == "tdidf" or "doc-term"
    def __init__(self, doc_data, stopword_file, clean_length, dictionary, numb, method):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        

        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
        #create the idfv
        self.idv(dictionary)
        
        #create document_term_matrix
        self.document_term_matrix(dictionary)
        
        #create dictionary ranking
        self.dict_rank(numb,dictionary, method)
        
       
        #create the tf-idf score
        #self.tdidf(self.token_set)
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length < 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 

            
            
            
    # exercise 1
    def idv(self, dictionary):
        """
        computes the frequency of each term v
        """
        self.idfv = []
        for v in list(dictionary):
            dfv = 0
            for doc in self.docs:
                if v in doc.tokens:
                    dfv = dfv + 1
            if dfv == 0:
                self.idfv.append(1000)
            else:
                c = self.N/dfv
                self.idfv.append(math.log10(c)) 
                

    def document_term_matrix(self, dictionary):
        """
        description: create the document_term_matrix
        """
        self.document_term_mat = []
        self.tfdv = []
        self.tfidf = []
        for doc in self.docs:
            new = []
            tf = []
            # count the frequency of each word in the vocabulary for each document
            for v in list(dictionary):
                temp = list(doc.tokens).count(v)
                new.append(temp)
                if temp == 0:
                    tf.append(temp)
                else:
                    tf.append(1+ math.log10(temp))
            # update both document term matrix and tf_d,v matrix        
            self.document_term_mat.append(new)
            self.tfdv.append(tf)
            self.tfidf.append([a*b for a,b in zip(self.tfdv[-1], self.idfv)])
    
            
    def dict_rank(self, numb, dictionary, method_matrix):
        """
        computes the dictionary rank of the top numb documents, based on the dictionary and method_matrix 
        (either doc-term or tdidf)
        """
        # compute the score 
        self.rank = []
        if method_matrix == "doc-term":
            for i in range(self.N):
                self.rank.append(sum(self.document_term_mat[i]))
        elif method_matrix == "tdidf":
            for i in range(self.N):
                self.rank.append(sum(self.tfidf[i]))
        # this tells the order of the ranking of the top numb documents        
        self.rank = np.argsort(self.rank) + 1     
        self.rank = self.rank[:(numb)]

In [409]:
def parse_text(textraw, regex):
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)

In [433]:
text = open('sou_all.txt', 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)

In [440]:
#Instantite the corpus class
#self, doc_data, stopword_file, clean_length, dictionary, numb, method
#corpus = Corpus(pres_speech_list, 'stopwords.txt', 2, corpus.token_set, 3, "tdidf")

dictionary = ["positive", "joy", "optimism", "god"]
corpus = Corpus(pres_speech_list, 'stopwords.txt', 3, dictionary, 10, "tdidf")
print corpus.docs[0].text[0:25]

 fellow-citizens of the s


In [441]:
# DEMO
b = corpus.rank
b
#corpus.docs[122].pres

array([  1, 107, 108, 109, 110, 114, 115, 116, 117, 118])

### Exercise 2

In [442]:
# Read two texts with positive and negative words
text2 = open('positive.txt', 'r').read()
dictionary_pos = text2.splitlines()
dictionary_pos = filter(None, dictionary_pos) # define the positive words dictionary
text3 = open('negative.txt', 'r').read()
dictionary_neg = text3.splitlines() # define the negative words dictionary

In [None]:
corpus1 = Corpus(pres_speech_list, 'stopwords.txt', 3, dictionary_pos, 10, "tdidf")
pos_tdidf1 = corpus1.rank
pos_tdidf1

In [None]:
corpus2 = Corpus(pres_speech_list, 'stopwords.txt', 3, dictionary_pos, 10, "doc-term")
pos_doc1 = corpus2.rank
pos_doc1

In [None]:
corpus3 = Corpus(pres_speech_list, 'stopwords.txt', 3, dictionary_neg, 10, "tdidf")
pos_tdidf2 = corpus3.rank
pos_tdidf2

In [None]:
corpus4 = Corpus(pres_speech_list, 'stopwords.txt', 3, dictionary_neg, 10, "doc-term")
pos_doc2 = corpus4.rank
pos_doc2

#### Comments
bla bla bla

### Exercise 3

In [436]:
# To create the dictionary
afinn = dict(map(lambda (k,v): (k,int(v)), 
                     [ line.split('\t') for line in open("AFINN-111.txt") ]))

### Exercise 4