In [27]:
import numpy as np
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

"""
This is a class sherlock. 
Notice how it is defined with the keyword `class` and a name that begins with a capital letter
"""
class Document():
    
    """ The Doc class rpresents a class of individul documents
    
    """
    
    def __init__(self, speech_year, speech_pres, speech_text):
        """
        The __init__ method is called everytime an object is instantiated.
        This is where you will define all the properties of the object that it must have
        when it is `born`.
        """
        
        #These are data members
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))
        
        
        
    def token_clean(self,length):

        """ 
        description: strip out non-alpha tokens and tokens of length > 'length'
        input: length: cut off length 
        """

        self.tokens = np.array([t for t in self.tokens if (t.isalpha() and len(t) > length)])


    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list of stopwords
        """

        
        self.tokens = np.array([t for t in self.tokens if t not in stopwords])


    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """
        
        self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
        
    def demo_self():
        print 'this will error out'

In [28]:
import numpy as np
import codecs
import nltk
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
import math
import heapq


class Corpus():
    
    """ 
    The Corpus class represents a document collection
     
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked everytime an object of the class
        is instantiated
        """
        

        #Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data] 
        
        self.N = len(self.docs)
        self.clean_length = clean_length
        
        #get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)
        
        #stopword removal, token cleaning and stemming to docs
        self.clean_docs(2)
        
        #create vocabulary
        self.corpus_tokens()
        
        #create document_term_matrix
        #self.document_term_matrix()
        
        #create tf_idf scores
        #self.tf_idf()
        
        #create dict_rank
        #self.dict_rank(rep_tokens)
        
    def clean_docs(self, length):
        """ 
        Applies stopword removal, token cleaning and stemming to docs
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stopword_remove(self.stopwords)
            doc.stem()        
    
    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stowords, removes words of length 'length' and 
        stems it
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """
        
        with codecs.open(stopword_file,'r','utf-8') as f: raw = f.read()
        
        self.stopwords = (np.array([PorterStemmer().stem(word) 
                                    for word in list(raw.splitlines()) if len(word) > length]))
        
     
    def corpus_tokens(self):
        """
        description: create a set of all all tokens or in other words a vocabulary
        """
        
        #initialise an empty set
        self.token_set = set()
        for doc in self.docs:
            self.token_set = self.token_set.union(doc.tokens) 
        
        
    ###### 1.1 doc_term ######   
    def document_term_matrix(self):
        """
        description: create a D by V array of frequency counts.
        """
        def term_count(doc):
            
            #initialize an array with size V
            term_count = [0]*len(self.token_set)
            for token in doc.tokens:
                if token in self.token_set:
                    term_count[list(self.token_set).index(token)] += 1 
            return term_count
                   
        self.doc_term_matrix = [term_count(doc) for doc in self.docs]
        
        
   
    ###### 1.2 tf_idf ######
    
    def tf_idf(self):
        """
        description: create a D by V array of tf-idf scores
        """
        
        
        def tf(doc):
            #initialize an array of size V
            term_count = [0]*len(self.token_set)
            for token in doc.tokens:
                if token in self.token_set:
                    term_count[list(self.token_set).index(token)] += 1 
            return term_count
        
        idf = [0]*len(self.token_set)
        for token in self.token_set:
            freq = 0
            for doc in self.docs:
                freq += (token in doc.tokens)
            idf[list(self.token_set).index(token)] = math.log(self.N/freq)
            
        def get_score(doc):
            t_f = tf(doc)
        
            for i,term in enumerate(t_f):
                if term != 0:
                    t_f[i] = (1 + math.log(term)) * idf[i]
            return t_f
            
        self.tf_idf = [(doc, get_score(doc)) for doc in self.docs]    
        
    
    ###### 1.3 dict_rank ######
    def dict_rank(self, n, dictionary, rep_tokens):
        """
        return top N documents given a representation of tokens
        """
        
        if rep_tokens == "doc_term":
            self.document_term_matrix()
            scores = self.doc_term_matrix
        elif rep_tokens == "tf_idf":
            self.tf_idf()
            scores = self.tf_idf
        else: print("dude, what's wrong?")

        weights = [0] * self.N
        
        for i, doc in enumerate(self.docs):
            for j, token in enumerate(self.token_set):
                if token in dictionary:
                    weights[i] += scores[i][j]
                    
        which_max = heapq.nlargest(n, range(len(weights)), weights.__getitem__)             
        self.dict_ranking = [self.docs[i] for i in which_max]
                    

In [34]:
import numpy as np
import codecs
import nltk
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

def parse_text(textraw, regex): # So this is a necessary process to perform in order to convert into our class
    """takes raw string and performs two operations
    1. Breaks text into a list of speech, president and speech
    2. breaks speech into paragraphs
    """
    prs_yr_spch_reg = re.compile(regex, re.MULTILINE|re.DOTALL)
    
    #Each tuple contains the year, last ane of the president and the speech text
    prs_yr_spch = prs_yr_spch_reg.findall(textraw)
    
    #convert immutabe tuple to mutable list
    prs_yr_spch = [list(tup) for tup in prs_yr_spch]
    
    for i in range(len(prs_yr_spch)):
        prs_yr_spch[i][2] = prs_yr_spch[i][2].replace('\n', '')
    
    #sort
    prs_yr_spch.sort()
    
    return(prs_yr_spch)

text = open('./data/pres_speech/sou_all.txt', 'r').read()
regex = "_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\\n{2}.*?)\\n{3}"
pres_speech_list = parse_text(text, regex)[1:10]

corpus = Corpus(pres_speech_list, './data/stopwords/stopwords.txt', 2)

corpus.tf_idf()
tfidf = corpus.tf_idf

In [54]:
################# WEEK 2 ###################

pres_list = [pair[0].pres for pair in tfidf]
scores = [pair[1] for pair in tfidf]


In [55]:
X = np.array(scores)
P, D, Q = np.linalg.svd(X, full_matrices=False)
X_hat = np.dot(np.dot(P, np.diag(D)), Q)

# we need to do the dot product for a "reasonable" number of sv's

print(np.std(X), np.std(X_hat), np.std(X - X_hat))

(0.64433277201665784, 0.6443327720166564, 1.1708700837467513e-15)


In [56]:
def cosine_sim(doc1, doc2):
    return np.dot(doc1, doc2) / (math.sqrt(np.dot(doc1, doc1)) * math.sqrt(np.dot(doc2, doc2)))

def cos_sim_map(X):
    return map(lambda doc1: map(lambda doc2: cosine_sim(doc1, doc2), X), X)

sim_X = cos_sim_map(X)
sim_X_hat = cos_sim_map(X_hat)

In [97]:
sim_X

[[1.0,
  0.11761926595678301,
  0.085658410521250658,
  0.035057229723018953,
  0.058110020179341554,
  0.053599987817413153,
  0.068531075901293551,
  0.020930661939335817,
  0.047803809189043367],
 [0.11761926595678301,
  1.0,
  0.11278344021576492,
  0.083696340130119257,
  0.090660185946278463,
  0.10384023999128583,
  0.080855728913674207,
  0.074985617591664916,
  0.063442749735928713],
 [0.085658410521250658,
  0.11278344021576492,
  1.0000000000000002,
  0.09933097933469659,
  0.091472580073926585,
  0.070834577212945296,
  0.06506419910863484,
  0.050515651088784905,
  0.070498535579454527],
 [0.035057229723018953,
  0.083696340130119257,
  0.09933097933469659,
  0.99999999999999989,
  0.1293534593913879,
  0.063965286106769284,
  0.092040179590605231,
  0.0789226761113965,
  0.070451421053688151],
 [0.058110020179341554,
  0.090660185946278463,
  0.091472580073926585,
  0.1293534593913879,
  1.0000000000000002,
  0.11364819094031955,
  0.090941755263055812,
  0.05029623195874

In [98]:
sim_X_hat

[[0.99999999999999978,
  0.11761926595678299,
  0.085658410521250714,
  0.035057229723019029,
  0.058110020179341547,
  0.053599987817413403,
  0.068531075901293453,
  0.020930661939335942,
  0.047803809189043617],
 [0.11761926595678299,
  0.99999999999999989,
  0.11278344021576388,
  0.083696340130120214,
  0.090660185946277214,
  0.10384023999128687,
  0.080855728913673264,
  0.074985617591664652,
  0.063442749735928297],
 [0.085658410521250714,
  0.11278344021576388,
  0.99999999999999989,
  0.09933097933469634,
  0.091472580073926474,
  0.070834577212945893,
  0.065064199108635243,
  0.050515651088785293,
  0.070498535579454957],
 [0.035057229723019029,
  0.083696340130120214,
  0.09933097933469634,
  1.0,
  0.1293534593913874,
  0.063965286106770963,
  0.092040179590604509,
  0.078922676111396972,
  0.070451421053688013],
 [0.058110020179341547,
  0.090660185946277214,
  0.091472580073926474,
  0.1293534593913874,
  0.99999999999999989,
  0.11364819094031869,
  0.09094175526305657