In [None]:
import numpy as np
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer

"""
This is a class.
Notice how it is defined with the keyword `class` and a name that begins with a capital letter.
"""
class Document():

    """
    The Document class represents a single document (one speech).

    Data members:
        year:   the value passed as `speech_year`
        pres:   the value passed as `speech_pres`
        text:   the lower-cased speech text
        tokens: numpy array of tokens; rewritten in place by the cleaning
                methods below
    """

    def __init__(self, speech_year, speech_pres, speech_text):
        """
        The __init__ method is called every time an object is instantiated.
        This is where you define all the properties the object must have
        when it is `born`.

        input: speech_year: year attached to the speech
               speech_pres: president attached to the speech
               speech_text: raw text; it is lower-cased and tokenized here
        """

        # These are data members
        self.year = speech_year
        self.pres = speech_pres
        self.text = speech_text.lower()
        self.tokens = np.array(wordpunct_tokenize(self.text))

    def token_clean(self, length):

        """
        description: keep only purely alphabetic tokens that are strictly
        longer than 'length'; everything else is dropped
        input: length: cut off length (tokens with len <= length are removed)
        """

        self.tokens = np.array([t for t in self.tokens
                                if t.isalpha() and len(t) > length])

    def stopword_remove(self, stopwords):

        """
        description: Remove stopwords from tokens.
        input: stopwords: a suitable list (or array) of stopwords
        """

        # A set gives constant-time membership tests and, unlike an empty
        # numpy array, behaves sensibly when there are no stopwords at all.
        blocked = set(stopwords)
        self.tokens = np.array([t for t in self.tokens if t not in blocked])

    def stem(self):

        """
        description: Stem tokens with Porter Stemmer.
        """

        # One stemmer instance is enough; no need to build one per token.
        stemmer = PorterStemmer()
        self.tokens = np.array([stemmer.stem(t) for t in self.tokens])

    def demo_self():
        """
        Demonstration only: this method is declared without `self`, so
        calling it on an instance raises a TypeError (the instance is passed
        as an unexpected positional argument).
        """
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print('this will error out')

In [None]:
import numpy as np
import codecs
import nltk
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import PorterStemmer
import math
import heapq


class Corpus():

    """
    The Corpus class represents a document collection.

    Data members set by __init__:
        docs:         list of Document objects, one per entry of `doc_data`
        N:            number of documents
        clean_length: cut-off length used for tokens and stopwords
        stopwords:    numpy array of stemmed stopwords
        token_set:    set of all tokens in the corpus (the vocabulary)

    Data members set on demand:
        doc_term_matrix: D by V list of counts   (document_term_matrix)
        tf_idf_matrix:   D by V list of scores   (tf_idf)
        dict_ranking:    list of top documents   (dict_rank)

    Matrix columns follow the alphabetically sorted vocabulary, i.e.
    sorted(self.token_set).
    """
    def __init__(self, doc_data, stopword_file, clean_length):
        """
        Notice that the __init__ method is invoked every time an object of
        the class is instantiated.

        input: doc_data: iterable of (year, president, text) triples
               stopword_file: path of a UTF-8 file with one stopword per line
               clean_length: cut-off length for tokens and stopwords
        """

        # Initialise documents by invoking the appropriate class
        self.docs = [Document(doc[0], doc[1], doc[2]) for doc in doc_data]

        self.N = len(self.docs)
        self.clean_length = clean_length

        # get a list of stopwords
        self.create_stopwords(stopword_file, clean_length)

        # token cleaning, stemming and stopword removal on the docs; use the
        # caller's cut-off rather than a hard-coded 2
        self.clean_docs(clean_length)

        # create vocabulary
        self.corpus_tokens()

        # The document-term matrix, the tf-idf scores and the dictionary
        # ranking are not built here; call document_term_matrix(), tf_idf()
        # or dict_rank() when they are needed.

    def clean_docs(self, length):
        """
        Applies token cleaning, stemming and stopword removal to docs.

        The stopword list is stemmed (see create_stopwords), so tokens are
        stemmed *before* they are compared with it; comparing raw tokens
        with stemmed stopwords would miss every stopword whose stem differs
        from the word itself.
        """
        for doc in self.docs:
            doc.token_clean(length)
            doc.stem()
            doc.stopword_remove(self.stopwords)

    def create_stopwords(self, stopword_file, length):
        """
        description: parses a file of stopwords, keeps the words strictly
        longer than 'length' and stems them
        input: length: cutoff length for words
               stopword_file: stopwords file to parse
        """

        with codecs.open(stopword_file, 'r', 'utf-8') as f:
            raw = f.read()

        stemmer = PorterStemmer()
        self.stopwords = np.array([stemmer.stem(word)
                                   for word in raw.splitlines()
                                   if len(word) > length])

    def corpus_tokens(self):
        """
        description: create a set of all tokens, in other words a vocabulary
        """

        # grow one set in place instead of building a new union per document
        self.token_set = set()
        for doc in self.docs:
            self.token_set.update(doc.tokens)

    def _vocab_index(self):
        """
        Return (vocab, index): the sorted vocabulary and a token -> column
        lookup. Built once per matrix so a column lookup is O(1).
        """
        vocab = sorted(self.token_set)
        index = {token: j for j, token in enumerate(vocab)}
        return vocab, index

    def _term_counts(self, doc, index):
        """
        Return the length-V list of raw counts of each vocabulary token in
        `doc`; tokens missing from the vocabulary are ignored.
        """
        counts = [0] * len(index)
        for token in doc.tokens:
            j = index.get(token)
            if j is not None:
                counts[j] += 1
        return counts

    ###### 1.1 doc_term ######
    def document_term_matrix(self):
        """
        description: create a D by V array of frequency counts, stored in
        self.doc_term_matrix
        """
        _, index = self._vocab_index()
        self.doc_term_matrix = [self._term_counts(doc, index)
                                for doc in self.docs]

    ###### 1.2 tf_idf ######

    def tf_idf(self):
        """
        description: create a D by V array of tf-idf scores, stored in
        self.tf_idf_matrix (row i belongs to self.docs[i]).

        score = (1 + log(tf)) * log(N / df) for tf > 0, and 0 otherwise.

        The result is deliberately NOT stored in self.tf_idf: that would
        replace this method on the instance and break any second call.
        """
        vocab, index = self._vocab_index()
        counts = [self._term_counts(doc, index) for doc in self.docs]

        # document frequency: number of documents containing each token
        doc_freq = [0] * len(vocab)
        for row in counts:
            for j, count in enumerate(row):
                if count:
                    doc_freq[j] += 1

        # float() forces true division on Python 2 as well
        idf = [math.log(float(self.N) / df) if df else 0.0
               for df in doc_freq]

        self.tf_idf_matrix = [
            [(1 + math.log(count)) * idf[j] if count else 0
             for j, count in enumerate(row)]
            for row in counts
        ]

    ###### 1.3 dict_rank ######
    def dict_rank(self, n, dictionary, rep_tokens):
        """
        description: rank documents by the summed weight of the dictionary
        tokens and keep the top n in self.dict_ranking (also returned)
        input: n: number of documents to keep
               dictionary: collection of (already stemmed) tokens
               rep_tokens: "doc_term" for raw counts, "tf_idf" for tf-idf
        raises: ValueError if rep_tokens is neither of the two names
        """

        if rep_tokens == "doc_term":
            self.document_term_matrix()
            scores = self.doc_term_matrix
        elif rep_tokens == "tf_idf":
            self.tf_idf()
            scores = self.tf_idf_matrix
        else:
            raise ValueError("rep_tokens must be 'doc_term' or 'tf_idf', "
                             "got %r" % (rep_tokens,))

        # columns of the dictionary tokens, in the same order as the matrices
        vocab, _ = self._vocab_index()
        columns = [j for j, token in enumerate(vocab) if token in dictionary]

        weights = [sum(row[j] for j in columns) for row in scores]

        which_max = heapq.nlargest(n, range(len(weights)),
                                   key=weights.__getitem__)
        self.dict_ranking = [self.docs[i] for i in which_max]
        return self.dict_ranking

In [None]:
def parse_text(textraw, regex):
    """Split a raw text dump into sorted, mutable speech records.

    This is the conversion step needed before the records can be handed to
    the classes above.

    The pattern is compiled with MULTILINE and DOTALL and applied with
    findall; every match is turned into a list, and all newline characters
    are stripped from its third field. The records are returned in sorted
    order (plain list comparison, so by first field, then second, ...).

    input: textraw: the raw string to search
           regex: pattern with (at least) three capture groups
    """
    pattern = re.compile(regex, re.MULTILINE | re.DOTALL)

    records = []
    for match in pattern.findall(textraw):
        # findall hands back immutable tuples; work on a list copy
        fields = list(match)
        fields[2] = fields[2].replace('\n', '')
        records.append(fields)

    return sorted(records)

# Read the raw speech dump; the context manager closes the file handle
# as soon as the text is in memory.
with open('./data/pres_speech/sou_all.txt', 'r') as speech_file:
    text = speech_file.read()

# Raw string so that \d and \* reach the regex engine without relying on
# Python passing unknown string escapes through. Capture groups:
#   1. a four-digit number (the year)
#   2. the alphabetic word directly before the "_***" marker
#   3. the text that starts with two newlines and runs up to three newlines
regex = r"_(\d{4}).*?_[a-zA-Z]+.*?_[a-zA-Z]+.*?_([a-zA-Z]+)_\*+(\n{2}.*?)\n{3}"
pres_speech_list = parse_text(text, regex)

corpus = Corpus(pres_speech_list, './data/stopwords/stopwords.txt', 2)

corpus.document_term_matrix()
word_mat = np.array(corpus.doc_term_matrix) # D x V matrix

In [None]:
# Sample topic allocation from multinomial using current B and T
## Performed: Once per sweep, as the function itself loops over the whole matrix
## Input: beta (K x V, one row per topic), theta (D x K, one row per document)
##        and the D x V word matrix of the corpus
## Output: The word matrix with every non-zero cell replaced by a topic in 1..K
def topic_assignment(beta, theta, word_mat, D, V):
    """
    Draw a topic for every non-zero cell of `word_mat`.

    For cell (d, v) the topic k is drawn with probability proportional to
    theta[d][k] * beta[k][v]. Cells equal to 0 (word absent from the
    document) are left at 0. `word_mat` is modified in place and returned.
    """
    theta = np.asarray(theta, dtype=float)  # D x K
    beta = np.asarray(beta, dtype=float)    # K x V

    for d in range(0, D):
        # here go over the documents in the corpus
        for v in range(0, V):
            # So here we will go over the words in the document
            if word_mat[d, v] == 0:
                continue  # nothing to assign for an absent word

            # unnormalised probability of each topic k for this (d, v);
            # beta[:, v] is the column of word v across all topics
            prob_num = theta[d] * beta[:, v]
            prob = prob_num / prob_num.sum()

            # one multinomial draw gives a one-hot vector; +1 maps the
            # position to the topic labels 1..K
            word_mat[d, v] = np.random.multinomial(1, prob).argmax() + 1

    return word_mat
    
    
# Sample theta_d
## Performed: As part of a map to repeat over documents
## Input: a "document", which is one row of the word_mat holding topic labels
## Output: a vector of size K to build into a D x K matrix
def sample_theta(a, K, document):
    """
    Draw the topic proportions of one document from a Dirichlet.

    The Dirichlet parameter is, per topic k in 1..K, the number of cells of
    `document` currently labelled k plus the prior `a`. `a` may be a scalar
    or a length-K sequence. Labels outside 1..K are not counted.
    """
    labels = np.asarray(document)
    # array arithmetic, so a scalar prior broadcasts and a list prior is
    # added element-wise instead of being concatenated
    n_d = np.array([np.count_nonzero(labels == k) for k in range(1, K + 1)],
                   dtype=float) + a
    theta_d = np.random.dirichlet(n_d)
    return theta_d

# Sample beta_k
## Performed: As part of a map to repeat over topics
## Input: a "topic" label and the word_mat holding the current topic labels
## Output: a vector of size V to build into a K x V matrix
def sample_beta(eta, word_mat, topic):
    """
    Draw the word distribution of one topic from a Dirichlet.

    The Dirichlet parameter is, per column of `word_mat`, the number of
    rows whose cell is currently labelled `topic`, plus the prior `eta`
    (scalar or length-V sequence).
    """
    # count per column: np.random.dirichlet needs one parameter per word,
    # not a single corpus-wide total
    m_k = (np.asarray(word_mat) == topic).sum(axis=0) + eta
    beta_k = np.random.dirichlet(m_k)
    return beta_k

def gibs_sampler(n, m, N, word_mat, a, eta, K, D, V):
    """
    Run `n` sampling sweeps and collect thinned snapshots of the topic
    assignments.

    input: n: number of sweeps
           m: burn-in; sweeps with index <= m are never saved
           N: thinning; only sweeps whose index is a multiple of N are saved
           word_mat: D x V matrix of current topic labels (0 = absent word);
                     it is updated in place by topic_assignment
           a, eta: Dirichlet priors passed to sample_theta / sample_beta
           K, D, V: number of topics, documents and vocabulary terms
    output: list of D x V arrays, one independent copy per saved sweep
    """
    result_list = []
    topics = range(1, K + 1)

    for i in range(0, n):
        theta = [sample_theta(a, K, doc) for doc in word_mat]
        beta = [sample_beta(eta, word_mat, topic) for topic in topics]
        word_mat = topic_assignment(beta, theta, word_mat, D, V)

        # test the sweep index i, not the total n (which never changes)
        if i % N == 0 and i > m:
            # copy: word_mat keeps being overwritten in place, so storing
            # the array itself would make every saved entry identical
            result_list.append(np.copy(word_mat))

    return result_list

# Initialise with K+1 so we do not count any as an actual topic
K = 10
word_mat[np.nonzero(word_mat)] = K+1
# word_mat is the D x V document-term matrix, so its shape gives both sizes
D, V = word_mat.shape

n = 8000
m = 1000 # burn in
N = 100 # take results every N

# TODO(review): the Dirichlet priors were left undefined in the original
# cell; the symmetric values below are placeholders chosen only so that the
# cell runs -- replace them with the intended hyper-parameters.
a = 1.0
eta = 1.0

# TODO: NEED TO CHECK THE DIMENSIONALITY OF MATRICES ETC
gibbs = gibs_sampler(n, m, N, word_mat, a, eta, K, D, V)
    

In [69]:
import numpy as np
# Scratch check: a 2 x 2 array whose only non-zero entries are off the diagonal.
A = np.array([[0,2],[3,0]])

# np.nonzero returns one index array per axis for the non-zero entries
# (see the cell output below: rows [0, 1], columns [1, 0]).
np.nonzero(A)



(array([0, 1], dtype=int64), array([1, 0], dtype=int64))

In [65]:
# Scratch check: list.append adds its argument as ONE element, so appending
# a list nests it instead of extending b (see the cell output below).
b = []
b.append(1)
b.append([1,2])
b

[1, [1, 2]]