In [1]:
%matplotlib inline


Bad key savefig.frameon in file /home/michele/miniconda3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 421 ('savefig.frameon : True')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.3.3/matplotlibrc.template
or from the matplotlib source distribution

Bad key verbose.level in file /home/michele/miniconda3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 472 ('verbose.level  : silent      # one of silent, helpful, debug, debug-annoying')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.3.3/matplotlibrc.template
or from the matplotlib source distribution

Bad key verbose.fileo in file /home/michele/miniconda3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 473 ('verbose.fileo  : sys.stdout  # a log filename, sys.stdout or sys.stderr')
You probably need to g

# SNLP exercise sheet 5

In [2]:
from os.path import join
from glob import glob
import nltk
from nltk.corpus import stopwords
from tqdm.notebook import trange, tqdm

from string import punctuation
import random
import numpy as np

class LdaModel(object):
    # add some instance variables for storing the corpus
    
    
    # Exercise 1 ###################################################################
    def __init__(self, path_to_corpus, k):
        '''
        Import and preprocess the corpus.
        Parameters: path_to_corpus: string; path to the directory containing the corpus
        '''

        # list of tokenized documents (list of lists of tokens)
        self.corpus = []
        # list of topics for document (list of lists of topics)
        self.topics = []
        # total number of tokens in the whole corpus
        self.N = 0
        # number of topics
        self.k = k

        self.words_index = {}

        # counts
        self.topic_word_counts = None
        self.document_topic_counts = None
        self.topic_count = None

        # preprocess all documents in the corpus
        self.filenames = []
        for newsgroup_dir in glob(join(path_to_corpus, '*')):
            for filename in glob(join(newsgroup_dir, '*')):
                self.filenames.append(filename)

        # cut corpus to help developement
        self.filenames = self.filenames[:int(len(self.filenames) * 0.01)]

        for filename in tqdm(self.filenames, desc='importing corpus'):
            try:
                self.corpus.append(self._preprocess(filename))
            except Exception as e:
                print(filename, 'error', e)

        words = set()
        # initialize document topics randomly
        for document in self.corpus:
            document_topics = []
            for token in document:
                self.N += 1
                words.add(token)
                document_topics.append(random.randint(0, self.k-1))
            self.topics.append(document_topics)

        # assign an index to words
        for i, word in enumerate(words):
            self.words_index[word] = i

        # topic count
        self.topic_count = np.zeros(20)
        # topics on the rows, words on the columns
        self.topic_word_counts = np.zeros((self.k, len(self.words_index)))
        # documents on the rows, topics on the columns
        self.document_topic_counts = np.zeros((len(self.corpus), self.k))

        for i, (document_words, document_topics) in tqdm(enumerate(zip(self.corpus, self.topics)), desc='inizializing structures'):
            for word, topic in zip(document_words, document_topics):
                self.topic_count[topic] += 1
                self.topic_word_counts[topic, self.words_index[word]] += 1
                self.document_topic_counts[i, topic] += 1


    @staticmethod
    def _preprocess(filename):
        with open(filename) as infile:
            data = [line.strip() for line in infile]

        text = []
        header_ended = False
        for line in data:
            # extract the subject
            if line.lower().startswith('subject: '):
                text.append(line.split(': ', 1)[1])
            # detect end of header
            if line == '':
                header_ended = True
            # get message lines
            if header_ended and line != '':
                text.append(line)

        # join in a single string
        text = '\n'.join(text)
        # tokenize the string
        tokens = nltk.word_tokenize(text)

        # remove stopwords
        stopwords_set = set(stopwords.words('english'))
        tokens = list(filter(lambda token: token.lower() not in stopwords_set, tokens))

        # remove punctuation
        tokens = list(filter(lambda token: token not in punctuation, tokens))

        return tokens
    
    # Exercise 2 ###################################################################
    def gibbs_sampling(self, num_iterations, alpha=0.25, beta=0.1):
        '''
        Implement the LDA Gibbs sampling algorithm.
        Parameters: num_iterations: int; number of sampling steps to do for each word
                    alpha: float; alpha parameter of the Dirichlet prior distribution
                    beta: float; beta parameter of the Dirichlet prior distribution
        '''
        
        for _ in range(num_iterations):
            for i, (document_words, document_topics) in tqdm(enumerate(zip(self.corpus, self.topics)), desc='documents'):
                for j, (word, topic) in tqdm(enumerate(zip(document_words, document_topics)), desc='words'):
                    self.document_topic_counts[i, topic] -= 1
                    self.topic_word_counts[topic, self.words_index[word]] -= 1
                    self.topic_count[topic] -= 1

                    distribution = []
                    for k in range(self.k):
                        normalization = sum(self.topic_word_counts[k,:]) + beta * len(self.words_index)
                        p_zk = (self.document_topic_counts[i,k] + alpha) *\
                               (self.topic_word_counts[k,self.words_index[word]] + beta) /\
                               normalization
                        distribution.append(p_zk)

                    new_topic = random.choices(range(self.k), distribution)
                    # assign new topic
                    document_topics[j] = new_topic
                    # update counts
                    self.topic_count[new_topic] += 1
                    self.topic_word_counts[new_topic, self.words_index[word]] += 1
                    self.document_topic_counts[i, new_topic] += 1

    def print_predictions(self):
        for document_name, document_topic_count in zip(self.filenames, self.document_topic_counts):
            print(f'File: "{document_name}", topic: {document_topic_count.argmin()}')


In [3]:
# test preprocessing
print(LdaModel._preprocess('corpus/alt.atheism/51267'))

['Benediktine', 'Metaphysics', 'Benedikt', 'Rosenau', 'writes', 'great', 'authority', 'CONTRADICTORY', 'EXIST', '``', 'Contradictory', "''", 'property', 'language', 'correct', 'THINGS', 'DEFINED', 'CONTRADICTORY', 'LANGUAGE', 'EXIST', 'object', 'definitions', 'reality', 'amend', 'THINGS', 'DESCRIBED', 'CONTRADICTORY', 'LANGUAGE', 'EXIST', "'ve", 'come', 'something', 'plainly', 'false', 'Failures', 'description', 'merely', 'failures', 'description', "'m", 'objectivist', 'remember', '--', 'C.', 'Wingate', '``', 'peace', 'God', 'peace', 'strife', 'closed', 'sod', 'mangoe', 'cs.umd.edu', 'Yet', 'brothers', 'pray', 'one', 'thing', 'tove', 'mangoe', "marv'lous", 'peace', 'God', "''"]


In [4]:
# initialize model
lda = LdaModel('corpus', 20)

HBox(children=(HTML(value='importing corpus'), FloatProgress(value=0.0, max=2000.0), HTML(value='')))

corpus/rec.sport.baseball/104352 error 'utf-8' codec can't decode byte 0xd1 in position 1658: invalid continuation byte
corpus/talk.religion.misc/83651 error 'utf-8' codec can't decode byte 0xb6 in position 1191: invalid start byte
corpus/comp.sys.ibm.pc.hardware/60366 error 'utf-8' codec can't decode byte 0xfe in position 572: invalid start byte
corpus/comp.sys.mac.hardware/51917 error 'utf-8' codec can't decode byte 0xe4 in position 255: invalid continuation byte
corpus/comp.sys.mac.hardware/51892 error 'utf-8' codec can't decode byte 0xb2 in position 407: invalid start byte
corpus/comp.sys.mac.hardware/51904 error 'utf-8' codec can't decode byte 0xb5 in position 1546: invalid start byte
corpus/comp.sys.mac.hardware/51865 error 'utf-8' codec can't decode byte 0xe4 in position 313: invalid continuation byte



HBox(children=(HTML(value='inizializing structures'), FloatProgress(value=1.0, bar_style='info', layout=Layout…




In [5]:
print(len(lda.corpus))
print(len(lda.words_index))
print(lda.topic_count)
print(lda.topic_word_counts)
print(lda.document_topic_counts)

1993
66744
[17835. 17887. 17937. 18059. 17999. 18040. 18105. 17942. 18109. 17961.
 17781. 18067. 18001. 18047. 18015. 17788. 18029. 17973. 17740. 18277.]
[[1. 0. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]
[[ 4.  5.  2. ...  4.  4.  1.]
 [18. 16. 21. ... 11. 13. 12.]
 [ 3.  1.  3. ...  2.  2.  2.]
 ...
 [ 7.  4.  5. ...  9.  5.  9.]
 [ 6.  8. 11. ... 11.  7.  7.]
 [ 3.  1.  3. ...  0.  4.  4.]]


In [None]:
# 1% of the corpus is considered to speed the test
# uncomment the line in the constructor to work on the full corpus
lda.gibbs_sampling(1)

HBox(children=(HTML(value='documents'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px')…

HBox(children=(HTML(value='words'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), ma…




HBox(children=(HTML(value='words'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), ma…

In [None]:
lda.print_predictions()