# Project 1 Probabilistic Programming

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import nltk
import re
import pymc as pm
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import math

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to C:\Users\Daniel
[nltk_data]     Ciovica\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Daniel
[nltk_data]     Ciovica\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

To implement the first and the second task I have created a class called LDA. There, I have initialized the required dimensions and created alpha, beta, phi, theta, z and w as described in task 1, as well as the following functions that are to be described: 

def __init__ :

- K is the number of topics
- V is the size of the vocabulary
- M is the number of documents
- N contains the number of words for each document
- Phi is a container composed of a completed dirichlet that has a prior distribution as a dirichlet of length K
- Theta is a container composed of a completed dirichlet that has a prior distribution as a dirichlet of length M
- Z is a contianer composed of a categorical distribution with probabilities theta, having K values of size N[i], where i is the number of words of the i th document
- w is a contianer composed of a categorical distribution of a lambda function that is determined using phi[z] and z[i][j], where i, j are the i^th document and j^th number of i^th document . To the categorical distribution the data and the observed=True parameter are given. 

def compile :

- create the model using all the above described parameters
- create the mcmc model
- sample the model according to the set iterations and burn

def trace :

- trace the phi, theta and z samples and display them

def topic_words:

- using the phi values, I have displayed the 5 most representative words for each topic

def hellinger_distance:

- to calculate the similarity between two documents of the same topic, I have used the Hellinger Distance that calculates the difference between the square root of every element of the same index of the first and second document, everything squared. 

def documents_similarity:

- iterating through the traced theta samples, search for the corresponding documnets and call the function that calculates the dellinger distance

def assign_topics_new_documnets:

- iterating through all the new data, add the traced phi values corresponding to the new provided words to get the probability of every document belonging to each topic.
- predict the topic by selecting the one with the maximum value 


In [2]:
class LDA():
    def __init__(self, data, K, V, N, M, iterations, burn, second_df=None):
        self.data = data
        self.K = K
        self.V = V
        self.M = M
        self.N = N
        self.iterations = iterations
        self.burn = burn

        self.alpha = np.ones(self.K)
        self.beta = np.ones(self.V)
        
        self.theta_prior = pm.Container([pm.Dirichlet("theta_prior_%s" % i, theta = self.alpha) for i in range(self.M)])
        self.theta = pm.Container([pm.CompletedDirichlet("theta_%s" % i, self.theta_prior[i]) for i in range(self.M)])

        self.phi_prior = pm.Container([pm.Dirichlet("phi_prior_%s" % i, theta=self.beta) for i in range(self.K)])
        self.phi = pm.Container([pm.CompletedDirichlet("phi_%s" %i, self.phi_prior[i]) for i in range(self.K)])

        self.z = pm.Container([pm.Categorical("z_%s" %i, p = self.theta[i], 
                                size = self.N[i],
                                value = np.random.randint(self.K, size=self.N[i]))
                                for i in range(self.M)
                                ])
        self.w = pm.Container([pm.Categorical("w_%s, %s" % (i, j),
                                p = pm.Lambda("phi_z_%s_%s" % (i, j),
                                lambda z=self.z[i][j],
                                phi = self.phi:phi[z]),
                                value = self.data[i][j],
                                observed = True)
                                for i in range(self.M) for j in range(self.N[i])
                                ])

    def compile(self):
        self.model = pm.Model([self.theta, self.phi, self.z, self.w, self.theta_prior, self.phi_prior])
        self.mcmc = pm.MCMC(self.model)
        self.mcmc.sample(self.iterations, self.burn)

    def trace(self):
        self.phi_samples = [self.mcmc.trace("phi_%s" %k)[:].mean(axis=0) for k in range(self.K)]
        self.theta_samples = [self.mcmc.trace("theta_%s" %m)[:].mean(axis=0) for m in range(self.M)]
        self.z_samples = [np.round(self.mcmc.trace("z_%s" %k)[:].mean(axis=0)) for k in range(self.M)]
        print("\n\n Phi")
        for phi in self.phi_samples:
            print("\n", phi)
        print("\n Theta")
        for theta in self.theta_samples:
            print("\n", theta)
        print("\n Z")
        for z in self.z_samples:
            print("\n", z)
    
    def topic_words(self):
        words_per_topic = []
        for index, topic in enumerate(self.phi.value):
            words_per_topic.append(np.argsort(topic[0])[-5:])
        return(words_per_topic)

    @staticmethod
    def hellinger_distance(doc_1, doc_2):
        result = 0
        for index in range(len(doc_1[0])):
            result += pow((math.sqrt(doc_1[0][index]) - (math.sqrt(doc_2[0][index]))), 2)
        return result

    def documents_similarity(self, threshold):
        similarities = []
        for index_1, doc_1 in enumerate(self.theta_samples):
            for index_2, doc_2 in enumerate(self.theta_samples):
                if index_1 % 2 == 0 and index_2 %2 ==1  and index_1 + 1== index_2:
                    similarities.append([index_1, index_2, 1 - self.hellinger_distance(doc_1, doc_2)])

        similarities = [similarity for similarity in similarities if similarity[2] > threshold]

        
        for index, similarity in enumerate(similarities):
            print("Topic {}: ".format(index))
            print("{} \n".format(similarity))


    def assign_topics_new_document(self, second_df):
        topics = []
        for i in range(len(second_df)):
            result = []
            for k in range(self.K):
                topic_sum = 0
                for word in second_df[i]:
                    topic_sum += self.phi_samples[k][0][word]
                result.append(topic_sum)
            topics.append(np.argsort(result)[-1])
        
        print(topics)

The following function processes the original data set by:

- eliminating whitespace
- eliminating the words containig numbers
- get only the words that are not in the stop_words list
- get rid of any punctuation
- lemmatizing each word, by verb

In [3]:
def lemmatizeText(text):
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    new_text =  [w.lower() for w in w_tokenizer.tokenize(text) if bool(re.search(r'\d', w)) == False]
    new_text = [re.sub(r'[^\w\s]','',x) for x in new_text if x not in stop_words ]
    aux_text =[lemmatizer.lemmatize(w, pos='v') for w in new_text]
    return aux_text

Because the original dataset is way too big, the following function selects only the first two documents for each topic, storing them in the first_df dataframe. The second_df will be used for assigning topics to new, unseen documents. 

In [4]:
def getData(df):

    first_df = pd.DataFrame()
    aux_df = pd.DataFrame()
    second_df = pd.DataFrame()

    for i in df['category'].unique():
        aux_df = df.query('category==@i')
        aux_df.reset_index(inplace=True)
        first_df = first_df.append(aux_df.iloc[0], ignore_index=True)
        first_df = first_df.append(aux_df.iloc[1], ignore_index=True)
        second_df = second_df.append(aux_df.iloc[2], ignore_index=True)

    first_df.reset_index(inplace=True)
    first_df = pd.DataFrame(first_df['text_lemmatized'])

    second_df.reset_index(inplace=True)
    second_df = pd.DataFrame(second_df['text_lemmatized'])

    return first_df, second_df


The following function returns a list containing all the words in the new created dataset that are to be labeled

In [5]:
def createVocab(first_df):
    documents = []
    for i in range(len(first_df.index)):
        for x in first_df.iloc[i]['text_lemmatized']:
            documents.append(x)
    return documents
# print(documents)

The labelDocs function labels the data using all the provided words and transforms each word in each document with the corresponding number.

For the second dataframe, the words that were to be processed consisted only in those already labeled by the encoder. 

In [6]:
def labelDocs(first_df, second_df, vocab):

    le = LabelEncoder()
    first_df_copy = first_df.copy()
    second_df_copy = second_df.copy()
    le.fit(vocab)

    for i in range(len(first_df_copy.index)):
        first_df_copy.iloc[i]['text_lemmatized'] = le.transform(first_df_copy.iloc[i]['text_lemmatized'])

    for i in range(len(second_df_copy.index)):
        list_words = second_df_copy.iloc[i]['text_lemmatized']
        list_words = [word for word in list_words if word in vocab]
        second_df_copy.iloc[i]['text_lemmatized'] = le.transform(list_words)

    first_df_copy = first_df_copy.to_numpy().reshape(-1)
    second_df_copy = second_df_copy.to_numpy().reshape(-1)

    return first_df_copy, second_df_copy

The below function is used to calculate the M, N, V, K variables required for the LDA. As described above:

- K is the number of topics
- V is the size of the vocabulary
- M is the number of documents
- N contains the number of words for each document

In [7]:
def generateDims(df, first_df, first_df_labeled, categories):

    M = len(first_df.index)
    N= []
    for i in range(M):
        N.append(len(first_df.iloc[i]['text_lemmatized']))
    max_v = 0
    for array in first_df_labeled:
        if max_v < max(array):
            max_v = max(array)
    V = max_v + 1
    K = len(categories)

    print("M = {}".format(M))
    print("N = {}".format(N))
    print("V = {}".format(V))
    print("K = {}".format(K))

    return M, N, V, K


The function below returns the top 5 most representative words in each topic according to the model

In [8]:
def topWords(lda, categories, vocab):
    words_array = lda.topic_words()
    print("The topics are: {} \n".format(categories))
    for index_topic, index_w_array in enumerate(words_array):
        words_topic = []
        print("Topic {} \n".format(index_topic)) 
        for index_w in index_w_array:
            words_topic.append(vocab[index_w])
        print("Top 5 words: {} \n".format(words_topic))

The cell below is used to create the sanity checks in order to see if the model is performing correctly on data 

In [9]:
data_1 = [[0, 1, 0], [1, 0, 1], [0, 1, 1, 0], [2, 3], [2, 3, 3], [2, 3, 3, 2]]
data_2 = [[0, 1 ,2 ,3 ,4 ,5, 6 ,7],[0, 8, 9, 10, 11, 3, 12, 13],[14, 15, 16, 2, 17, 18, 19],[20, 12, 21, 22, 23, 24],[25, 26, 27, 3, 9, 28, 18]]

def sanity_check_1(data_1):
    N = [len(doc) for doc in data_1]
    K = 2
    V = 4
    M = 6
    lda_sanity = LDA(data_1, K=K, V=V, N=N, M=M, iterations=40000, burn=1000)
    lda_sanity.compile()
    lda_sanity.trace()

def sanity_check_2(data_2):
    N = [len(doc) for doc in data_2]
    K = 2
    V = 29
    M = 5
    lda_sanity_2 = LDA(data_2, K=K, V=V, N=N, M=M, iterations=40000, burn=1000)
    lda_sanity_2.compile()
    lda_sanity_2.trace()

The below cell, consists of the main function that runs all the above described functions

In [10]:
df = pd.read_csv(r'bbc-news-data.csv', sep='\t')
categories = df['category'].unique()
df.drop(['filename', 'title'], axis = 1, inplace=True)
df['text_lemmatized'] = df.content.apply(lemmatizeText)

print("\n\n ~~~~~~~~~~Task 1~~~~~~~~~~\n\n")

print("\n\n ~~~~Data~~~~\n\n")
first_df, second_df = getData(df)
print("{}".format(first_df))
vocab = createVocab(first_df)

print("\n\n ~~~~Data Labeled Sample~~~~ \n\n")
first_df_labeled, second_df_labeled = labelDocs(first_df, second_df, vocab)
print("{}".format(first_df_labeled[0]))

print("\n\n ~~~~Variables~~~~ \n\n")
M, N, V, K = generateDims(df, first_df, first_df_labeled, categories)

print("\n\n ~~~~LDA~~~~ \n\n")
lda = LDA(first_df_labeled, K=K, V=V, N=N, M=M, iterations=40000 ,burn=1000)
lda.compile()
lda.trace()

print("\n\n ~~~~TOP WORDS~~~~ \n\n")
topWords(lda, categories, vocab)

print("\n\n ~~~~~~~~~~Task 2~~~~~~~~~~\n\n")

print("\n\n ~~~~Similarity between documents of the same topic~~~~")
lda.documents_similarity(0.8)

print("\n\n ~~~~Topics assigned to new documents~~~~")
print("\n\n ~~~~New Doc Sample~~~~ \n\n")
print("{}\n ".format(second_df))

print("\n\n ~~~~Assigned Topics~~~~\n\n")
lda.assign_topics_new_document(second_df = second_df_labeled)

print("\n\n ~~~~True Topics~~~~\n\n")
true_topics = [0, 1, 2, 3, 4]
print(true_topics)


print("\n\n ~~~~~~~~~~Sanity Checks~~~~~~~~~~ \n\n ")
print("~~~~Sanity Checks 1~~~~ \n\n ")
sanity_check_1(data_1)
print("~~~~Sanity Checks 2~~~~ \n\n ")
sanity_check_2(data_2)



 ~~~~~~~~~~Task 1~~~~~~~~~~




 ~~~~Data~~~~


                                     text_lemmatized
0  [quarterly, profit, us, media, giant, timewarn...
1  [dollar, hit, highest, level, euro, almost, th...
2  [christmas, tree, receive, text, message, unve...
3  [french, musician, jeanmichel, jarre, perform,...
4  [maternity, pay, new, mother, rise, part, new,...
5  [information, commissioner, say, urgently, ask...
6  [british, hurdler, sarah, claxton, confident, ...
7  [sonia, osullivan, indicate, would, like, part...
8  [kyrgyz, republic, small, mountainous, state, ...
9  [chinese, authorities, close, net, cafes, clos...


 ~~~~Data Labeled Sample~~~~ 


[ 772  755 1005  604  412  967  523  960  620  251 1057  374  669  105
  507  418   98  837  458  505  207  454   13  837  967  846  389  771
  837  822  755  135  670  402  665  755  282 1024  129  553 1007   47
  965 1024  846  397  686  853  418  505  137   47  616  386  580  933
  389  771  755  582  735  960  771  471  195  84

As it can bee seen from the logs, if the LDA were to iterate 40000 times, then it would correctly assign topics to 80% of the documents. Moreover, the sanity checks show that the model is working fine. 
