https://ethen8181.github.io/machine-learning/clustering_old/topic_model/LDA.html

https://agustinus.kristia.de/techblog/2017/09/07/lda-gibbs/

In [1]:
import nltk
import re
import spacy
import numpy as np
import random

In [2]:
def to_lowercase(document):
    document = document.lower()
    return document

def remove_special_characters_english(document):
    document = re.sub(r'[^a-zA-Z\s]', '', document)
    return document

def tokenize(document):
    document = document.split()
    return document

stopwords_english = nltk.corpus.stopwords.words('english')
def remove_stopwords(document):
    document = [token for token in document if token not in stopwords_english]
    return document

def remove_letters(document):
    document = [token for token in document if len(token) > 1]
    return document

def lemmatize(tokenized_document):
    nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser', 'textcat'])
    doc = nlp(" ".join(tokenized_document))
    document = [token.lemma_ for token in doc]
    return document

In [3]:
def preprocess(documents):
    preprocessed_documents = []

    for document in documents:
        document = to_lowercase(document)
        document = remove_special_characters_english(document)
        document = tokenize(document)
        document = remove_stopwords(document)
        document = lemmatize(document)
        document = remove_letters(document)
        preprocessed_documents.append(document)
    return preprocessed_documents

In [4]:
# coffee
doc_1 = "The aroma of freshly brewed coffee tantalized my senses as I entered the cozy cafe on a brisk morning. \
        With each sip of rich, dark espresso, I felt a surge of energy coursing through my veins, \
        awakening my mind and spirit. As the steam rose from my mug, I savored the robust flavor of \
        the carefully roasted beans, \
        relishing in the simple pleasure of a perfect cup of coffee."

# tea
doc_2 = "The aroma of freshly steeped tea leaves filled the air, creating a sense of tranquility in the serene tea room. \
        With each delicate sip of fragrant jasmine tea, I felt a soothing warmth spreading through my body, \
        calming my senses. As I held the porcelain teacup in my hands, I savored the gentle infusion of flavors, \
        embracing the peaceful moment offered by a perfect cup of tea."

# sleep
doc_3 = "As night falls, the world fades away into a realm of dreams where the mind finds solace in the \
        embrace of sleep. In the depths of slumber, the body enters a state of restorative rest, \
        replenishing its energy for the challenges of the coming day. \
        Each peaceful breath in the quiet of the night serves as a reminder of the profound importance \
        of a good night's sleep for overall well-being."

# morning routines
doc_4 = "Morning routines set the tone for the day, offering a structured start to ensure productivity and balance. \
        From the invigorating aroma of freshly brewed coffee to the calming ritual of meditation, \
        each activity contributes to a harmonious beginning. Whether it's a brisk jog in the crisp \
        morning air or a moment of reflection amidst the chaos, morning routines provide a sacred space \
        to align mind, body, and spirit for the day ahead."


documents = [doc_1, doc_2, doc_3, doc_4]

In [5]:
preprocessed_documents = preprocess(documents)
preprocessed_documents

[['aroma',
  'freshly',
  'brew',
  'coffee',
  'tantalize',
  'sense',
  'enter',
  'cozy',
  'cafe',
  'brisk',
  'morning',
  'sip',
  'rich',
  'dark',
  'espresso',
  'feel',
  'surge',
  'energy',
  'course',
  'vein',
  'awaken',
  'mind',
  'spirit',
  'steam',
  'rise',
  'mug',
  'savor',
  'robust',
  'flavor',
  'carefully',
  'roast',
  'bean',
  'relish',
  'simple',
  'pleasure',
  'perfect',
  'cup',
  'coffee'],
 ['aroma',
  'freshly',
  'steep',
  'tea',
  'leave',
  'fill',
  'air',
  'create',
  'sense',
  'tranquility',
  'serene',
  'tea',
  'room',
  'delicate',
  'sip',
  'fragrant',
  'jasmine',
  'tea',
  'feel',
  'soothe',
  'warmth',
  'spread',
  'body',
  'calm',
  'sense',
  'hold',
  'porcelain',
  'teacup',
  'hand',
  'savored',
  'gentle',
  'infusion',
  'flavor',
  'embrace',
  'peaceful',
  'moment',
  'offer',
  'perfect',
  'cup',
  'tea'],
 ['night',
  'fall',
  'world',
  'fade',
  'away',
  'realm',
  'dream',
  'mind',
  'find',
  'solace',


In [6]:
preprocessed_documents

[['aroma',
  'freshly',
  'brew',
  'coffee',
  'tantalize',
  'sense',
  'enter',
  'cozy',
  'cafe',
  'brisk',
  'morning',
  'sip',
  'rich',
  'dark',
  'espresso',
  'feel',
  'surge',
  'energy',
  'course',
  'vein',
  'awaken',
  'mind',
  'spirit',
  'steam',
  'rise',
  'mug',
  'savor',
  'robust',
  'flavor',
  'carefully',
  'roast',
  'bean',
  'relish',
  'simple',
  'pleasure',
  'perfect',
  'cup',
  'coffee'],
 ['aroma',
  'freshly',
  'steep',
  'tea',
  'leave',
  'fill',
  'air',
  'create',
  'sense',
  'tranquility',
  'serene',
  'tea',
  'room',
  'delicate',
  'sip',
  'fragrant',
  'jasmine',
  'tea',
  'feel',
  'soothe',
  'warmth',
  'spread',
  'body',
  'calm',
  'sense',
  'hold',
  'porcelain',
  'teacup',
  'hand',
  'savored',
  'gentle',
  'infusion',
  'flavor',
  'embrace',
  'peaceful',
  'moment',
  'offer',
  'perfect',
  'cup',
  'tea'],
 ['night',
  'fall',
  'world',
  'fade',
  'away',
  'realm',
  'dream',
  'mind',
  'find',
  'solace',


In [7]:
# generate vocabulary
flattened_list = [word for sublist in preprocessed_documents for word in sublist]
vocab = set(flattened_list)
vocab

{'activity',
 'ahead',
 'air',
 'align',
 'amidst',
 'aroma',
 'awaken',
 'away',
 'balance',
 'bean',
 'begin',
 'body',
 'breath',
 'brew',
 'brisk',
 'cafe',
 'calm',
 'carefully',
 'challenge',
 'chaos',
 'coffee',
 'come',
 'contribute',
 'course',
 'cozy',
 'create',
 'crisp',
 'cup',
 'dark',
 'day',
 'delicate',
 'depth',
 'dream',
 'embrace',
 'energy',
 'ensure',
 'enter',
 'espresso',
 'fade',
 'fall',
 'feel',
 'fill',
 'find',
 'flavor',
 'fragrant',
 'freshly',
 'gentle',
 'good',
 'hand',
 'harmonious',
 'hold',
 'importance',
 'infusion',
 'invigorate',
 'jasmine',
 'jog',
 'leave',
 'meditation',
 'mind',
 'moment',
 'morning',
 'mug',
 'night',
 'offer',
 'overall',
 'peaceful',
 'perfect',
 'pleasure',
 'porcelain',
 'productivity',
 'profound',
 'provide',
 'quiet',
 'realm',
 'reflection',
 'relish',
 'reminder',
 'replenish',
 'rest',
 'restorative',
 'rich',
 'rise',
 'ritual',
 'roast',
 'robust',
 'room',
 'routine',
 'sacred',
 'savor',
 'savored',
 'sense',
 

In [10]:
# word to index / index to word mapping
word_2_idx = {}
idx_2_word = {} 
count = 0
for word in vocab:
    word_2_idx[word] = count
    idx_2_word[count] = word
    count += 1
word_2_idx

{'freshly': 0,
 'morning': 1,
 'steam': 2,
 'leave': 3,
 'calm': 4,
 'soothe': 5,
 'sense': 6,
 'infusion': 7,
 'awaken': 8,
 'amidst': 9,
 'relish': 10,
 'spread': 11,
 'sleep': 12,
 'wellbeing': 13,
 'start': 14,
 'brisk': 15,
 'crisp': 16,
 'tea': 17,
 'find': 18,
 'world': 19,
 'whether': 20,
 'come': 21,
 'ahead': 22,
 'delicate': 23,
 'ritual': 24,
 'rest': 25,
 'embrace': 26,
 'tranquility': 27,
 'good': 28,
 'porcelain': 29,
 'jog': 30,
 'surge': 31,
 'day': 32,
 'provide': 33,
 'roast': 34,
 'create': 35,
 'away': 36,
 'set': 37,
 'quiet': 38,
 'air': 39,
 'slumber': 40,
 'teacup': 41,
 'replenish': 42,
 'chaos': 43,
 'espresso': 44,
 'depth': 45,
 'space': 46,
 'simple': 47,
 'dream': 48,
 'challenge': 49,
 'vein': 50,
 'harmonious': 51,
 'enter': 52,
 'fade': 53,
 'sip': 54,
 'solace': 55,
 'reminder': 56,
 'perfect': 57,
 'pleasure': 58,
 'sacred': 59,
 'contribute': 60,
 'importance': 61,
 'warmth': 62,
 'restorative': 63,
 'mind': 64,
 'spirit': 65,
 'energy': 66,
 'hold'

In [11]:
idx_2_word

{0: 'freshly',
 1: 'morning',
 2: 'steam',
 3: 'leave',
 4: 'calm',
 5: 'soothe',
 6: 'sense',
 7: 'infusion',
 8: 'awaken',
 9: 'amidst',
 10: 'relish',
 11: 'spread',
 12: 'sleep',
 13: 'wellbeing',
 14: 'start',
 15: 'brisk',
 16: 'crisp',
 17: 'tea',
 18: 'find',
 19: 'world',
 20: 'whether',
 21: 'come',
 22: 'ahead',
 23: 'delicate',
 24: 'ritual',
 25: 'rest',
 26: 'embrace',
 27: 'tranquility',
 28: 'good',
 29: 'porcelain',
 30: 'jog',
 31: 'surge',
 32: 'day',
 33: 'provide',
 34: 'roast',
 35: 'create',
 36: 'away',
 37: 'set',
 38: 'quiet',
 39: 'air',
 40: 'slumber',
 41: 'teacup',
 42: 'replenish',
 43: 'chaos',
 44: 'espresso',
 45: 'depth',
 46: 'space',
 47: 'simple',
 48: 'dream',
 49: 'challenge',
 50: 'vein',
 51: 'harmonious',
 52: 'enter',
 53: 'fade',
 54: 'sip',
 55: 'solace',
 56: 'reminder',
 57: 'perfect',
 58: 'pleasure',
 59: 'sacred',
 60: 'contribute',
 61: 'importance',
 62: 'warmth',
 63: 'restorative',
 64: 'mind',
 65: 'spirit',
 66: 'energy',
 67: 'h

In [12]:
# replace words with indices in documents

for i in range(len(preprocessed_documents)):
    for j in range(len(preprocessed_documents[i])):
       token = preprocessed_documents[i][j] 
       index = word_2_idx[token]
       preprocessed_documents[i][j] = index 
preprocessed_documents

[[116,
  0,
  74,
  71,
  69,
  6,
  52,
  105,
  84,
  15,
  1,
  54,
  82,
  90,
  44,
  103,
  31,
  66,
  91,
  50,
  8,
  64,
  65,
  2,
  113,
  110,
  76,
  107,
  93,
  81,
  34,
  111,
  10,
  47,
  58,
  57,
  117,
  71],
 [116,
  0,
  102,
  17,
  3,
  104,
  39,
  35,
  6,
  27,
  77,
  17,
  100,
  23,
  54,
  86,
  87,
  17,
  103,
  5,
  62,
  11,
  109,
  4,
  6,
  67,
  29,
  41,
  80,
  72,
  79,
  7,
  93,
  26,
  101,
  70,
  118,
  57,
  117,
  17],
 [112,
  78,
  19,
  53,
  36,
  95,
  48,
  64,
  18,
  55,
  26,
  12,
  45,
  40,
  109,
  52,
  97,
  63,
  25,
  42,
  66,
  49,
  21,
  32,
  101,
  89,
  38,
  112,
  115,
  56,
  68,
  61,
  28,
  112,
  12,
  75,
  13],
 [1,
  106,
  37,
  96,
  32,
  118,
  92,
  14,
  88,
  98,
  83,
  108,
  116,
  0,
  74,
  71,
  4,
  24,
  94,
  114,
  60,
  51,
  85,
  20,
  15,
  30,
  16,
  1,
  39,
  70,
  73,
  9,
  43,
  1,
  106,
  33,
  59,
  46,
  99,
  64,
  109,
  65,
  32,
  22]]

In [32]:
# specify cluster number:
K = 2

In [110]:
# go through every word in documents and assign a random topic
def calculate_complete_topic_mapping(preprocessed_documents):
    complete_topic_mapping = []
    for i in range(len(preprocessed_documents)):
        topic_mapping = []
        for j in range(len(preprocessed_documents[i])):
            word_index = preprocessed_documents[i][j]
            topic_index = random.randint(0, K-1)
            topic_mapping.append(topic_index)
        complete_topic_mapping.append(topic_mapping)
    return complete_topic_mapping

In [111]:
complete_topic_mapping = calculate_complete_topic_mapping(preprocessed_documents)

In [34]:
# create document-topic matrix: Number of words assigned to each topic for each document 
document_topic = np.zeros((len(preprocessed_documents), K))

# create topic-word matrix: Count of each word being assigned to each topic
topic_word = np.zeros((K, len(vocab)))

In [35]:
document_topic.shape

(4, 2)

In [36]:
topic_word.shape

(2, 119)

In [97]:
# populate the topic-word matrix
def caclulate_topic_word_matrix(preprocessed_documents, complete_topic_mapping):
    for i in range(len(preprocessed_documents)):
        for j in range(len(preprocessed_documents[i])):
            word_index = preprocessed_documents[i][j] 
            topic_index = complete_topic_mapping[i][j]
            # topic-word matrix
            topic_word[topic_index][word_index] += 1
    return topic_word

In [99]:
topic_word = caclulate_topic_word_matrix(preprocessed_documents, complete_topic_mapping)

In [100]:
# populate document-topic matrix
def calculate_document_topic_matrix(preprocessed_documents, complete_topic_mapping):
    for i in range(len(preprocessed_documents)):
        for j in range(len(preprocessed_documents[i])):
            word_index = preprocessed_documents[i][j]
            topic_index = complete_topic_mapping[i][j]
            document_topic[i][topic_index] += 1
    return document_topic

In [101]:
document_topic = calculate_document_topic_matrix(preprocessed_documents, complete_topic_mapping)

### Gibbs Sampling

For each document d, go through each word w (a double for loop). Reassign a new topic to w, where we choose topic t with the probability of word w given topic t × probability of topic t given document d, denoted by the following mathematical notation*



In [112]:
document_topic.shape

(4, 2)

In [113]:
topic_word.shape

(2, 119)

In [114]:
topic_word

array([[2., 4., 2., 2., 4., 2., 2., 2., 2., 0., 2., 2., 4., 0., 0., 0.,
        2., 8., 0., 2., 0., 2., 0., 2., 0., 2., 2., 0., 0., 2., 2., 2.,
        4., 2., 0., 2., 2., 2., 0., 2., 0., 0., 0., 2., 2., 2., 0., 0.,
        0., 0., 0., 0., 4., 2., 2., 2., 2., 4., 2., 2., 2., 0., 2., 2.,
        4., 2., 0., 2., 2., 0., 0., 2., 0., 0., 2., 0., 2., 2., 2., 0.,
        0., 0., 0., 2., 2., 2., 2., 0., 0., 0., 0., 2., 2., 2., 2., 2.,
        0., 0., 0., 0., 2., 4., 0., 2., 2., 2., 2., 2., 0., 4., 0., 2.,
        4., 0., 2., 0., 4., 2., 2.],
       [4., 4., 0., 0., 0., 0., 4., 0., 0., 2., 0., 0., 0., 2., 2., 4.,
        0., 0., 2., 0., 2., 0., 2., 0., 2., 0., 2., 2., 2., 0., 0., 0.,
        2., 0., 2., 0., 0., 0., 2., 2., 2., 2., 2., 0., 0., 0., 2., 2.,
        2., 2., 2., 2., 0., 0., 2., 0., 0., 0., 0., 0., 0., 2., 0., 0.,
        2., 2., 4., 0., 0., 2., 4., 4., 2., 2., 2., 2., 0., 0., 0., 2.,
        2., 2., 2., 0., 0., 0., 0., 2., 2., 2., 2., 0., 0., 2., 0., 0.,
        2., 2., 2., 2., 0.,

In [115]:
document_topic

array([[36., 40.],
       [52., 28.],
       [40., 34.],
       [48., 40.]])

In [116]:
alpha = 0.5
beta = 0.5

In [117]:
sum(document_topic[0,:])

76.0

In [118]:
topic_word[0]

array([2., 4., 2., 2., 4., 2., 2., 2., 2., 0., 2., 2., 4., 0., 0., 0., 2.,
       8., 0., 2., 0., 2., 0., 2., 0., 2., 2., 0., 0., 2., 2., 2., 4., 2.,
       0., 2., 2., 2., 0., 2., 0., 0., 0., 2., 2., 2., 0., 0., 0., 0., 0.,
       0., 4., 2., 2., 2., 2., 4., 2., 2., 2., 0., 2., 2., 4., 2., 0., 2.,
       2., 0., 0., 2., 0., 0., 2., 0., 2., 2., 2., 0., 0., 0., 0., 2., 2.,
       2., 2., 0., 0., 0., 0., 2., 2., 2., 2., 2., 0., 0., 0., 0., 2., 4.,
       0., 2., 2., 2., 2., 2., 0., 4., 0., 2., 4., 0., 2., 0., 4., 2., 2.])

In [119]:
topic_word[:,0]

array([2., 4.])

In [124]:
#def gibbs_sampling():
n_iterations = 100

complete_topic_mapping = calculate_complete_topic_mapping(preprocessed_documents)
document_topic = calculate_document_topic_matrix(preprocessed_documents, complete_topic_mapping)
topic_word = caclulate_topic_word_matrix(preprocessed_documents, complete_topic_mapping)

for _ in range(n_iterations):
    for i in range(len(preprocessed_documents)):
        for j in range(len(preprocessed_documents[i])):
            word_index = preprocessed_documents[i][j]

            topic_doc_counts = [] 
            word_topic_counts = []
            for k in range(K):
                # how much is topic k in doc i
                topic_doc_count = document_topic[i][k] + alpha
                topic_doc_counts.append(topic_doc_count) 

                # how much is word_index in topic k?
                word_topic_count = topic_word[k][word_index] + beta
                word_topic_counts.append(word_topic_count)
            
            # sample new topic
            n = np.multiply(np.array(word_topic_counts), np.array(topic_doc_counts))
            n = n / sum(n)
            multinomial = np.random.multinomial(1, n)
            new_topic = np.argmax(multinomial)
            print(new_topic)
            complete_topic_mapping[i][j] = new_topic

            # re-calculate matrices
            document_topic = calculate_document_topic_matrix(preprocessed_documents, complete_topic_mapping)
            topic_word = caclulate_topic_word_matrix(preprocessed_documents, complete_topic_mapping)

    

1
1
0
0
1
0
0
0
1
1
1
0
0
1
1
1
0
0
1
0
0
1
1
1
1
0
1
0
0
1
0
1
0
1
1
1
1
1
1
1
1
0
0
0
1
0
0
0
1
1
0
0
1
1
1
0
1
1
0
0
1
1
0
0
0
1
1
1
1
0
0
0
1
0
1
0
0
0
1
1
1
0
1
1
0
1
0
0
1
1
0
1
1
0
1
1
0
1
0
0
1
1
1
1
1
1
1
1
0
1
0
0
1
1
0
1
0
1
0
1
0
0
1
0
0
0
0
1
1
0
0
1
0
1
0
0
1
0
0
1
0
1
1
0
0
0
0
1
1
0
0
1
1
1
1
1
1
0
1
1
1
0
0
1
1
1
0
1
1
0
1
0
1
1
1
0
0
1
1
0
1
1
0
1
1
1
1
0
1
0
0
0
1
1
1
1
0
1
1
1
1
0
0
1
0
1
0
1
1
0
0
1
1
1
0
1
1
0
0
1
1
0
1
0
1
0
1
1
0
0
0
0
1
0
1
1
1
1
1
1
0
1
1
0
1
1
1
0
1
0
1
0
0
1
1
0
1
1
0
0
0
0
1
1
1
1
1
0
1
1
1
0
1
0
1
1
1
0
0
1
0
0
0
0
0
0
1
0
0
1
0
1
1
0
0
1
0
0
1
0
1
1
0
0
1
0
0
1
1
0
1
1
1
1
1
1
0
0
1
1
1
0
1
0
0
0
1
1
1
1
0
1
1
1
0
0
1
0
0
1
0
1
1
1
1
1
0
1
0
0
0
0
1
0
1
0
1
1
1
0
0
0
0
0
1
0
1
1
0
0
1
1
1
0
1
0
0
0
1
0
0
1
0
1
1
1
1
0
0
0
1
0
0
1
1
1
1
1
1
1
1
1
0
1
0
0
1
1
0
1
0
1
0
0
0
1
1
0
1
1
0
1
1
1
1
1
0
1
1
1
0
1
0
1
0
1
0
0
1
0
1
0
0
0
0
1
1
0
1
1
0
1
0
0
1
0
0
1
0
1
1
0
0
1
0
1
0
0
1
1
1
1
1
1
1
1
0
1
0
1
0
1
1
1
0
1
1
1
0
0
1
1
1
0
0
1
1
0
1
1


In [125]:
complete_topic_mapping

[[1,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  0],
 [1,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  1],
 [1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0],
 [1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  1]]

In [126]:
document_topic

array([[163362., 453112.],
       [268664., 380256.],
       [ 68602., 531649.],
       [347023., 366789.]])

In [127]:
sum(document_topic)

array([ 847651., 1731806.])

In [128]:
document_topic / document_topic.sum(axis=1, keepdims=True)

array([[0.26499414, 0.73500586],
       [0.41401714, 0.58598286],
       [0.11428886, 0.88571114],
       [0.48615462, 0.51384538]])