In [1]:
# Common imports 
from ast import literal_eval

import gensim
import numpy as np
import pandas as pd

from categorical_em import CategoricalEM
#from gmm_em.categorical_em import CategoricalEM
import sys

In [2]:
#!pip install gensim
#!pip install GMM

In [3]:
#import gmm_em

## 1. Hyperparameters


In [4]:
K = 5 # Number of mixture components
I = 120 # Number of words in the dictionary
N = None # Number of documents

## 2. Load and preprocess the data

First, we need to load the data from the csv. This file contains the documents already processed and cleaned after applying the following steps:

1. Tokenization
2. Homogeneization, which includes:
    1. Removing capitalization.
    2. Removing non alphanumeric tokens (e.g. punktuation signs)
    3. Stemming/Lemmatization.
3. Cleaning
4. Vectorization


We load it as a `pandas` dataframe.


In [6]:
df = pd.read_csv('../L2_code/tweets_cleaned.csv')
df.drop_duplicates(subset="tweet", inplace=True)

df['tokens'] = df['tokens'].apply(literal_eval) #Transform the string into a list of tokens
X_tokens = list(df['tokens'].values)


In [7]:
print('Columns: {}\n'.format(' | '.join(df.columns.values)))

print('Tweet:\n{}'.format(df.loc[1, 'tweet']))
print('Tweet cleaned:\n{}'.format(df.loc[1, 'tweets_clean']))
print('Tweet tokens:\n{}'.format(X_tokens[1]))

Columns: tweet_id | timestamp | user_id | tweet | tweets_clean | tokens

Tweet:
OSINT people - please retweet, if possible. My friend is looking for women involved in OSINT. https://twitter.com/manisha_bot/status/1181594280336531457 …
Tweet cleaned:
osint people   please retweet  if possible  my friend is looking for women involved in osint
Tweet tokens:
['osint', 'peopl', 'retweet', 'possibl', 'friend', 'look', 'woman', 'involv', 'osint']


### Create the dictionary

Up to this point, we have transformed the raw text collection in a list of documents stored in `X_tokens`, where each document is a collection 
of the words that are most relevant for semantic analysis. Now, we need to convert these data (a list of token lists) into 
a numerical representation (a list of vectors, or a matrix). To do so, we will start using the tools provided by the `gensim` library. 

As a first step, we create a dictionary containing all tokens in our text corpus, and assigning an integer identifier to each one of them.



In [7]:
dictionary = gensim.corpora.Dictionary(X_tokens)
print(dictionary)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=I)
print(dictionary)

Dictionary(12243 unique tokens: ['collin', 'cum', 'domin', 'phil', 'room']...)
Dictionary(120 unique tokens: ['look', 'peopl', 'woman', 'love', 'work']...)


### Create Bag of Words (BoW): Numerical version of documents
In the second step, let us create a numerical version of our corpus using the `doc2bow` method. In general, 
`D.doc2bow(token_list)` transforms any list of tokens into a list of tuples `(token_id, n)`, one per each token in 
`token_list`, where `token_id` is the token identifier (according to dictionary `D`) and `n` is the number of occurrences 
of such token in `token_list`. 

*Exercise:* Apply the `doc2bow` method from gensim dictionary `D`, to all tokens in every document in `X_tokens`. 
The result must be a new list named `X_bow` where each element is a list of tuples `(token_id, number_of_occurrences)`.

In [8]:
X_bow = list()
keep_tweet = list()
for tweet in X_tokens:
    tweet_bow = dictionary.doc2bow(tweet)
    if len(tweet_bow) > 1:
        X_bow.append(tweet_bow)
        keep_tweet.append(True)
    else:
        keep_tweet.append(False)

df_data = df[keep_tweet]

N = len(df_data)


Finally, we transform the BoW representation `X_bow` into a matrix, namely `X_matrix`, in which the i-th row and j-th column represents the 
number of occurrences of the j-th word of the dictionary in the i-th document. This will be the matrix used in the algorithm.

In [9]:
X_matrix = np.zeros([N, I])
for i, doc_bow in enumerate(X_bow):
    word_list = list()
    for word in doc_bow:
        X_matrix[i, word[0]] = word[1]

In [10]:
X_matrix

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 2., ..., 0., 0., 0.]])

## 3. Categorical Mixture Model with Expectation Maximization

$$\pi$$
$$\theta$$

In [11]:
pi_vector = 1 / K * np.ones([1, K])
theta_matrix =  np.random.dirichlet(1. * np.ones(I), size=K)  # [K, I]


#### Exercise 1.1

\begin{align}
p(\{\mathbf{x}_n\}| \Theta) =  \cdots
\end{align}

In [12]:
def log_p(X_matrix, pi_vector, theta_matrix):
    '''
    
    :param X_matrix: NxI
    :param z_matrix: NxK. One hot encoding
    :param pi_vector: 1xK
    :param theta_matrix: KxI
    :return: 
    '''
    log_pi_vector = np.log(pi_vector)
    log_theta_matrix = np.log(theta_matrix)
    prior_term = np.sum(log_pi_vector.T)
    lik_term_per_K =  X_matrix @ log_theta_matrix.T
    lik_term = np.sum( lik_term_per_K)
    return prior_term + lik_term

In [13]:
log_theta_matrix = np.log(theta_matrix)
print(X_matrix @ log_theta_matrix.T)

[[-15.60439025 -13.76016706 -15.0480915  -16.42057138 -14.76129789]
 [ -8.58873385  -9.57893044 -10.27025621  -9.54036862 -12.62719319]
 [-16.7486295  -18.5238452  -15.67700129 -14.70349439 -17.35594727]
 ...
 [-14.30684845  -9.48822629 -11.36747685  -9.20682184  -9.16761109]
 [-19.89238691 -16.18090747 -18.11231184 -17.4609332  -17.45881176]
 [-23.01575311 -18.56495455 -23.60519406 -22.62986005 -17.26680875]]


In [14]:
# or instead of using "@", try the following
log_theta_matrix = np.log(theta_matrix)
lik_term_per_K = [[sum(a*b for a,b in zip(X_row,Y_col)) for Y_col in zip(*log_theta_matrix.T)] for X_row in X_matrix]

for r in lik_term_per_K:
   print(r)

[-15.60439024683058, -13.760167058085273, -15.048091498041206, -16.420571383004248, -14.761297885104346]
[-8.588733846189331, -9.578930439162288, -10.27025621183218, -9.540368619173773, -12.627193187544659]
[-16.74862950280192, -18.523845198051, -15.677001290029061, -14.703494388617658, -17.355947271326603]
[-9.446463555072054, -12.460031012622228, -12.862401870424668, -17.746770854623197, -13.550246037019107]
[-22.07749224692359, -16.844009274065687, -20.107991004838652, -23.762692010752176, -22.622958645512078]
[-8.9206609017508, -13.28508157151334, -8.124521059813231, -9.067639526701603, -11.898293543294319]
[-22.849959857132276, -20.038477666921537, -16.928994868727496, -12.561248325009228, -20.72012421102371]
[-37.06033747430885, -32.731727505728855, -28.53722635936238, -29.619684813832265, -30.703710082071076]
[-11.188421779307479, -10.173257302975554, -10.68372652808533, -8.135879165330717, -11.633723085264688]
[-10.639204590400844, -8.072298718768563, -15.760550691017226, -13.9

[-23.92262898065619, -25.92413662308017, -29.714167816905167, -27.42042670380782, -28.295923683926475]
[-27.268247847817914, -26.175758177969804, -30.155933789009758, -33.788056949663016, -22.24230061916864]
[-8.140590949693788, -13.216089109214439, -9.597367581955906, -10.743200998558532, -9.471181733255694]
[-31.967386879908663, -30.346531957628052, -26.47599297854916, -26.84787860846381, -30.858884610644008]
[-10.163678270069159, -11.221778625067493, -12.056671791048508, -11.974862050602708, -9.900482293050286]
[-32.42068800808446, -38.44331829956011, -33.33231946352862, -39.00294341878185, -29.79279443901776]
[-18.34762314241871, -16.89448972256827, -19.51078927202807, -21.01169607393247, -14.909364876052965]
[-18.763931447162367, -27.695300218739973, -23.14757655092596, -22.42170541849205, -19.074996035421037]
[-15.10670457585707, -16.015200707603874, -20.531974281380805, -21.12106963266389, -16.624037175244126]
[-44.74323312073489, -43.4314983318573, -43.982215887089986, -47.4723

[-37.09142946271649, -33.20399670316868, -35.93467524185607, -40.69676934669733, -39.107168875201324]
[-17.475129821125055, -20.239347586201024, -19.56325445081505, -19.27807409225472, -21.556527222800824]
[-10.63812176518168, -10.494193464107825, -10.675167934111865, -8.921453235174134, -10.765044879048332]
[-11.895450908706929, -13.921016427136992, -10.778223608730531, -11.082389935793834, -11.113354512163081]
[-13.576498600013316, -10.26128971343108, -11.958820026100993, -10.1865489666124, -11.974862463751982]
[-15.157205062277454, -13.308438814593696, -16.93289068973167, -13.749731291284986, -17.13816182476197]
[-22.052274271425276, -19.645402615766628, -19.09164301378127, -25.132478594481, -23.031235342122876]
[-22.391029035176913, -24.791207188540813, -21.18085687340536, -34.7540393231363, -25.086748690837386]
[-9.720179099312098, -10.32568822995961, -12.4816775536591, -11.034423913760731, -10.050387349101662]
[-8.98345894510656, -11.346566935623098, -10.149035645182297, -12.7154

[-17.798927884664376, -19.404551247062034, -19.8666414240576, -19.975345922704157, -22.66566891652665]
[-19.788119041818703, -21.48433951176247, -20.854511797863488, -20.75313930903165, -19.923076233846505]
[-25.857160463399683, -23.020972499179496, -24.342683547795502, -24.534453784610715, -18.89933929637843]
[-21.605650438965583, -22.424758783043227, -20.108939628158943, -22.144441407130117, -21.364439771406737]
[-17.355871689396718, -12.515753205989434, -18.218874864210996, -17.220124384662075, -13.343114628857448]
[-32.17810919661882, -25.85258573585315, -25.449138232103465, -28.54041869972852, -28.832716704622193]
[-31.5676194382729, -29.021154330019804, -32.31418083214625, -32.836901859623296, -34.349574953521994]
[-13.034480535607022, -17.471740835272062, -14.942697119225759, -16.163484815605226, -16.508137340140287]
[-24.47844602667555, -22.602305393323796, -23.423444211952884, -21.2250462991551, -21.994947980257095]
[-33.27920279304366, -28.995379050365116, -28.512894221763844

[-23.311407383422708, -25.550971695118943, -29.17929846838289, -23.40377133638462, -26.412368060404095]
[-19.37446639087078, -26.795930408707925, -26.602626398273713, -29.52429133211178, -20.823882728584103]
[-35.411139490874916, -41.31332457173623, -42.89536463205948, -44.983259592877474, -39.90181986453346]
[-16.308030524040333, -16.658141697175694, -20.163406441508002, -14.840652217618398, -15.977409802338933]
[-23.088421699517383, -31.9310865607095, -27.992563873184487, -24.09103901342272, -32.598856174619264]
[-21.663914589602143, -21.357711423833116, -17.608200606334407, -15.407734463707019, -17.744522699323575]
[-24.762832714087715, -29.457475930787915, -20.876122636386143, -24.58100493712754, -27.29098371803575]
[-20.710594307873407, -21.695533045376173, -26.108639964727647, -20.734651720787053, -28.909493449065856]
[-22.04058656198333, -22.264148179177653, -18.95390817974874, -18.862020843385476, -22.922629078838284]
[-16.20670469917333, -19.393273448549778, -21.20662122373956

In [15]:

print(f"Data log-likelihood: {log_p(X_matrix, pi_vector, theta_matrix)}")

Data log-likelihood: -600211.8576169236


#### Exercise 1.2

\begin{align}
Q(\Theta, \Theta^{\text{old}}) = \cdots
\end{align}

In [16]:
def compute_lset(ns):
    max_ = np.max(ns)
    ds = ns - max_
    sumOfExp = np.exp(ds).sum()
    return max_ + np.log(sumOfExp)
def rik_matrix(X, pi_vector, theta_matrix):
    N, I = X.shape
    K, _ = theta_matrix.shape

    log_theta_matrix = np.log(theta_matrix)
    log_pi_vector = np.log(pi_vector)

    last_term = np.zeros([N, 1])
    last_term2 = X @ log_theta_matrix.T
    ns = last_term2 + np.tile(log_pi_vector, [N, 1])  # [N, K]
    for i in range(N):
        last_term[i] = compute_lset(ns[i, :])

    last_term = np.tile(last_term, [1, K])
    log_r_matrix = np.tile(log_pi_vector, [N, 1]) + last_term2 - last_term
    r_matrix = np.exp(log_r_matrix)

    return np.clip(r_matrix, 1e-250, None), last_term2
def Q_function(X, pi_vector, theta_matrix):
    log_pi_vector = np.log(pi_vector)

    r_matrix, last_term = rik_matrix(X, pi_vector, theta_matrix)  # [N, K]

    r_matrix = r_matrix / np.tile(np.sum(r_matrix, 1, keepdims=True), [1, r_matrix.shape[1]])
    log_pi_matrix = np.tile(log_pi_vector, [N, 1])
    term1 = np.multiply(r_matrix, log_pi_matrix)
    term1 = np.sum(term1)

    Q = term1 + np.sum(np.multiply(r_matrix, last_term))
    return Q

In [17]:
print(f"Q: {Q_function(X_matrix, pi_vector, theta_matrix)}")

Q: -115549.22295629328


#### Exercise 1.3
\begin{align}
\hat{\pi}_k = \cdots
\end{align}

\begin{align}
\hat{\theta}_{km} = \cdots
\end{align}

In [18]:
def pi_vector_hat(X, pi_vector, theta_matrix):
    r_matrix, _ = rik_matrix(X, pi_vector, theta_matrix)  # [N, K]
    return np.mean(r_matrix, 0)

def theta_matrix_hat(X, pi_vector, theta_matrix):
    N, I = X.shape
    r_matrix, _ = rik_matrix(X, pi_vector, theta_matrix)  # [N, K]
    num = r_matrix.T @ X  # [K, I]
    den = np.tile(np.sum(num, 1, keepdims=True), [1, I])
    return np.divide(num, den)

In [19]:
print(f"pi_vector_hat: {np.sum(pi_vector_hat(X_matrix, pi_vector, theta_matrix))}")
print(f"theta_matrix_hat: {np.sum(theta_matrix_hat(X_matrix, pi_vector, theta_matrix), axis=-1)}")

pi_vector_hat: 0.9999999999999981
theta_matrix_hat: [1. 1. 1. 1. 1.]


### Exercise 2: Data anlysis task
#### Exercise 2.1

In [20]:
K = 5 # Number of mixture components


for i_theta in [2,5]:  
    for i_pi in [0, 5]:
         model = CategoricalEM(K, I, N, delta=0.01, epochs=200, init_params={'theta': i_theta, 'pi': i_pi})
         model.fit(X_matrix)
         print('-------------------------------')

ITER: 0 Q= -113014.3342 diff= 200
ITER: 5 Q= -105846.3543 diff= 771.7595
ITER: 10 Q= -103343.8562 diff= 213.5411
ITER: 15 Q= -103012.7655 diff= 27.46
ITER: 20 Q= -102938.8255 diff= 12.2731
ITER: 25 Q= -102846.1079 diff= 18.6653
ITER: 30 Q= -102796.1873 diff= 6.9143
ITER: 35 Q= -102759.5064 diff= 7.7912
ITER: 40 Q= -102730.1782 diff= 4.2513
ITER: 45 Q= -102712.8784 diff= 3.1218
ITER: 50 Q= -102699.7626 diff= 2.3824
ITER: 55 Q= -102688.3702 diff= 2.2545
ITER: 60 Q= -102677.2424 diff= 2.1798
ITER: 65 Q= -102667.2325 diff= 1.8567
ITER: 70 Q= -102659.115 diff= 1.4754
ITER: 75 Q= -102652.6887 diff= 1.1679
ITER: 80 Q= -102647.6474 diff= 0.9041
ITER: 85 Q= -102643.9197 diff= 0.6374
ITER: 90 Q= -102641.5723 diff= 0.358
ITER: 95 Q= -102640.5195 diff= 0.1248
ITER: 100 Q= -102640.2517 diff= 0.0284
ITER: 105 Q= -102640.0155 diff= 0.0718
ITER: 110 Q= -102639.4517 diff= 0.1346
ITER: 115 Q= -102638.7401 diff= 0.1394
ITER: 120 Q= -102638.1516 diff= 0.0993
ITER: 125 Q= -102637.8286 diff= 0.04
ITER: 128 

#### Exercise 2.2

In [21]:
i_theta = 5
i_pi = 5

AIC_list = list()
for K in [2,3, 5, 10]:  
    model = CategoricalEM(K, I, N, delta=0.01, epochs=200, init_params={'theta': i_theta, 'pi': i_pi})
    model.fit(X_matrix)
    print('-------------------------------')

ITER: 0 Q= -110037.9298 diff= 200
ITER: 5 Q= -104571.1062 diff= 233.7021
ITER: 10 Q= -103997.3407 diff= 51.6127
ITER: 15 Q= -103842.7923 diff= 29.5681
ITER: 20 Q= -103711.7704 diff= 22.7654
ITER: 25 Q= -103623.196 diff= 15.3764
ITER: 30 Q= -103553.7406 diff= 13.0723
ITER: 35 Q= -103491.5457 diff= 12.3009
ITER: 40 Q= -103428.1225 diff= 13.0248
ITER: 45 Q= -103360.2916 diff= 14.1361
ITER: 50 Q= -103264.5769 diff= 23.3501
ITER: 55 Q= -103120.9015 diff= 32.2725
ITER: 60 Q= -102928.0238 diff= 43.5285
ITER: 65 Q= -102671.2807 diff= 55.4105
ITER: 70 Q= -102394.792 diff= 53.3325
ITER: 75 Q= -102168.5902 diff= 37.4243
ITER: 80 Q= -102022.9688 diff= 24.7727
ITER: 85 Q= -101925.7268 diff= 16.4025
ITER: 90 Q= -101853.4194 diff= 14.3006
ITER: 95 Q= -101794.4457 diff= 10.1353
ITER: 100 Q= -101749.8885 diff= 7.7184
ITER: 105 Q= -101715.8849 diff= 6.5596
ITER: 110 Q= -101687.6794 diff= 4.985
ITER: 115 Q= -101662.2714 diff= 5.2292
ITER: 120 Q= -101623.3542 diff= 10.3955
ITER: 125 Q= -101567.5964 diff= 

#### Exercise 2.3

Some useful packages:
- matplotlib https://matplotlib.org/
- seaborn https://github.com/mwaskom/seaborn
- wordcloud https://github.com/amueller/word_cloud
- probvis https://github.com/psanch21/prob-visualize



In [22]:
# Visualization imports 
%matplotlib inline
import probvis.aux as pva
import matplotlib.pyplot as plt
pva.activate_latex_format()

In [24]:
#from gmm_em_complete.categorical_em import CategoricalEM
K = 5 # for example
i_theta = 5
i_pi = 5
model = CategoricalEM(K, I, N, delta=0.01, epochs=200, init_params={'theta': i_theta, 'pi': i_pi})
model.fit(X_matrix)


ITER: 0 Q= -113826.1003 diff= 200
ITER: 5 Q= -106216.6365 diff= 713.0888
ITER: 10 Q= -104030.8666 diff= 309.2636
ITER: 15 Q= -102847.7257 diff= 191.7261
ITER: 20 Q= -102353.6564 diff= 49.5822
ITER: 25 Q= -102244.0073 diff= 16.831
ITER: 30 Q= -102190.2984 diff= 7.2975
ITER: 35 Q= -102168.9743 diff= 2.6291
ITER: 40 Q= -102163.9561 diff= 0.5948
ITER: 44 Q= -102161.8084 diff= -0.0238


In [26]:
tweet_array = np.array(df_data['tweet'].values)

#model.show_title_by_topic(tweet_array, model.theta_matrix, model.r_matrix)

In [None]:
#model.show_words_by_topic(dictionary)

In [27]:
#model.plot_result(dictionary, close=-1)
#plt.show()