In [None]:
# Common imports 
from ast import literal_eval

import gensim
import numpy as np
import pandas as pd

from categorical_em import CategoricalEM
import sys
# print(sys.version)

## 1. Hyperparameters


In [None]:
import os
# print(os.getcwd())

In [None]:
K = 5 # Number of mixture components (passed to CategoricalEM; re-assigned in the model-fitting cells)
I = 120 # Dictionary size: used as keep_n when filtering the dictionary and as the column count of X_matrix
N = None # Number of documents; placeholder until it is set to len(df_data) after the tweets are filtered

In [None]:
# Quick look at the raw CSV. Only the first rows are displayed so the cell output stays small;
# the frame is loaded again (and de-duplicated) in the preprocessing section.
df = pd.read_csv('tweets_cleaned.csv')
df.head()

## 2. Load and preprocess the data

First, we need to load the data from the csv. This file contains the documents already processed and cleaned after applying the following steps:

1. Tokenization
2. Homogenization, which includes:
    1. Removing capitalization.
    2. Removing non-alphanumeric tokens (e.g. punctuation marks)
    3. Stemming/Lemmatization.
3. Cleaning
4. Vectorization


We load it as a `pandas` dataframe.


In [None]:
# Load the cleaned tweets and keep one row per distinct tweet text. The index is renumbered
# afterwards so that label-based access (df.loc[i]) lines up with positions in X_tokens;
# without the reset, the labels of the dropped duplicates would be missing from the index.
df = (
    pd.read_csv('tweets_cleaned.csv')
    .drop_duplicates(subset="tweet")
    .reset_index(drop=True)
)

# The 'tokens' column holds the string representation of a list; parse it back into a list of tokens
df['tokens'] = df['tokens'].apply(literal_eval)

# One token list per remaining tweet, in the same order as the rows of df
X_tokens = df['tokens'].tolist()


In [None]:
# Show only the first few token lists; displaying the whole corpus floods the cell output
X_tokens[:5]

In [None]:
# Inspect one example document in its three forms (raw, cleaned, tokenized).
# Positional indexing is used on purpose: X_tokens is a plain list indexed by position, while
# df's index labels can have gaps after drop_duplicates, so df.loc[1, ...] could raise a
# KeyError or show a different row than X_tokens[1].
sample_pos = 1
sample_row = df.iloc[sample_pos]

print('Columns: {}\n'.format(' | '.join(df.columns.values)))

print('Tweet:\n{}'.format(sample_row['tweet']))
print('Tweet cleaned:\n{}'.format(sample_row['tweets_clean']))
print('Tweet tokens:\n{}'.format(X_tokens[sample_pos]))

### Create the dictionary

Up to this point, we have transformed the raw text collection into a list of documents stored in `X_tokens`, where each document is a collection 
of the words that are most relevant for semantic analysis. Now, we need to convert these data (a list of token lists) into 
a numerical representation (a list of vectors, or a matrix). To do so, we will start using the tools provided by the `gensim` library. 

As a first step, we create a dictionary containing all tokens in our text corpus, and assigning an integer identifier to each one of them.



In [None]:
# Assign an integer id to every distinct token found in the corpus
dictionary = gensim.corpora.Dictionary(X_tokens)

# Prune the vocabulary in place (parameter meanings as given in the gensim documentation)
dictionary.filter_extremes(
    no_below=15,   # drop tokens that occur in fewer than 15 documents
    no_above=0.5,  # drop tokens that occur in more than 50% of the documents
    keep_n=I,      # of the remaining tokens, keep at most the I most frequent ones
)


### Create Bag of Words (BoW): Numerical version of documents
In the second step, let us create a numerical version of our corpus using the `doc2bow` method. In general, 
`D.doc2bow(token_list)` transforms any list of tokens into a list of tuples `(token_id, n)`, one per each token in 
`token_list`, where `token_id` is the token identifier (according to dictionary `D`) and `n` is the number of occurrences 
of such token in `token_list`. 

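*Exercise:* Apply the `doc2bow` method of the gensim dictionary (called `dictionary` in this notebook) to the token list of every document in `X_tokens`. 
The result must be a new list named `X_bow` where each element is a list of tuples `(token_id, number_of_occurrences)`. In the code below, documents that contain at most one dictionary token are dropped from `X_bow`.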

In [None]:
# Bag-of-words for every tweet: a list of (token_id, count) pairs, one per dictionary token present
all_bows = [dictionary.doc2bow(tokens) for tokens in X_tokens]

# A tweet is kept only if it contains more than one distinct dictionary token
keep_tweet = [len(bow) > 1 for bow in all_bows]
X_bow = [bow for bow, keep in zip(all_bows, keep_tweet) if keep]

# Apply the same boolean mask to the dataframe so that its rows stay aligned with X_bow
df_data = df[keep_tweet]

# Number of documents that go into the model
N = len(df_data)

Finally, we transform the BoW representation `X_bow` into a matrix, namely `X_matrix`, in which the entry in the i-th row and j-th column is the 
number of occurrences of the j-th word of the dictionary in the i-th document. This will be the matrix used in the algorithm.

In [None]:
# Dense document-term count matrix: one row per kept tweet, one column per dictionary token id.
# Entries default to 0 for tokens that do not appear in a tweet.
X_matrix = np.zeros([N, I])
for i, doc_bow in enumerate(X_bow):
    # Each BoW entry is a (token_id, count) pair
    for token_id, count in doc_bow:
        X_matrix[i, token_id] = count


## 3. Categorical Mixture Model with Expectation Maximization

### Exercise 1: Analytical forms of the E and M steps for the EM-Algorithm
1. Write the log joint distribution: $\log p(\{\mathbf{x}_n, z_n\}| \Theta) = ?$
2. Write the analytical expression for $Q(\Theta, \Theta^{\text{old}}) = ?$
3. Write the MLE for $\Theta$


#### Exercise 1.1

\begin{align}
\log p(\{\mathbf{x}_n, z_n\}| \Theta) =  \cdots
\end{align}

In [None]:
def log_p():
    """Placeholder for Exercise 1.1 (the log joint distribution); not implemented yet.

    Takes no arguments and returns None until the exercise is filled in.
    """
    return None

#### Exercise 1.2

\begin{align}
Q(\Theta, \Theta^{\text{old}}) = \cdots
\end{align}

#### Exercise 1.3
\begin{align}
\hat{\pi}_k = \cdots
\end{align}

\begin{align}
\hat{\theta}_{km} = \cdots
\end{align}

### Exercise 2: Data analysis task
#### Exercise 2.1

In [None]:
# Exercise 2.1: fit the categorical mixture model on the document-term matrix.
K = 5  # Number of mixture components

# Dirichlet parameters from which the initial theta and pi are sampled
i_theta = 5
# NOTE(review): a Dirichlet concentration of 0 is normally not a valid parameter, and the later
# fitting cell uses i_pi = 5 — confirm how CategoricalEM treats a value of 0 here.
i_pi = 0

# NOTE(review): delta and epochs look like a convergence tolerance and an iteration cap —
# confirm against the CategoricalEM implementation.
init_params = {'theta': i_theta, 'pi': i_pi}
model = CategoricalEM(K, I, N, delta=0.01, epochs=200, init_params=init_params)
model.fit(X_matrix)

#### Exercise 2.2

In [None]:
# TODO: implement for Exercise 2.2
def AIC():
    """Placeholder for Exercise 2.2 (presumably the Akaike Information Criterion); not implemented yet.

    Takes no arguments and returns None until the exercise is filled in.
    """
    return None

In [None]:
# Visualization imports (plotting is only needed from this point of the notebook onwards)
%matplotlib inline
import probvis.aux as pva
import matplotlib.pyplot as plt
# NOTE(review): presumably configures matplotlib to render text with LaTeX, which would require
# a working LaTeX installation — confirm against the probvis package.
pva.activate_latex_format()

#### Exercise 2.3

Some useful packages:
- matplotlib https://matplotlib.org/
- seaborn https://github.com/mwaskom/seaborn
- wordcloud https://github.com/amueller/word_cloud
- probvis https://github.com/psanch21/prob-visualize



In [None]:
# NOTE(review): this rebinds CategoricalEM to the class from gmm_em_complete, replacing the one
# imported from categorical_em at the top of the notebook — confirm that this is intended.
from gmm_em_complete.categorical_em import CategoricalEM

K = 5  # for example
# Dirichlet parameters used to initialise theta and pi
i_theta = 5
i_pi = 5

init_params = {'theta': i_theta, 'pi': i_pi}
model = CategoricalEM(K, I, N, delta=0.01, epochs=200, init_params=init_params)
model.fit(X_matrix)

In [None]:
# Raw tweet texts of the documents that were kept for the model (same row order as df_data)
tweet_texts = df_data['tweet'].values
tweet_array = np.array(tweet_texts)

# NOTE(review): the previous comment on this call described a word cloud, which matches
# show_words_by_topic rather than this method. Judging by its name and arguments, this call
# presumably shows the most representative tweets for each topic — confirm against CategoricalEM.
model.show_title_by_topic(
    tweet_array,
    model.theta_matrix,
    model.r_matrix,
)

In [None]:
# Sanity check: tweet_array is built from df_data['tweet'], so it should hold one entry per kept tweet, i.e. (N,)
tweet_array.shape

In [None]:
# Shape of the fitted theta matrix — presumably (K, I), one word distribution per component; TODO confirm
model.theta_matrix.shape

In [None]:
# Shape of the fitted r matrix — presumably (N, K), one responsibility vector per document; TODO confirm
model.r_matrix.shape

In [None]:
# NOTE(review): the previous comment here said "most relevant documents", but this call only
# receives the dictionary; judging by its name it presumably shows the most representative
# words for each topic — confirm against CategoricalEM.
model.show_words_by_topic(dictionary)

In [None]:
# Show the evolution of Q over the epochs
# NOTE(review): the meaning of close=-1 and the role of the dictionary argument are not visible
# in this notebook — confirm against CategoricalEM.plot_result.
model.plot_result(dictionary, close=-1)
# Render the figure produced above
plt.show()