In [None]:
import pandas as pd
import fasttext
from sklearn import metrics, cluster, decomposition, manifold
import numpy as np
import string
import matplotlib.pyplot as plt
import ipywidgets as w
from collections import Counter
from pathlib import Path
import umap
import hdbscan

from IPython.display import clear_output
import plotly

from utils.preprocessing import preprocess_func

In [None]:
# Preprocessing configuration. `lemmatize` is forwarded to preprocess_func and
# is also written into the name of the CSV exported in the last cell.
lemmatize = True
# Callable that is later applied to every value of df['answer'].
preprocess = preprocess_func(lemmatize=lemmatize)

In [None]:
# Print the file name of every *.bin file found directly under models/
models_dir = Path('models')
for model_path in models_dir.glob('*.bin'):
    print(model_path.name)

In [None]:
# File name (inside models/) of the fastText model to load.
# Alternative choice, currently disabled:
# model_name = 'cc.fi.300.bin'
model_name = 'fasttext-lr=0.05,dim=30,ws=5,epoch=5,minn=4,maxn=6,neg=10,loss=ns,bucket=2000000,lrUpdateRate=100,t=0.0001,lemmatize=True.bin'

In [None]:
# Load the selected model from models/; `ft` is the model that
# get_normalized_ft_vec queries for word vectors.
ft = fasttext.load_model(f'models/{model_name}')

In [None]:
def get_normalized_ft_vec(word, model=None):
    """Return the embedding of ``word`` scaled to unit L2 norm.

    Parameters
    ----------
    word : str
        Token to look up.
    model : object, optional
        Any object exposing ``get_word_vector(word)``. When omitted, the
        notebook-level ``ft`` model is used.

    Returns
    -------
    numpy.ndarray
        The word vector divided by its Euclidean norm. A vector whose norm is
        exactly zero is returned unchanged, so callers can detect it as an
        all-zero row and drop it.
    """
    if model is None:
        model = ft

    v = model.get_word_vector(word)

    # Decide on the norm, not on the sum of the components: a non-zero vector
    # whose components cancel out (e.g. [1, -1]) still has to be normalised.
    norm = np.sqrt(np.sum(v**2))
    if norm == 0:
        return v

    return v / norm

In [None]:
# Load the processed data set, using the first CSV column as the index.
# The 'answer' column of this frame is what the following cells work on.
df = pd.read_csv('data/processed/ensisijainen.csv', index_col=0)
# Preview the first rows.
df.head()

In [None]:
# Run the preprocessing callable over every answer, overwriting the column.
# NOTE(review): re-running this cell feeds already-preprocessed text back into
# `preprocess` — confirm that is harmless or reload df first.
df['answer'] = df['answer'].apply(preprocess)

In [None]:
# Split each preprocessed answer on whitespace into a list of tokens.
# NOTE(review): presumably no answer is missing; a NaN answer would not become
# a list here and would break the token loops below — TODO confirm.
df['tokens'] = df['answer'].str.split()

In [None]:
# Flatten the per-answer token lists into one list of word occurrences
all_words_list = [word for answer_tokens in df['tokens'] for word in answer_tokens]

# Frequency of every word across all answers
c = Counter(all_words_list)

# Sorted vocabulary (the Counter's keys are exactly the distinct words)
all_words = sorted(c)

# Vocabulary size
len(all_words)

In [None]:
# The 20 most frequent words together with their counts
c.most_common(20)

In [None]:
# One unit-length embedding per vocabulary word; rows are indexed by the word.
word_vectors = [get_normalized_ft_vec(word) for word in all_words]
V = pd.DataFrame(data=word_vectors, index=all_words)

# (number of words, embedding dimension)
V.shape

In [None]:
# Some words have no embedding: their row in V consists of zeros only.
is_zero_row = (V == 0).all(axis=1)
invalid_words = V.index[is_zero_row]

# Remove those rows from the embedding matrix ...
V = V.drop(index=invalid_words)

# ... strip the same words from every answer's token list ...
df['tokens'] = df['tokens'].apply(
    lambda toks: [tok for tok in toks if tok not in invalid_words]
)

# ... and from the vocabulary, so all_words stays aligned with the rows of V.
all_words = [word for word in all_words if word not in invalid_words]

In [None]:
# Show the words that were dropped because their vector was all zeros.
invalid_words

In [None]:
# Sanity check: every remaining row of V must have a squared L2 norm of 1.
squared_norms = (V**2).sum(axis=1)
assert np.allclose(squared_norms, 1)

In [None]:
# n_clusters = 100

# clustering = cluster.KMeans(n_clusters)

# labels = clustering.fit_predict(V)

In [None]:
# First reduction step: compress the word vectors to 5 dimensions, which is
# the space the clustering runs in.
# UMAP is stochastic; a fixed random_state makes the embedding (and therefore
# the clusters derived from it) reproducible across kernel restarts.
reducer1 = umap.UMAP(n_neighbors=50, n_components=5, random_state=42)

embeddings1 = reducer1.fit_transform(V)


In [None]:
# Cluster the 5-dimensional UMAP embeddings with HDBSCAN; min_cluster_size=5
# is the smallest group of words that may form a cluster.
clustering = hdbscan.HDBSCAN(min_cluster_size=5)

# One integer label per row of embeddings1. The label -1 is filtered out as
# "not in any cluster" by the plotting cell further down.
labels = clustering.fit_predict(embeddings1)

In [None]:
# Count the real clusters only. HDBSCAN gives the label -1 to points it leaves
# unclustered; counting that label would make range(n_clusters) run one id
# past the last existing cluster.
label_values = pd.Series(labels).unique()
n_clusters = int((label_values != -1).sum())

In [None]:
# reducer = manifold.TSNE(n_components=2)
# embeddings = reducer.fit_transform(V)

In [None]:
# Second reduction step, for plotting only: project the 5-D embeddings to 2-D
# (UMAP's default n_components).
# A fixed random_state keeps the layout of the scatter plot reproducible.
reducer = umap.UMAP(n_neighbors=50, random_state=42)

embeddings = reducer.fit_transform(embeddings1)


In [None]:
# Shape of the plotting embeddings: (number of words, number of UMAP components)
embeddings.shape

In [None]:
# Keep only the points that were assigned to a cluster (label -1 = noise).
# A boolean mask replaces the per-row Python loop + np.stack, which raised
# when every point was noise.
labels_arr = np.asarray(labels)
is_clustered = labels_arr != -1
labels_ = labels_arr[is_clustered]
embeddings_ = np.asarray(embeddings)[is_clustered]

# Draw on the Axes that was just created and label the figure so it can be
# read on its own; points are coloured by cluster label.
fig, ax = plt.subplots(figsize=(16, 12))
ax.scatter(embeddings_[:, 0], embeddings_[:, 1], marker='.', c=labels_)
ax.set(title='Word clusters (noise points omitted)', xlabel='UMAP 1', ylabel='UMAP 2');

In [None]:
# Number of words per cluster label, most populated first.
# The row for -1 (if present) counts the words left without a cluster.
pd.Series(labels).value_counts()

In [None]:
# sorted([(w, c[w]) for w in V[labels == -1].index], key=lambda t: t[1], reverse=True)

In [None]:
# fig, ax = plt.subplots(figsize=(16, 12))
# plt.scatter(*zip(*tsne_embeddings), marker='.', c=labels)

In [None]:
class KwargContainer:
    """Plain attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        # Copy every keyword onto the instance under the same name.
        for name in kwargs:
            setattr(self, name, kwargs[name])

In [None]:
# One row per vocabulary word: its corpus frequency and its cluster label.
dd = pd.DataFrame({
        'word': all_words,
        'count': [c[word] for word in all_words],
        'cluster': labels})

# Build the options from the labels themselves (leaving out the noise label -1)
# so that every dropdown entry corresponds to a cluster that really exists.
cluster_ids = sorted(int(cid) for cid in dd['cluster'].unique() if cid != -1)
dropdown = w.Dropdown(options=cluster_ids)

output = w.Output()

def handle_change(change):
    """Show the 20 most frequent words of the selected cluster.

    ``change`` only needs a ``new`` attribute holding the selected cluster id,
    which is what the widget passes when the dropdown value changes.
    """
    selected_cluster = change.new

    with output:
        # Replace the previously shown table instead of appending to it.
        clear_output()

        sel = (
            dd[dd['cluster'] == selected_cluster]
            .set_index('word')
            .sort_values('count', ascending=False)
            .head(20)
        )

        display(sel)

dropdown.observe(handle_change, names='value')

# Render the table for the initially selected cluster by calling the handler
# once with a stand-in change object.
handle_change(KwargContainer(new=dropdown.value))

w.VBox([dropdown, output])

In [None]:
# Model file name without its extension (everything before the last '.');
# the same expression is used to name the exported CSV in the next cell.
model_name.rsplit('.', 1)[0]

In [None]:
# Export the word/cluster table; the file name records the model and the
# lemmatization setting that produced it.
model_stem = model_name.rsplit('.', 1)[0]
out_path = Path(f"data/interim/word_clusters_{model_stem}-lemmatized={lemmatize}.csv")

# Create the output directory if needed, so the export does not fail at the
# very end of a long run on a fresh checkout.
out_path.parent.mkdir(parents=True, exist_ok=True)
dd.to_csv(out_path)