In [None]:
from collections import Counter
import string

from ipywidgets import interact

import numpy as np
import hdbscan
import umap
import scipy
import numpy as np
import pandas as pd
import spacy_fi_experimental_web_md
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer

from utils.preprocessing import preprocess_func
import utils.preprocessing as preproc

In [None]:
# Domain-specific stop words (common abbreviations plus frequent but
# uninformative lemmas) that are added on top of the generic Finnish lists.
_stop = ['jne.', 'em.', 'esim.', 'tms.', 'mm.', 'yms.', 'redacted', 'pitää', 'http', 'voida', 'haluta', 'syventää', 'esimerkki', 'taito', 'kiinnostaa', 'mennä', 'meno', 'estää', 'kehittää', 'kehittäminen', 'erityisesti', 'onneksi', 'tämä', 'näkyä', 'käyttö', 'osata', 'kehittää', 'työ', 'taito', 'kehittyä', 'oppia', 'liittyvä', 'osaaminen', 'käyttö', 'lisätä', 'haluta']

# Also accept every abbreviation without its trailing period. The additions
# are computed up front so the list is never mutated while it is iterated.
_stop += [w[:-1] for w in _stop if w.endswith('.')]

# Read the project's extra stop-word file inside a context manager so the
# file handle is closed deterministically.
with open('data/external/stopwords.txt') as stopword_file:
    _external_stop = stopword_file.read().splitlines()

# Final stop-word set: NLTK Finnish list + external file + custom entries.
STOP = set(stopwords.words('finnish') + _external_stop + _stop)

In [None]:
# Build a preprocessing callable with lemmatization turned off.
# NOTE(review): `preprocess` is not referenced by any other visible cell.
preprocess = preprocess_func(lemmatize=False)

In [None]:
# Load the spaCy pipeline shipped in the `spacy_fi_experimental_web_md` package.
# It is used below for sentence splitting (`nlp(text).sents`) and its tokenizer
# is extended with custom special cases.
nlp = spacy_fi_experimental_web_md.load()

In [None]:
# Abbreviations that the tokenizer must keep as single tokens, one per line in
# the form "<abbreviation> - <expansion>". The listing is parsed by
# `get_custom_token_exceptions`.
custom_tokenizer_exception_s = """
esim. - esimerkiksi
Esim. - Esimerkiksi
ym. - ynnä muuta
tms. - tai muuta sellaista
jne. - ja niin edelleen
kts. - katso
"""

In [None]:
def get_custom_token_exceptions(s):
    """Build tokenizer special cases from an abbreviation listing.

    Parameters
    ----------
    s : str
        Text with one entry per line in the form
        ``<abbreviation> - <expansion>``. Only the part before the first
        ``-`` is used; blank and whitespace-only lines are skipped.

    Returns
    -------
    list of tuple
        ``(abbreviation, [{'ORTH': abbreviation}])`` pairs, in input order,
        ready to be passed to ``nlp.tokenizer.add_special_case``.
    """
    custom_token_exceptions = []

    # Parse the argument itself (not the module-level listing) so the
    # function works for any input it is given.
    for line in s.split('\n'):
        abbreviation = line.partition('-')[0].strip()
        if not abbreviation:
            # skip blank lines and lines with nothing before the dash
            continue

        # Keep the abbreviation (including its period) as one token.
        custom_token_exceptions.append((abbreviation, [{'ORTH': abbreviation}]))

    return custom_token_exceptions

In [None]:
# Parse the abbreviation listing into (text, token-attributes) pairs.
exceptions = get_custom_token_exceptions(custom_tokenizer_exception_s)

In [None]:
# Register every abbreviation as a tokenizer special case on the pipeline.
# Each `substrings` value is the list of token-attribute dicts for `s`.
for s, substrings in exceptions:
    nlp.tokenizer.add_special_case(s, substrings)

In [None]:
# Load the survey answers. The cells below read `df['organisaatio1']` and
# `df['answer']`, so this load must run for the notebook to work from a
# fresh kernel (Restart & Run All).
df = pd.read_csv('data/processed/ensisijainen.csv', index_col=0)

In [None]:
# Build a sentence table from a plain-text file: one paragraph per line,
# empty lines skipped, each paragraph split into sentences by spaCy.
# The file is read inside a context manager so the handle is closed.
with open('lorem.txt', 'r') as text_file:
    paragraphs = [para for para in text_file.read().split('\n') if para]

dd = pd.DataFrame({'sentence': [sent.text.strip() for para in paragraphs for sent in nlp(para).sents]})

In [None]:
# List the organisations present in the data to help pick a filter value.
display(df['organisaatio1'].dropna().unique())

# Restrict the analysis to a single organisation
# (alternative value: 'Sosiaali- ja terveystoimiala').
org1 = 'Kaupunginkanslia'
df = df.loc[df['organisaatio1'] == org1].copy()

In [None]:
# Split every answer into a list of sentences:
#   1. replace the literal placeholder "[redacted]" with "REDACTED";
#      `regex=False` matches the brackets literally on every pandas version
#      and avoids invalid escape sequences in a non-raw string,
#   2. split on newlines and drop blank lines,
#   3. run spaCy sentence segmentation on each remaining line.
df['sentences'] = (
    df['answer']
    .str.replace('[redacted]', 'REDACTED', regex=False)
    .apply(lambda text: [line for line in text.split('\n') if line.strip()])
    .apply(lambda lines: [sent.text.strip() for line in lines for sent in nlp(line).sents])
)

In [None]:
# Preview the filtered answers together with the new `sentences` column.
df.head()

In [None]:
# Flatten to one row per sentence; `doc_idx` keeps the index label of the
# answer (row of `df`) that the sentence came from.
dd = pd.DataFrame(
    [
        {'sentence': sentence, 'doc_idx': row.Index}
        for row in df.itertuples()
        for sentence in row.sentences
    ]
)

In [None]:
# Preview the sentence-level table.
dd.head()

In [None]:
# Sentence-embedding model used for both clustering and the similarity query below.
model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')
# Alternative model that was tried:
# model = SentenceTransformer('LaBSE')

In [None]:
# Embed every sentence. A plain list of strings is passed instead of the
# Series so that any positional indexing done inside `encode` cannot be
# confused with pandas label-based lookup (e.g. after filtering `dd`).
embeddings = model.encode(dd['sentence'].tolist())

In [None]:
# Sanity check on the embedding array dimensions.
embeddings.shape

In [None]:
# Scale every embedding to unit length, so that the dot product taken in the
# query cell further down acts as a cosine similarity.
normalized_emb = normalize(embeddings)

In [None]:
n_clusters = 20

# KMeans initialisation is random; a fixed seed keeps the cluster assignment
# (and everything derived from it below) reproducible across re-runs.
# clusterer = hdbscan.HDBSCAN()
clusterer = KMeans(n_clusters=n_clusters, random_state=42)
labels = clusterer.fit_predict(normalized_emb)
dd['cluster'] = labels

In [None]:
# Cluster size distribution: one histogram bin per distinct cluster label.
dd['cluster'].hist(bins=dd['cluster'].nunique())

In [None]:
# Display settings so that long sentence tables are shown in full.
pd.set_option('display.max_rows', 400)
pd.set_option('display.width', 1000)
# `None` means "do not truncate cell contents". The former `-1` spelling is
# deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Add a lemmatized version of every sentence; it feeds the per-cluster TF-IDF below.
dd['lemmatized_sentence'] = dd['sentence'].apply(preprocess_func(lemmatize=True))

In [None]:
# One pseudo-document per cluster: all of its lemmatized sentences joined by spaces.
cluster_document_df = (
    dd.groupby('cluster')
    .agg({'lemmatized_sentence': lambda sentences: ' '.join(sentences)})
)

# TF-IDF over the cluster documents (one matrix row per cluster document).
cluster_vec = TfidfVectorizer()
cluster_tfidf_weights = cluster_vec.fit_transform(cluster_document_df['lemmatized_sentence'])

# Vocabulary terms ordered by their column index in the TF-IDF matrix.
tfidf_words = sorted(cluster_vec.vocabulary_, key=cluster_vec.vocabulary_.get)

In [None]:
# Annotate every cluster with its ten highest-weighted non-stop-word terms and
# its sentence count.
# Rows of `cluster_tfidf_weights` follow the row order of `cluster_document_df`,
# so the matrix is addressed by row position and the frame by cluster label.
# Using the label as a matrix row index is only correct when labels happen to
# be exactly 0..k-1 (it breaks e.g. for a noise label of -1).
for row_pos, cluster_i in enumerate(cluster_document_df.index):
    # `ravel` always yields a 1-d array, even for a single-term vocabulary.
    tfidf_row_weights = cluster_tfidf_weights[row_pos].toarray().ravel()
    top_words_and_scores = sorted(zip(tfidf_words, tfidf_row_weights), key=lambda t: t[1], reverse=True)
    cluster_document_df.loc[cluster_i, 'top_words'] = ' '.join([w for w, _ in top_words_and_scores if w not in STOP][:10])
#     cluster_document_df.loc[cluster_i, 'avg length'] = sent_df.loc[sent_df['cluster'] == cluster_i, 'sentence'].apply(lambda s: len(s.split())).mean()
    cluster_document_df.loc[cluster_i, 'n'] = (dd['cluster'] == cluster_i).sum()

In [None]:
# Show the per-cluster summary without the long concatenated text column.
cluster_document_df.drop(columns=['lemmatized_sentence'])

In [None]:
# Interactive browser: choose a cluster label and list the sentences assigned to it.
@interact(cluster=dd['cluster'].sort_values().unique())
def _f(cluster):
    """Display the sentences of `dd` whose cluster label equals `cluster`."""
    display(dd.loc[dd['cluster'] == cluster, ['sentence']])

In [None]:
# Free-text similarity search against the embedded sentences.
query = 'apotti-osaaminen ja kirjaaminen'

# Embed the query and scale it to unit length, as a flat vector.
v = normalize(model.encode([query])).ravel()

# Dot product of unit vectors: one similarity score per sentence.
scores = normalized_emb.dot(v)

# Positions of the 20 best-scoring sentences, best first.
highest_score_idx = np.argsort(scores)[::-1][:20]

In [None]:
# Similarity scores of the best matches, in descending order.
scores[highest_score_idx]

In [None]:
# The matching sentences, looked up by position in the same order as the scores.
dd['sentence'].iloc[highest_score_idx]

In [None]:
# Write the sentence table (all columns of `dd`, including the cluster labels) to CSV.
dd.to_csv('lorem.csv')