In [None]:
import re
import numpy as np
import pandas as pd
import ipywidgets as widgets
from ipywidgets import *
import traitlets
from functools import partial
from nltk.corpus import stopwords
from collections import Counter, defaultdict

In [None]:
# Project-specific stop words (lemmas and abbreviations) that should never be
# picked as a cluster name. Duplicated entries are harmless: everything ends
# up in a set below.
_stop = [
    'jne.', 'em.', 'esim.', 'tms.', 'mm.', 'yms.', 'redacted', 'pitää', 'http',
    'voida', 'haluta', 'syventää', 'esimerkki', 'taito', 'kiinnostaa', 'mennä',
    'meno', 'estää', 'kehittää', 'kehittäminen', 'erityisesti', 'onneksi',
    'tämä', 'näkyä', 'käyttö', 'osata', 'kehittää', 'työ', 'taito', 'kehittyä',
    'oppia', 'liittyvä', 'osaaminen', 'käyttö', 'lisätä', 'haluta',
]

# Abbreviations are listed with a trailing period; also add the period-less
# form. The comprehension is fully evaluated before the list is extended, so
# the list is not mutated while it is being iterated.
_stop += [w[:-1] for w in _stop if w.endswith('.')]

# Read the external stop-word file inside a context manager so the handle is
# closed deterministically.
# NOTE(review): the file is opened with the platform default encoding, as
# before — consider passing encoding='utf-8' if the file is UTF-8.
with open('data/external/stopwords.txt') as _stopwords_file:
    STOP = set(stopwords.words('finnish')
               + _stopwords_file.read().splitlines()
               + _stop
              )

In [None]:
# Load the clustered sentences (first CSV column is the index) and order the
# rows alphabetically by sentence.
df = (
    pd.read_csv('report/sote_result-asiakkaan-kanssa.csv', index_col=0)
    .sort_values('sentence')
)
df.head()

In [None]:
def get_cluster_names(_df):
    """Suggest a name for every cluster: its most frequent non-stop word.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain the columns ``cluster`` and ``lemmatized_sentence``.
        Missing ``lemmatized_sentence`` values are ignored.

    Returns
    -------
    collections.defaultdict
        Maps each cluster id present in ``_df`` to its most common word that
        is not in ``STOP``. A cluster without any usable word maps to ``''``
        (previously this raised ``IndexError``). Looking up a cluster id that
        is not in ``_df`` also yields ``''``.
    """
    # One whitespace-joined "document" per cluster.
    cluster_docs = _df.groupby('cluster')['lemmatized_sentence']\
        .agg(lambda s: ' '.join(s.dropna()))

    names = defaultdict(str)
    for cluster, doc in cluster_docs.items():
        top = Counter(w for w in doc.split() if w not in STOP).most_common(1)
        # `top` is empty when every word was a stop word or the cluster had
        # no text at all; still register the key so plain-dict copies of the
        # result contain every cluster.
        names[cluster] = top[0][0] if top else ''

    return names

In [None]:
def _set_button_style(button, *, selected):
    if selected:
        icon = 'check-square-o'
    else:
        icon = 'square-o'
        
    button.icon = icon

def _set_button_description(button, name, n=-1):
        button.description = f'{name} ({n})'

def _toggle_rename(c):
    if c.rename in c.children:
        c.children = [c.goto, c.assign, c.rename_name, c.rename_ok]
    else:
        c.children = [c.goto, c.assign, c.rename, c.remove]


def create_assign_to_cluster_component(cluster, cbs):
    """Build the row of widgets that represents one cluster.

    Parameters
    ----------
    cluster
        Cluster id; stored on the returned component as ``component.cluster``
        and passed to the assign / select callbacks.
    cbs : mapping
        Callbacks looked up at click time under the keys
        ``'on_assign_to_cluster_click'`` and ``'set_selected_cluster'``
        (called with the cluster id) and ``'on_rename_click'``,
        ``'on_rename_ok_click'`` and ``'on_remove_click'`` (called with the
        component).

    Returns
    -------
    HBox
        Initially shows the goto, assign, rename and remove buttons. Every
        child widget (including the hidden rename text box and OK button) is
        also reachable as an attribute, together with the helpers
        ``set_name``, ``set_style`` and ``toggle_rename``.
    """
    parts = {
        'remove': Button(button_style='danger', icon='trash-alt', layout=Layout(width='30px')),
        'assign': Button(layout=Layout(width='200px')),
        'goto': Button(icon='check-square-o', layout=Layout(width='30px')),
        'rename': Button(description='Nimeä uudelleen', layout=Layout(visibility='visible', width='200px')),
        'rename_name': Text(placeholder='Uusi nimi', layout=Layout(width='180px')),
        'rename_ok': Button(description='OK', layout=Layout(width='50px')),
    }

    component = HBox([parts['goto'], parts['assign'], parts['rename'], parts['remove']])

    # Expose the cluster id and every child widget as attributes of the row.
    component.cluster = cluster
    for role in ('goto', 'rename', 'assign', 'rename_name', 'rename_ok', 'remove'):
        setattr(component, role, parts[role])

    # Helpers pre-bound to the widget they act on.
    component.set_name = partial(_set_button_description, parts['assign'])
    component.set_style = partial(_set_button_style, parts['goto'])
    component.toggle_rename = partial(_toggle_rename, component)

    def _forward(key, payload):
        # The callback is resolved from `cbs` when the click happens.
        return lambda _: cbs[key](payload)

    parts['assign'].on_click(_forward('on_assign_to_cluster_click', cluster))
    parts['goto'].on_click(_forward('set_selected_cluster', cluster))
    parts['rename'].on_click(_forward('on_rename_click', component))
    parts['rename_ok'].on_click(_forward('on_rename_ok_click', component))
    parts['remove'].on_click(_forward('on_remove_click', component))

    return component

In [None]:
# Load the full answers; the first CSV column becomes the index that
# `doc_idx` values are looked up against later in the notebook.
dff = pd.read_csv('data/processed/ensisijainen.csv', index_col=0)
# Replace the literal placeholder "[replace]". The previous pattern
# '\[replace\]' relied on regex matching (no longer the default in
# pandas >= 2.0) and on an invalid string escape; a literal, non-regex
# replacement behaves the same on every pandas version.
dff['answer'] = dff['answer'].str.replace('[replace]', 'REPLACE', regex=False)
dff

In [None]:
class AppLogic(traitlets.HasTraits):
    """State and event handling behind the cluster-review GUI.

    Holds the sentence frame, the list of cluster ids and the current
    selection as traits, and reacts to changes by rebuilding or refreshing
    the widgets owned by ``gui``.

    Notes
    -----
    * ``df`` is modified in place when sentences are reassigned to another
      cluster.
    * ``selected_index_change`` reads the answer text from the module-level
      frame ``dff``, which must exist before a row is selected.
    """

    # list of cluster ids
    clusters = traitlets.List()

    # current selection
    selected_cluster = traitlets.Integer()

    # list of select options shown in the multiselect box
    select_options = traitlets.List()

    # indexes of selected multiselect options
    selected_index = traitlets.List()

    # text corresponding to the selected index if only one row is selected
    selected_text = traitlets.Unicode()

    def __init__(self, gui, df):
        """Store ``gui`` and ``df`` and build the initial cluster list.

        Parameters
        ----------
        gui
            Object providing ``_create_cluster_button``,
            ``_create_cluster_text``, ``_cluster_buttons`` and
            ``set_assign_to_cluster_components``.
        df : pandas.DataFrame
            Frame with at least the columns ``cluster``, ``sentence``,
            ``lemmatized_sentence`` and ``doc_idx``.
        """
        super().__init__()
        self.gui = gui
        self.df = df
        # user-given names; these take precedence over the generated ones
        self.cluster_names = {}
        # assigning the trait fires `on_clusters_change`, which builds the buttons
        self.clusters = sorted(self.df['cluster'].unique().tolist())

        self.gui._create_cluster_button.on_click(self.on_create_new_cluster_click)

    ####################### OBSERVES #######################

    @traitlets.observe('clusters')
    def on_clusters_change(self, change):
        """Rebuild one component row per cluster whenever the cluster list changes."""
        clusters = change['new']

        callbacks = {
            'on_assign_to_cluster_click': self.on_assign_to_cluster_click,
            'on_rename_click': self.on_rename_click,
            'on_rename_ok_click': self.on_rename_ok_click,
            'on_remove_click': self.on_remove_click,
            'set_selected_cluster': lambda cluster: setattr(self, 'selected_cluster', cluster)
        }

        components = [create_assign_to_cluster_component(cluster, callbacks) for cluster in clusters]

        self.gui.set_assign_to_cluster_components(components)
        self._update_components()

    @traitlets.observe('selected_cluster')
    def select_cluster_change(self, change):
        """List the sentences of the selected cluster and refresh the buttons.

        Also called directly to force a refresh; ``change`` is not inspected,
        the current value of ``self.selected_cluster`` is used.
        """
        is_selected_cluster = self.df['cluster'] == self.selected_cluster

        sel_df = self.df.loc[is_selected_cluster, 'sentence']

        # add ({i}) as select options must be unique
        self.select_options = [f'({i}) {s}' for i, s in enumerate(sel_df)]

        self._update_components()

    @traitlets.observe('selected_index')
    def selected_index_change(self, change):
        """Show the full answer for the selected sentence, with the sentence in bold.

        Does nothing unless exactly one row is selected.
        """
        selected_index = change['new']

        if len(selected_index) != 1:
            return

        is_selected_cluster = self.df['cluster'] == self.selected_cluster

        # positions in the multiselect correspond to row positions inside the cluster
        selected_df = self.df.loc[is_selected_cluster].iloc[selected_index]

        assert len(selected_df) == 1

        sel_doc_idx, sel_sentence = selected_df.iloc[0][['doc_idx', 'sentence']]

        # NOTE: relies on the module-level `dff` frame (index = doc_idx)
        answer_text = dff.loc[sel_doc_idx, 'answer']

        self.selected_text = answer_text.replace(sel_sentence, f'<b>{sel_sentence}</b>')


    ####################### HANDLERS #######################

    def on_create_new_cluster_click(self, button):
        """Create a new, empty cluster named after the text box content."""
        name = self.gui._create_cluster_text.value
        self.gui._create_cluster_text.value = ''

        # `default` keeps this working when every cluster has been removed
        cluster = max(self.clusters, default=-1) + 1

        self.cluster_names[cluster] = name
        # assign a new list so the `clusters` observer fires
        self.clusters = self.clusters + [cluster]


    def on_assign_to_cluster_click(self, new_cluster):
        """Move the currently selected sentences to ``new_cluster`` (edits ``self.df`` in place)."""
        is_selected_cluster = self.df['cluster'] == self.selected_cluster

        # map positions to df index
        selected_df_index = self.df[is_selected_cluster].iloc[self.selected_index].index

        self.df.loc[selected_df_index, 'cluster'] = new_cluster

        # send event to refresh
        self.select_cluster_change({'new': self.selected_cluster})


    def on_rename_click(self, component):
        """Switch the component into rename mode."""
        component.toggle_rename()

    def on_rename_ok_click(self, component):
        """Apply the typed name (if any) and leave rename mode."""
        new_name = component.rename_name.value

        if new_name:
            self.cluster_names[component.cluster] = new_name
            self._update_components()

        component.toggle_rename()

    def on_remove_click(self, component):
        """Remove the component's cluster, but only if no sentence belongs to it."""
        # remove cluster only if it is empty
        if len(self.df[self.df['cluster'] == component.cluster]) == 0:
            self.clusters = [c for c in self.clusters if c != component.cluster]


    ####################### UTILS #######################

    def _update_components(self):
        """Refresh button icons, names and counts of every cluster component."""
        # Count on the frame owned by this instance (previously the global
        # `df` was read here, which is only correct when it is the same object).
        counts = defaultdict(int)
        counts.update(self.df.groupby('cluster').count()['sentence'].to_dict())

        names = get_cluster_names(self.df)

        # the first child is the "create cluster" row, not a cluster component
        for component in self.gui._cluster_buttons.children[1:]:
            cluster = component.cluster

            name = self.cluster_names[cluster] if cluster in self.cluster_names else names[cluster]
            count = counts[cluster]
            component.set_name(name, n=count)

            selected = cluster == self.selected_cluster
            component.set_style(selected=selected)

In [None]:
class AppGUI(AppLayout):
    """Widget layout of the review tool.

    The left column holds the answer preview above the sentence multiselect;
    the right column holds the "create cluster" row followed by the
    per-cluster component rows.
    """

    def __init__(self):
        super().__init__()

        # Left column: HTML preview and the sentence list.
        self._selected_text = HTML(layout=Layout(width='600px', height='50px'))
        self._multiselect = SelectMultiple(rows=50, layout=Layout(width='600px'))

        # Row for creating a new cluster: name box plus "+" button.
        self._create_cluster_text = Text(placeholder='Uuden klusterin nimi', layout=Layout(width='170px'))
        self._create_cluster_button = Button(description='+', layout=Layout(width='30px'))
        self._create_cluster_hbox = HBox(
            [self._create_cluster_text, self._create_cluster_button],
            layout=Layout(height='50px'),
        )

        # Right column; populated through set_assign_to_cluster_components.
        self._cluster_buttons = VBox()

        review_column = VBox([self._selected_text, self._multiselect])
        self.center = HBox([review_column, self._cluster_buttons])

    def set_assign_to_cluster_components(self, components):
        """Show the "create cluster" row followed by ``components`` (a list)."""
        header_row = [self._create_cluster_hbox]
        self._cluster_buttons.children = header_row + components

In [None]:
class App(AppGUI):
    """The GUI wired to an ``AppLogic`` model operating on ``df``."""

    def __init__(self, df):
        super().__init__()
        self.model = AppLogic(self, df=df)

        # (model trait, widget, widget trait), linked two-way in this order.
        bindings = (
            ('select_options', self._multiselect, 'options'),
            ('selected_index', self._multiselect, 'index'),
            ('selected_text', self._selected_text, 'value'),
        )
        for model_trait, widget, widget_trait in bindings:
            traitlets.link((self.model, model_trait), (widget, widget_trait))

        # Trigger the initial fill of the sentence list.
        self.model.select_cluster_change({'new': 0})

In [None]:
# Number of rows in the clustered-sentence frame
len(df)

In [None]:
# Build the review tool on `df`; reassigning sentences in the GUI edits this
# frame in place.
app = App(df)

# Last expression of the cell, so the notebook renders the widget
app

In [None]:
# Final cluster-id -> name mapping: generated names first, then the names the
# user typed in the GUI, which override them.
d = {**get_cluster_names(df), **app.model.cluster_names}

In [None]:
# Copy of the (possibly re-labelled) frame with the cluster name attached;
# `assign` returns a new frame, so the model's data is left untouched.
result_df = app.model.df.assign(
    cluster_name=lambda frame: frame['cluster'].apply(lambda cl: d[cl])
)
result_df[['sentence', 'cluster', 'doc_idx', 'cluster_name']]
# result_df.to_csv('kaupunginkanslia_luokittelu.csv')