# SOMPY Experimentation

## Description

This notebook contains code for generating hierarchical data files (`*.h5`) that can be loaded in [`SOM_Visualization.ipynb`](SOM_Visualization.ipynb).

## Workflow

1. Choose the columns you wish to train on in [the SOM column selection cell](#another_cell), then run that cell and the cells that follow it in Jupyter so that `som_columns` and the SOM built from it are updated.
1. Navigate to [the som training cell](#training_cell).
1. Press the "Train" button to begin training the SOM. Training time depends on the size of your dataset and on how many columns you are training on.
1. When the process is complete, optionally give the SOM a uniquely identifying name by entering it in the text box, and then hit the "Save" button to write a hierarchical data file to be loaded later.

In [1]:
import logging
import ipywidgets as widgets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import sompy
from sompy.decorators import timeit
from sompy.sompy import SOMFactory
logging.getLogger().setLevel(logging.WARNING)
import itertools
import functools
import datetime as dt
#from IPython.display import display

In [2]:
# Group specific package
from tfprop_sompy.tfprop_vis import render_posmap_to_axes, kmeans_clust, ViewTFP

In [3]:
from IPython.display import display

In [4]:
logging.getLogger().setLevel(logging.WARNING)

In [5]:
# File location and name of the input CSV
fin = 'DummyData.csv'

# Map size handed to SOMFactory.build as `mapsize`.
# The product of the values in mapsize needs to be a square and larger than the dataset
# Warning: the larger the dataset the longer it will take to run
mapsize = (30, 30)
# Forwarded to sm.train(n_job=...) when the "Train" button is pressed
n_job = 1

# Reads the CSV into a data frame
data_df = pd.read_csv(fin)

In [6]:
# Makes a data frame whose single column holds the index values of data_df
name_df = pd.DataFrame(data_df.index)
# NOTE(review): km_cluster is not used by any cell shown in this notebook;
# presumably the number of k-means clusters for kmeans_clust -- confirm or remove
km_cluster = 4

<a id='another_cell'></a>

In [7]:
# Names of the columns you want to train on. They select the training data
# from data_df, are passed to the SOM as component names, and become the
# column labels of the saved codebook matrix.
# Columns with values that are not numerical should be excluded
som_columns = [
    "Set_1",
    "Set_2",
    "Set_3",
    "Set_4",
]


In [8]:
# All the data values for the indicated som_columns, as a NumPy array
descr = data_df[som_columns].values

# Shows the size of the data as (rows, columns)
# With DummyData.csv and the four columns above this should be (560, 4)
descr.shape

(560, 4)

In [9]:
# Builds a SOM model from the selected training data.
# The map size comes from the configuration cell; 'var' normalization and
# 'pca' initialization are passed through to sompy as-is, and the selected
# column names are supplied as the component names.
sm = SOMFactory.build(descr, 
                      mapsize=mapsize, 
                      normalization='var', 
                      initialization='pca', 
                      component_names=som_columns
                     )

In [10]:
# Not done commenting this cell
# monkeypatch _batchtrain so we can display what's going on
def _batchtrain_monkeypatch(self, trainlen, radiusin, radiusfin, njob=1,
                shared_memory=False):
    """Batch-train the SOM and report every epoch to the notebook widgets.

    Installed below as ``sompy.sompy.SOM._batchtrain``. After each epoch it
    calls ``update_sm_info(epoch, elapsed_seconds, quantization_error)``,
    which is defined in the training cell; that cell must have been run
    before training starts.

    Parameters
    ----------
    trainlen : int
        Number of epochs to run.
    radiusin, radiusfin : float
        Neighborhood radius for the first and the last epoch; the radii in
        between are linearly spaced.
    njob : int
        Forwarded to ``self.find_bmu`` as ``njb``.
    shared_memory : bool
        When true, the training data is written to a temporary ``.npy`` file
        and read back as a read-only memory map; the temporary directory is
        removed again when training ends.

    Side effects
    ------------
    Updates ``self.codebook.matrix`` every epoch and stores the final
    best-matching-unit array in ``self._bmu``.
    """
    # Imported locally: the notebook's import cell does not provide these.
    import os
    import shutil
    import tempfile
    from time import time

    radius = np.linspace(radiusin, radiusfin, trainlen)

    data_folder = None
    if shared_memory:
        # Memory-map a copy of the data using only numpy, which the notebook
        # already imports.
        data_folder = tempfile.mkdtemp()
        data_name = os.path.join(data_folder, 'data.npy')
        np.save(data_name, self._data)
        data = np.load(data_name, mmap_mode='r')
    else:
        data = self._data

    try:
        bmu = None

        # X2 is part of euclidean distance (x-y)^2 = x^2 +y^2 - 2xy that we use
        # for each data row in bmu finding.
        # Since it is a fixed value we can skip it during bmu finding for each
        # data point, but later we need it to calculate the quantization error
        fixed_euclidean_x2 = np.einsum('ij,ij->i', data, data)

        logging.info(" radius_ini: %f , radius_final: %f, trainlen: %d\n",
                     radiusin, radiusfin, trainlen)

        for i in range(trainlen):
            t1 = time()
            neighborhood = self.neighborhood.calculate(
                self._distance_matrix, radius[i], self.codebook.nnodes)
            bmu = self.find_bmu(data, njb=njob)
            self.codebook.matrix = self.update_codebook_voronoi(data, bmu,
                                                                neighborhood)
            # (epoch number, elapsed seconds, mean quantization error)
            qerror = (i + 1, round(time() - t1, 3),
                      np.mean(np.sqrt(bmu[1] + fixed_euclidean_x2)))
            logging.info(
                " epoch: %d ---> elapsed time:  %f, quantization error: %f\n",
                *qerror)

            # this is the reason why we override the function
            update_sm_info(*qerror)

            # Only logged: the loop is not interrupted when a NaN shows up.
            if np.any(np.isnan(qerror)):
                logging.info("nan quantization error, exit train\n")

        if bmu is None:
            # No epoch ran (trainlen == 0): still compute the BMUs once so
            # that self._bmu can be filled in instead of failing on None.
            bmu = self.find_bmu(data, njb=njob)

        bmu[1] = np.sqrt(bmu[1] + fixed_euclidean_x2)
        self._bmu = bmu
    finally:
        if data_folder is not None:
            # Drop the memory map before deleting its backing file.
            del data
            shutil.rmtree(data_folder, ignore_errors=True)

sompy.sompy.SOM._batchtrain = _batchtrain_monkeypatch


<a id='training_cell'></a>

In [11]:
%matplotlib notebook

# Creates the Training and save box
# "Train" button; do_training is registered as its click handler further down
b = widgets.Button(description="Train")
# Bordered output area used as the context for training and save messages
out = widgets.Output(layout={'border': '1px solid black'})
# Output area displayed inside graph_display at the top of the layout;
# nothing in this cell writes to it
hm_output = widgets.Output()

# Saves the trained som data for use in SOM_Visualization
def save_som_data(sm: sompy.sompy.SOM, name: str):
    """Write the trained SOM and its source data to the HDF5 file ``name``.

    Stored entries: the codebook matrix (columns labelled with
    ``som_columns``), ``data_df`` without its "Row" column, the map size,
    the trained column names and the values of the "Row" column.

    An existing file with the same name is overwritten. A confirmation line
    is printed into the ``out`` widget once the file has been written.
    """
    # mode="w" replaces any existing HDF5 file of that name, so be aware
    with pd.HDFStore(name, mode="w") as store:
        codebook_frame = pd.DataFrame(sm.codebook.matrix, columns=som_columns)
        store['sm_codebook_matrix'] = codebook_frame
        store['sm_data'] = data_df.drop("Row", axis='columns')
        store['sm_codebook_mapsize'] = pd.Series(mapsize)

        # Plain string arrays are written through the underlying file handle:
        # (group name, array name, values, title)
        h5file = store._handle
        string_arrays = (
            ('sm_codebook_columns', "property_names",
             list(som_columns), "Material property names"),
            ('sm_codebook_matfamilies', "material_families",
             list(data_df["Row"]), "Material families"),
        )
        for group_name, array_name, values, title in string_arrays:
            group = h5file.create_group(h5file.root, group_name)
            h5file.create_array(group, array_name, values, title)

    with out:
        print(f"Saved to {name}")

# Trains the data
def do_training(*args):
    """Click handler of the "Train" button: train ``sm`` and print its errors.

    Clears the ``out`` widget, runs the training with everything it prints
    captured in ``out``, then reports the topographic error and the mean of
    the stored BMU distances as the quantization error. The arguments passed
    by the button are ignored.
    """
    out.clear_output()
    with out:
        # NOTE(review): both training lengths are passed as 0; presumably
        # this lets sompy pick the number of epochs itself -- confirm
        sm.train(
            n_job=n_job,
            verbose='debug',
            train_rough_len=0,
            train_finetune_len=0,
        )

        topo_err = sm.calculate_topographic_error()
        quant_err = np.mean(sm._bmu[1])
        print(f"Topographic error = {topo_err:.5f}; Quantization error = {quant_err:.5f};")
b.on_click(do_training)

# Produces text for the widget box
epoch_text_widget = widgets.Label(value="Epoch: 0")
# The training hook reports (epoch, elapsed seconds, quantization error) for
# every epoch, so this label shows the elapsed time; the variable keeps its
# historical name so existing references continue to work.
topo_err_text_widget = widgets.Label(value="Elapsed time (s): 0")
quantization_err_text_widget = widgets.Label(value="Quantization error: 0")
warning_txt = widgets.Label(value="Clicking save will overwrite the old hd5 file, so be aware")
infobox = widgets.VBox([warning_txt, epoch_text_widget, topo_err_text_widget, quantization_err_text_widget])

# Gives file name and saves it
# Default name: number of trained columns plus today's date (yy-mm-dd)
today = dt.date.today()
outname = widgets.Text(description="Output file", value=f"som_codemat_{len(som_columns)}props_{today.strftime('%y-%m-%d')}.h5")
savebtn = widgets.Button(description="Save")
savebox = widgets.VBox([outname, savebtn], layout={'border': '1px solid black'})

# The lambda looks up sm and the text box value at click time
savebtn.on_click(lambda *args: save_som_data(sm, outname.value))

# Displays the widgets below
graph_display = widgets.Output()
with graph_display:
    display(hm_output)

# Updates as data gets trained
def update_sm_info(epoch, topographic_err, quantization_err):
    """Refresh the status labels after a training epoch.

    Called positionally by the patched ``_batchtrain`` with
    ``(epoch, elapsed_seconds, quantization_error)``.

    Parameters
    ----------
    epoch : int
        Number of the epoch that just finished.
    topographic_err : float
        Despite the name (kept for backward compatibility), the value
        received from the training loop is the epoch's elapsed time in
        seconds, and it is displayed as such.
    quantization_err : float
        Mean quantization error of the epoch.
    """
    epoch_text_widget.value = "Epoch: {}".format(epoch)
    topo_err_text_widget.value = "Elapsed time (s): {}".format(topographic_err)
    quantization_err_text_widget.value = "Quantization error: {}".format(quantization_err)

# Last expression of the cell: renders the complete layout
widgets.VBox([graph_display, widgets.Box([widgets.VBox([savebox, b, infobox]), out])])

# When training with dummy data is done, epoch: 65
# Switch to SOM_Visualization for visualization of trained data

VBox(children=(Output(), Box(children=(VBox(children=(VBox(children=(Text(value='som_codemat_4props_20-02-11.h…

NumExpr defaulting to 4 threads.
