In [None]:
# Standard library first, then the third-party numerical stack.
import gzip

import numpy as np
import pandas as pd

In [None]:
#SOMToolbox Parser
#from SOMToolBox_Parse import SOMToolBox_Parse
#idata = SOMToolBox_Parse("datasets/iris/iris.vec").read_weight_file()
#weights = SOMToolBox_Parse("datasets/iris/iris.wgt.gz").read_weight_file()

In [None]:
#HitHistogram
def HitHist(_m, _n, _weights, _idata):
    """Build a hit histogram: how many input vectors each unit wins.

    Args:
        _m, _n: Grid dimensions; the histogram has ``_m * _n`` cells.
        _weights: 2-D array with one weight vector per row; the row index of
            the closest weight vector is used as the histogram cell index.
        _idata: Iterable of input vectors that broadcast against ``_weights``.

    Returns:
        Float array of shape ``(_m, _n)`` holding the hit count per unit.
    """
    def _best_matching_unit(sample):
        # Euclidean distance from the sample to every unit; ties resolve to
        # the lowest unit index (np.argmin semantics).
        squared = np.power(_weights - sample, 2)
        return np.argmin(np.sqrt(np.sum(squared, axis=1)))

    hit_counts = np.zeros(_m * _n)
    for unit in map(_best_matching_unit, _idata):
        hit_counts[unit] += 1

    return hit_counts.reshape(_m, _n)

#U-Matrix - implementation
def UMatrix(_m, _n, _weights, _dim):
    """Compute the U-Matrix of a SOM on an interpolated grid.

    The result has one cell per unit (even row, even column) plus one cell
    between every pair of horizontally, vertically and diagonally adjacent
    units, holding the Euclidean distance between their weight vectors.

    Args:
        _m, _n: Number of rows and columns of the unit grid.
        _weights: Array reshapeable to ``(_m, _n, _dim)``.
        _dim: Length of each weight vector.

    Returns:
        Array of shape ``(2 * _m - 1, 2 * _n - 1)``. Cells between units hold
        neighbour distances; unit cells hold the median of their (up to four)
        directly adjacent distance cells. A 1x1 map yields ``[[0.0]]``.
    """
    grid = np.asarray(_weights)
    # Distances are fractional: integer weights would otherwise truncate them
    # when they are stored back into an integer array.
    if not np.issubdtype(grid.dtype, np.floating):
        grid = grid.astype(np.float64)
    grid = grid.reshape(_m, _n, _dim)

    U = np.zeros((2 * _m - 1, 2 * _n - 1), dtype=grid.dtype)

    # Even row, odd column: distance between left and right unit.
    U[0::2, 1::2] = np.linalg.norm(grid[:, :-1] - grid[:, 1:], axis=-1)
    # Odd row, even column: distance between the units above and below.
    U[1::2, 0::2] = np.linalg.norm(grid[:-1, :] - grid[1:, :], axis=-1)
    # Odd row, odd column: mean of both diagonal distances, scaled by sqrt(2)
    # because diagonal neighbours are sqrt(2) grid steps apart.
    diag_main = np.linalg.norm(grid[:-1, :-1] - grid[1:, 1:], axis=-1)
    diag_anti = np.linalg.norm(grid[1:, :-1] - grid[:-1, 1:], axis=-1)
    U[1::2, 1::2] = (diag_main + diag_anti) / (2 * np.sqrt(2))

    # Unit cells: median of the directly adjacent distance cells.
    rows, cols = U.shape
    for i in range(0, rows, 2):
        for j in range(0, cols, 2):
            region = []
            if j > 0:
                region.append(U[i, j - 1])
            if i > 0:
                region.append(U[i - 1, j])
            if j < cols - 1:
                region.append(U[i, j + 1])
            if i < rows - 1:
                region.append(U[i + 1, j])

            # A single-unit map has no neighbours; avoid the NaN (and warning)
            # that the median of an empty list would produce.
            U[i, j] = np.median(region) if region else 0.0

    return U

#SDH - implementation
def SDH(_m, _n, _weights, _idata, factor, approach):
    """Smoothed Data Histogram: each input vector votes for its nearest units.

    Args:
        _m, _n: Grid dimensions; the histogram has ``_m * _n`` cells.
        _weights: 2-D array with one weight vector per row.
        _idata: Iterable of input vectors that broadcast against ``_weights``.
        factor: Number of closest units each vector votes for. Values larger
            than the number of units are clamped to that number.
        approach: Voting scheme.
            0 - rank based: the j-th closest unit receives
                ``(factor - j) / (1 + 2 + ... + factor)``.
            1 - inverse distance: each selected unit receives ``1 / distance``.
            2 - min-max: each selected unit receives
                ``1 - (d - dmin) / (dmax - dmin)`` over the selected distances.
            Any other value adds nothing (the histogram stays zero).

    Returns:
        Float array of shape ``(_m, _n)`` with the accumulated votes.
    """
    sdh_m = np.zeros(_m * _n)

    # A vector cannot vote for more units than exist (the unclamped version
    # indexed past the end of the candidate list), nor for a negative number.
    factor = max(0, min(factor, len(_weights)))
    if factor == 0:
        return sdh_m.reshape(_m, _n)

    # Rank votes for approach 0, normalised so they sum to one per vector.
    rank_total = factor * (factor + 1) // 2
    rank_votes = (factor - np.arange(factor)) / rank_total

    for vector in _idata:
        dist = np.sqrt(np.sum(np.power(_weights - vector, 2), axis=1))
        # Stable sort keeps the lowest unit index first among equal distances.
        nearest = np.argsort(dist, kind='stable')[:factor]

        if approach == 0:  # normalized rank
            sdh_m[nearest] += rank_votes
        elif approach == 1:  # based on distance
            # NOTE(review): a vector that coincides exactly with a unit still
            # produces an infinite vote here; decide whether to clamp it.
            sdh_m[nearest] += 1.0 / dist[nearest]
        elif approach == 2:  # min-max scaled distance
            selected = dist[nearest]
            dmin = selected.min()
            spread = selected.max() - dmin
            if spread > 0:
                sdh_m[nearest] += 1.0 - (selected - dmin) / spread
            else:
                # All selected units are equally close (always the case for
                # factor == 1): each counts as the closest instead of 0/0.
                sdh_m[nearest] += 1.0

    return sdh_m.reshape(_m, _n)

In [None]:
#import panel as pn
#import holoviews as hv
#from holoviews import opts
#hv.extension('bokeh')

#hithist = hv.Image(HitHist(weights['ydim'], weights['ydim'], weights['arr'], idata['arr'])).opts(xaxis=None, yaxis=None) 
#um = hv.Image(UMatrix(weights['ydim'], weights['ydim'], weights['arr'], 4)).opts(xaxis=None, yaxis=None) 
#sdh = hv.Image(SDH(weights['ydim'], weights['ydim'], weights['arr'], idata['arr'], 25, 0)).opts(xaxis=None, yaxis=None)   

#hv.Layout([hithist.relabel('HitHist').opts(cmap='kr'), 
#           um.relabel('U-Matrix').opts(cmap='jet'), sdh.relabel('SDH').opts(cmap='viridis')])

In [None]:
import gzip
import pandas as pd


class SOMToolBox_Parse:
    """Parser for SOMToolbox-style text files (plain or gzip-compressed).

    The expected layout is a header of lines starting with ``$`` (of which
    ``$XDIM``, ``$YDIM`` and ``$VEC_DIM`` are interpreted), followed by one
    space-separated row of numbers per line. Only the first ``$VEC_DIM``
    fields of each row are read; anything after them is ignored.
    """

    def __init__(self, filename):
        """Store the path of the file to parse; nothing is read yet."""
        self.filename = filename


    def read_weight_file(self):
        """Read the file and return its rows plus the header dimensions.

        Files whose name ends in ``.gz`` are opened with gzip.

        Returns:
            Tuple ``(df, vec_dim, xdim, ydim)`` where ``df`` is a float64
            DataFrame with ``xdim * ydim`` rows and ``vec_dim`` columns.

        Raises:
            ValueError: If a data row appears before the dimensions are known
                or a row does not fit the declared dimensions.
        """
        opener = gzip.open if str(self.filename).endswith('.gz') else open
        # The context manager closes the file; no explicit close() is needed.
        with opener(self.filename, 'rb') as file:
            df, vec_dim, xdim, ydim = self._read_vector_file_to_df(pd.DataFrame(), file)

        return df.astype('float64'), vec_dim, xdim, ydim


    def _read_vector_file_to_df(self, df, file):
        """Consume an iterable of byte lines and fill ``df`` row by row."""
        xdim, ydim, vec_dim, position = 0, 0, 0, 0
        for byte in file:
            line = byte.decode('UTF-8')
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            if line.startswith('$'):
                xdim, ydim, vec_dim = self._parse_vector_file_metadata(line, xdim, ydim, vec_dim)
                # Re-created on every header line until it has columns, so the
                # frame ends up correctly sized whatever the header order is.
                if xdim > 0 and ydim > 0 and len(df.columns) == 0:
                    df = pd.DataFrame(index=range(0, ydim * xdim), columns=range(0, vec_dim))
            else:
                if len(df.columns) == 0 or vec_dim == 0:
                    raise ValueError('Weight file has no correct Dimensional information.')
                position = self._parse_weight_file_data(line, position, vec_dim, df)
        return df, vec_dim, xdim, ydim


    def _parse_weight_file_data(self, line, position, vec_dim, df):
        """Store the first ``vec_dim`` numbers of ``line`` in row ``position``.

        Returns the position of the next row to fill.
        """
        fields = line.split(' ')[:vec_dim]
        try:
            values = [float(field) for field in fields]
        except ValueError as err:
            raise ValueError('The input-vector file does not match its unit-dimension.') from err
        # Reject short rows and rows beyond the declared xdim * ydim.
        if len(values) != vec_dim or position >= len(df.index):
            raise ValueError('The input-vector file does not match its unit-dimension.')
        # Positional assignment through the DataFrame API; writing into
        # ``df.values`` relies on it being a writable view, which pandas does
        # not guarantee.
        df.iloc[position] = values
        return position + 1


    def _parse_vector_file_metadata(self, line, xdim, ydim, vec_dim):
        """Update the dimension matching the header ``line``; pass others through."""
        splitted = line.split(' ')
        if splitted[0] == '$XDIM':
            xdim = int(splitted[1])
        elif splitted[0] == '$YDIM':
            ydim = int(splitted[1])
        elif splitted[0] == '$VEC_DIM':
            vec_dim = int(splitted[1])
        return xdim, ydim, vec_dim

## Evaluation

To evaluate the different implementations we used two datasets and two SOM sizes: 40x20 (small) and 100x60 (large).

The first dataset is the so-called chainlink dataset, which contains two two-dimensional rings that are intertwined in three-dimensional space.

The second dataset is the so-called 10-clusters dataset. Its clusters were generated from 10-dimensional Gaussian distributions with different densities.

The comparison of the visualisations can be found in the `visualisation_report.md` file.

Chainlink dataset          |  Clusters dataset |
:-------------------------:|:-------------------------:|
![](pics/chainlink-info.PNG) | ![](pics/10clusters-info.PNG)



In [None]:
import minisom as som
 

from somtoolbox import SOMToolbox
from sklearn.preprocessing import MinMaxScaler

# Grid dimensions of the two SOM sizes used in the evaluation.
small_m = 40
small_n = 20
large_m = 100
large_n = 60

# Chainlink 40x20

A MiniSom SOM is trained on the chainlink dataset with sigma=7, learning_rate=0.7 and iterations=1000000. We produce topographic error visualisations for 4-unit and 8-unit neighbourhoods, and an intrinsic distance visualisation.

In [None]:
# Load the chainlink input vectors (no training happens in this cell).
chainlink = SOMToolBox_Parse('datasets/chainlink.vec')
# read_weight_file returns (data, vector dimension, xdim, ydim) from the file header.
idata, idim, idata_x, idata_y = chainlink.read_weight_file()
# Rescale the data with a default-configured MinMaxScaler; `idata` is rebound to the scaled array.
idata = MinMaxScaler().fit_transform(idata)


In [None]:
# Train the small_m x small_n SOM on the scaled chainlink data.
# NOTE(review): no seed is passed here, so presumably every run yields a different map — confirm whether that is intended.

chainlink_small = som.MiniSom(small_m, small_n, idim, sigma=7, learning_rate=0.7)
chainlink_small.train_random(idata, 1000000, verbose=True)

In [None]:
from somtoolbox import SOMToolbox
# Flatten the trained weights to rows of length `idim` and pass them, with the scaled input data, to the toolbox.
sm = SOMToolbox(weights=chainlink_small._weights.reshape(-1,idim),m= small_m, n= small_n,dimension=idim, input_data=idata)
# Bare last expression: presumably displays the toolbox's main view as the cell output.
sm._mainview

# Chainlink 100x60

A MiniSom SOM is trained on the chainlink dataset with sigma=6, learning_rate=0.7 and iterations=500000. We produce topographic error visualisations for 4-unit and 8-unit neighbourhoods, and an intrinsic distance visualisation.

In [None]:
# Train the large_m x large_n SOM on the scaled chainlink data (`idata`/`idim` come from the chainlink loading cell).

chainlink_large = som.MiniSom(large_m, large_n, idim, sigma=6, learning_rate=0.7)
chainlink_large.train_random(idata, 500000, verbose=True)


In [None]:

# Visualise the large chainlink SOM; `sm` is rebound, replacing the previous toolbox instance.
sm = SOMToolbox(weights=chainlink_large._weights.reshape(-1,idim), m=large_m, n=large_n, dimension=idim, input_data=idata)


# Bare last expression: presumably displays the toolbox's main view as the cell output.
sm._mainview

# Clusters 40x20

A MiniSom SOM is trained on the clusters dataset with sigma=7, learning_rate=0.7 and iterations=1000000. We produce two topographic error visualisations, for 4-unit and 8-unit neighbourhoods, and an intrinsic distance visualisation.

In [None]:
# Load the clusters input vectors (no training happens in this cell).
# From here on `idata` and `idim` refer to the clusters dataset, replacing the chainlink values.
clusters = SOMToolBox_Parse('datasets/clusters.vec')
idata, idim, idata_x, idata_y = clusters.read_weight_file()
# Rescale the data with a default-configured MinMaxScaler; `idata` is rebound to the scaled array.
idata = MinMaxScaler().fit_transform(idata)
# Size of the last axis of the scaled data; used as input length for the 100x60 clusters SOM.
clusters_dim = idata.shape[-1]
from somtoolbox import SOMToolbox
# Disabled earlier experiment (sigma=0.8, 10000 iterations), kept for reference:
#clusters_small = som.MiniSom(small_m, small_n, clusters_dim, sigma=0.8, learning_rate=0.7)
#clusters_small.train_random(idata, 10000, verbose=True)

In [None]:
# NOTE(review): despite its name, `clusters_large` is built with small_m x small_n here (the 40x20 clusters SOM);
# the same name is re-assigned to the large_m x large_n SOM in a later cell.
clusters_large = som.MiniSom(small_m, small_n, idim, sigma=7, learning_rate=0.7)
clusters_large.train_random(idata, 1000000, verbose=True)

# Flatten the trained weights to rows of length `idim` and pass them, with the scaled input data, to the toolbox.
sm = SOMToolbox(weights=clusters_large._weights.reshape(-1,idim), m=small_m, n=small_n, dimension=idim, input_data=idata)


# Bare last expression: presumably displays the toolbox's main view as the cell output.
sm._mainview

In [None]:
# Second viewer for the 40x20 clusters SOM.
# `som` is the minisom module alias and has no `_weights`; the weights must come from the trained
# MiniSom instance. At this point in the notebook `clusters_large` holds the small_m x small_n map
# trained in the preceding cell.
sm1 = SOMToolbox(weights=clusters_large._weights.reshape(-1,idim), m=small_m, n=small_n, dimension=idim, input_data=idata)

# Bare last expression: presumably displays the toolbox's main view as the cell output.
sm1._mainview

# Clusters 100x60


A MiniSom SOM is trained on the clusters dataset with sigma=7, learning_rate=0.7 and iterations=500000. We produce two topographic error visualisations, for 4-unit and 8-unit neighbourhoods, and an intrinsic distance visualisation.

In [None]:
# Train the large_m x large_n SOM on the scaled clusters data.
from somtoolbox import SOMToolbox


# Rebinds `clusters_large`; `clusters_dim` was derived from the scaled clusters data when it was loaded.
clusters_large = som.MiniSom(large_m, large_n, clusters_dim, sigma=7, learning_rate=0.7)
clusters_large.train_random(idata, 500000, verbose=True)



In [None]:
# Visualise the 100x60 clusters SOM.
# `som` is the minisom module alias and has no `_weights`; the weights must come from the trained
# MiniSom instance, which at this point is the large_m x large_n `clusters_large`.
sm = SOMToolbox(weights=clusters_large._weights.reshape(-1,idim), m=large_m, n=large_n, dimension=idim, input_data=idata)


# Bare last expression: presumably displays the toolbox's main view as the cell output.
sm._mainview

In [None]:
# Visualise the large_m x large_n clusters SOM from the flattened weights of `clusters_large`.
sm = SOMToolbox(weights=clusters_large._weights.reshape(-1,idim), m=large_m, n=large_n, dimension=idim, input_data=idata)


# Bare last expression: presumably displays the toolbox's main view as the cell output.
sm._mainview

# Comparison with java SOMToolbox visualisation

We compare visualisations created in the Java SOMToolbox with visualisations created by our Python implementation from the same pre-trained SOMs, using both the clusters dataset and the chainlink dataset.

## Clusters dataset Java Pre-Trained SOM

In [None]:
# NOTE(review): this import rebinds the name SOMToolBox_Parse, shadowing the class defined earlier in this notebook.
# The next cell indexes the results by key ('arr', 'xdim', 'ydim', 'vec_dim', 'classes_names'), so the imported
# parser presumably returns a dict rather than the tuple returned by the notebook class — confirm.
from SOMToolBox_Parse import SOMToolBox_Parse
# Input vectors, the pre-trained 100x60 weights and the class file, all read through the same method.
idata = SOMToolBox_Parse("datasets/clusters.vec").read_weight_file()
weights = SOMToolBox_Parse("datasets/cluster_100x60.wgt.gz").read_weight_file()
classes = SOMToolBox_Parse("datasets/10clusters.cls").read_weight_file()

In [None]:
from somtoolbox import SOMToolbox

# Build the viewer from the pre-trained weights: m is taken from 'ydim' and n from 'xdim' of the weight file.
# NOTE(review): `component_names` receives the class names from the class file — confirm this is the intended argument.
sm = SOMToolbox(weights=weights['arr'],m=weights['ydim'],n=weights['xdim'],
                dimension=weights['vec_dim'], input_data=idata['arr'],
               classes=classes['arr'], component_names=classes['classes_names'])
# Bare last expression: presumably displays the toolbox's main view as the cell output.
sm._mainview

## Chainlink dataset Java Pre-Trained SOM

In [None]:
# NOTE(review): this import rebinds the name SOMToolBox_Parse, shadowing the class defined earlier in this notebook;
# the next cell indexes the results by key, so the imported parser presumably returns a dict — confirm.
from SOMToolBox_Parse import SOMToolBox_Parse
# Input vectors and the pre-trained 40x20 weights for the chainlink dataset.
idata = SOMToolBox_Parse("datasets/chainlink.vec").read_weight_file()
weights = SOMToolBox_Parse("datasets/chainlink_40x20_1.wgt.gz").read_weight_file()


In [None]:
from somtoolbox import SOMToolbox

# Build the viewer from the pre-trained weights: m is taken from 'ydim' and n from 'xdim' of the weight file.
# No class information is passed for the chainlink dataset.
sm = SOMToolbox(weights=weights['arr'],m=weights['ydim'],n=weights['xdim'],
                dimension=weights['vec_dim'], input_data=idata['arr'],
               )
# Bare last expression: presumably displays the toolbox's main view as the cell output.
sm._mainview