In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.preprocessing import MinMaxScaler

In [2]:
# NOTE: despite the variable name, this holds scikit-learn's bundled digits
# dataset returned by load_digits(), not the original MNIST.
mnist = load_digits()
mnist.data.shape, mnist.target.shape

((1797, 64), (1797,))

In [3]:
def minmax_scaler(data):
  """Return `data` rescaled by a freshly fitted scikit-learn ``MinMaxScaler``.

  The scaler is fitted on `data` itself and discarded afterwards, so the
  fitted transformation cannot be reapplied to other data by the caller.
  """
  return MinMaxScaler().fit_transform(data)

In [4]:
def data_splitter(dataX, dataY, test_ratio=0.2, seed=None):
  """Scale the features, then shuffle and split them into train/test subsets.

  Parameters
  ----------
  dataX : features; passed through ``minmax_scaler`` and then indexed with
    integer index arrays.
  dataY : targets; must support indexing with an integer index array
    (presumably a NumPy array aligned with ``dataX`` -- confirm with caller).
  test_ratio : float, default 0.2
    Fraction of the samples placed in the test subset (rounded down).
  seed : int or None, default None
    When given, the shuffle is drawn from a dedicated ``RandomState(seed)``
    so the split is reproducible. When None, NumPy's global random state is
    used, exactly as before.

  Returns
  -------
  (data_trainX, data_trainY, data_testX, data_testY)

  Raises
  ------
  ValueError
    If ``test_ratio`` is outside ``[0, 1]``.

  Notes
  -----
  The scaling is fitted on the complete dataset before the split, so the
  min/max of the test rows influence how the training rows are scaled.
  """
  if not 0.0 <= test_ratio <= 1.0:
    raise ValueError(f'test_ratio must be within [0, 1], got {test_ratio!r}')
  # np.random itself exposes permutation(), so both branches share one call.
  rng = np.random if seed is None else np.random.RandomState(seed)
  shuffled_indices = rng.permutation(len(dataX))
  dataX = minmax_scaler(dataX)
  test_set_size = int(len(dataX) * test_ratio)
  test_indices = shuffled_indices[:test_set_size]
  train_indices = shuffled_indices[test_set_size:]
  return (dataX[train_indices], dataY[train_indices],
          dataX[test_indices], dataY[test_indices])

In [11]:
def initializer(data, radius=None, learning_rate=None):
  """Create a randomly initialised square map and its training hyper-parameters.

  The map side length is ``int(sqrt(5 * sqrt(n_samples)))`` and every weight
  is drawn uniformly from ``[0, 1)``.

  Parameters
  ----------
  data : array whose ``shape[0]`` is the number of samples and ``shape[1]``
    the number of features.
  radius : float or None, default None
    Initial neighbourhood radius. When None the value is read interactively
    with ``input()``, which is the original behaviour.
  learning_rate : float or None, default None
    Initial learning rate. When None the value is read interactively.

  Returns
  -------
  (som, learning_rate, radius, max_step) where ``som`` has shape
  ``(r, r, n_features)`` and ``max_step`` is ``500 * r * r``.
  """
  r = int(np.sqrt(5 * np.sqrt(data.shape[0])))
  c = r
  som = np.random.uniform(0, 1, size=(r, c, data.shape[1]))
  # Prompt only for values the caller did not supply, so the notebook can
  # also run unattended (Restart & Run All).
  if radius is None:
    radius = input('=======Radius initial value=======\n-------> ')
  radius = float(radius)
  if learning_rate is None:
    learning_rate = input('=======Learning rate initial value=======\n-------> ')
  learning_rate = float(learning_rate)
  max_step = 500 * r * c
  return som, learning_rate, radius, max_step

In [6]:
def distance(x, y, kind):
  """Return the distance between the arrays `x` and `y`.

  Parameters
  ----------
  x, y : NumPy arrays that can be subtracted from each other.
  kind : str
    ``'euc'`` for the Euclidean distance, ``'man'`` for the Manhattan
    (sum of absolute differences) distance.

  Raises
  ------
  ValueError
    If `kind` is neither ``'euc'`` nor ``'man'``. Previously this case
    silently returned None.
  """
  if kind == 'euc':
    return np.sqrt(np.sum(np.power(x - y, 2)))
  if kind == 'man':
    return np.sum(np.abs(x - y))
  raise ValueError(f"unknown distance kind {kind!r}; expected 'euc' or 'man'")

In [9]:
# Alternative formulation (disabled, kept for reference): exponential decay of
# the learning rate and radius, plus a Gaussian-shaped weighting function.
# def lr_nr_update(step, max_step, radius, learning_rate):
#   lr = learning_rate * np.exp(-step / max_step)
#   nr = radius * np.exp(-step / (max_step / np.log(radius)))
#   return lr, nr

# def theta(sample, winner_index, som, nr):
#   return np.exp((-(distance(som[winner_index[0]][winner_index[1]],sample,'man')**2) / (2 * (nr ** 2))))
# winner_index = []

def lr_nr_update(step, max_steps,learning_rate,radius):
  """Linearly decay the learning rate and neighbourhood radius.

  Both values are scaled by ``1 - step / max_steps``; the radius is then
  rounded up to a whole number with ``np.ceil``.

  Returns
  -------
  (lr, nr) : the decayed learning rate and the decayed, rounded-up radius.
  """
  remaining = 1.0 - np.float64(step) / max_steps
  return remaining * learning_rate, np.ceil(remaining * radius)

def winner_unit(sample, som):
  """Return ``[row, col]`` of the map unit whose weights are closest to `sample`.

  Closeness is the Euclidean distance between `sample` and each unit's weight
  vector. When several units are equally close, the first one in row-major
  order wins.

  Parameters
  ----------
  sample : 1-D array of length ``n_features``.
  som : 3-D array of shape ``(rows, cols, n_features)``.

  Returns
  -------
  list of two Python ints, ``[row, col]``.

  Notes
  -----
  The search has no upper bound on the distance. The earlier loop started
  from ``sqrt(len(sample))`` and raised ``UnboundLocalError`` whenever no
  unit was closer than that (e.g. for inputs not scaled to ``[0, 1]``).
  """
  sample = np.asarray(sample)
  # Distance from every unit to the sample in one vectorised pass.
  dists = np.sqrt(np.sum(np.power(som - sample, 2), axis=-1))
  # argmin returns the first minimum in flattened (row-major) order.
  row, col = np.unravel_index(np.argmin(dists), dists.shape)
  return [int(row), int(col)]


def update_units(sample, winner_index, som, lr, nr):
  """Pull every unit near the winner towards `sample`, in place.

  A unit is updated when its Manhattan distance on the grid to
  `winner_index` is at most `nr`; its weights move by the fraction `lr` of
  their difference to `sample`.

  Parameters
  ----------
  sample : 1-D array of length ``n_features``.
  winner_index : ``[row, col]`` of the best-matching unit.
  som : 3-D float array of shape ``(rows, cols, n_features)``; modified in place.
  lr : learning rate applied to the selected units.
  nr : neighbourhood radius, measured in grid steps.

  Returns
  -------
  The same `som` array that was passed in.
  """
  rows, cols = np.indices(som.shape[:2])
  # Manhattan distance of every grid cell to the winner, computed at once
  # instead of one distance() call per unit.
  grid_dist = np.abs(rows - winner_index[0]) + np.abs(cols - winner_index[1])
  in_range = grid_dist <= nr
  som[in_range] += lr * (sample - som[in_range])
  return som

In [10]:
def som_fit(max_step,step,radius,learning_rate,data,som):
  """Run a single online training step and return the updated map.

  The training sample is picked cyclically from `data`: step 1 uses row 0,
  step 2 uses row 1, and so on, wrapping around after the last row. The
  learning rate and neighbourhood radius for this step come from
  ``lr_nr_update``.
  """
  sample = data[(step - 1) % data.shape[0]]
  bmu_indx = winner_unit(sample, som)
  step_lr, step_radius = lr_nr_update(step, max_step, learning_rate, radius)
  return update_units(sample, bmu_indx, som, step_lr, step_radius)

def som_label(dataX,dataY, som, default_label=2):
  """Assign a class label to every unit of the map by majority vote.

  Each sample in `dataX` is mapped to its best-matching unit via
  ``winner_unit``. A unit's label is the most frequent target among the
  samples mapped to it; on a tie, the label that was seen first wins.

  Parameters
  ----------
  dataX : array whose rows are samples (``dataX[i]`` is one sample).
  dataY : targets aligned with `dataX` (``dataY[i]`` belongs to ``dataX[i]``).
  som : 3-D array of shape ``(rows, cols, n_features)``.
  default_label : label given to units that no sample maps to. Defaults to
    2, the value that used to be hard-coded.

  Returns
  -------
  2-D float array of shape ``(rows, cols)`` holding one label per unit.
  """
  n_rows, n_cols = som.shape[0], som.shape[1]

  # Collect, per unit, the targets of all samples whose winner it is.
  votes = [[[] for _ in range(n_cols)] for _ in range(n_rows)]
  for i in range(dataX.shape[0]):
    row, col = winner_unit(dataX[i], som)
    votes[row][col].append(dataY[i])

  labels = np.zeros(shape=(n_rows, n_cols))
  for row in range(n_rows):
    for col in range(n_cols):
      unit_votes = votes[row][col]
      if unit_votes:
        labels[row][col] = max(unit_votes, key=unit_votes.count)
      else:
        labels[row][col] = default_label
  return labels

def som_predict(som, som_label, data_testX, data_testY):
  """Print the classification accuracy of the labelled map on a dataset.

  Each row of `data_testX` is mapped to its best-matching unit; the
  prediction is that unit's entry in `som_label`. The fraction of rows whose
  prediction equals the matching entry of `data_testY` is printed. Nothing
  is returned.
  """
  print('====Prediction=============')
  n_samples = data_testX.shape[0]

  def is_hit(i):
    # True when the winning unit's label matches the i-th target.
    row, col = winner_unit(data_testX[i], som)
    return som_label[row][col] == data_testY[i]

  hits = sum(1 for i in range(n_samples) if is_hit(i))
  print(f'accuracy --> {hits / n_samples}')
  return None



## SOM 

In [12]:
def SOM(dataX, dataY, pred):
  """Train a self-organising map on the data and print its accuracy.

  Pipeline: split and scale the data, initialise the map, run ``max_step``
  online training steps (printing progress every 1000 steps), label the
  units from the training subset, then print the accuracy.

  Parameters
  ----------
  dataX, dataY : features and targets, forwarded to ``data_splitter``.
  pred : str
    ``'test'`` to evaluate on the held-out subset, ``'train'`` to evaluate on
    the training subset.

  Raises
  ------
  ValueError
    If `pred` is neither ``'test'`` nor ``'train'``. The check runs before
    training so an invalid value does not waste a full training run that
    would then silently report nothing.
  """
  if pred not in ('test', 'train'):
    raise ValueError(f"pred must be 'test' or 'train', got {pred!r}")
  # split_data
  data_trainX, data_trainY, data_testX, data_testY = data_splitter(dataX, dataY)
  # initialization
  som, learning_rate, radius, max_step = initializer(data_trainX)
  # fit
  for step in range(1, max_step + 1):
    if step % 1000 == 0: print(f'step {step}')
    som = som_fit(max_step, step, radius, learning_rate, data_trainX, som)
  # predict
  Som_label = som_label(data_trainX, data_trainY, som)
  if pred == 'test':
    som_predict(som, Som_label, data_testX, data_testY)
  else:
    som_predict(som, Som_label, data_trainX, data_trainY)
  return None

In [13]:
# Train on the digits data and print the accuracy on the held-out test split;
# initializer() prompts for the initial radius and learning rate.
SOM(mnist.data, mnist.target, 'test')

-------> 25
-------> 0.5
step 1000
step 2000
step 3000
step 4000
step 5000
step 6000
step 7000
step 8000
step 9000
step 10000
step 11000
step 12000
step 13000
step 14000
step 15000
step 16000
step 17000
step 18000
step 19000
step 20000
step 21000
step 22000
step 23000
step 24000
step 25000
step 26000
step 27000
step 28000
step 29000
step 30000
step 31000
step 32000
step 33000
step 34000
step 35000
step 36000
step 37000
step 38000
step 39000
step 40000
step 41000
step 42000
step 43000
step 44000
step 45000
step 46000
step 47000
step 48000
step 49000
step 50000
step 51000
step 52000
step 53000
step 54000
step 55000
step 56000
step 57000
step 58000
step 59000
step 60000
step 61000
step 62000
step 63000
step 64000
step 65000
step 66000
step 67000
step 68000
step 69000
step 70000
step 71000
step 72000
step 73000
step 74000
step 75000
step 76000
step 77000
step 78000
step 79000
step 80000
step 81000
step 82000
step 83000
step 84000
accuracy --> 0.9192200557103064
