<div align="center">
    <h1>In The Name Of GOD</h1>
</div>

# RCV1 Dataset Visualization with Self Organizing Map

## Import Libraries

In [31]:
import numpy as np
from numpy.ma.core import ceil 
from numpy import linalg as LA
from numpy import argmin, unravel_index, sqrt, ogrid, newaxis
from sklearn.metrics import DistanceMetric #distance calculation
from sklearn.utils import resample #resampling
from sklearn.preprocessing import MinMaxScaler, StandardScaler #normalization
from sklearn.pipeline import Pipeline #pipeline
from sklearn.model_selection import train_test_split #split data
from sklearn.metrics import accuracy_score #scoring
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from matplotlib import animation, colors
from tqdm import tqdm
import pprint

## Hyperparameters

In [3]:
# --- data ---
NUM_SAMPLES = 10000 #how many rows are drawn from the corpus

# --- lattice ---
NUM_NEURONS = 5 * np.sqrt(NUM_SAMPLES) #target neuron count (a float: 5 * sqrt(NUM_SAMPLES))
GRID_SIZE = (ceil(np.sqrt(NUM_NEURONS)).astype(np.int32),) * 2 #square lattice: same side length on both axes

# --- training schedule ---
NUM_EPOCHS = 1000 #number of passes over the training rows
BETA0 = 0.5 #learning rate at epoch 0
MU = 0 #mean of the Gaussian used as neighbourhood function
SIGMA0 = 10 #standard deviation of that Gaussian at epoch 0

## Helper Functions

In [32]:
def normal(x, mu, sigma): #Gaussian density
    """Evaluate the Gaussian probability density at ``x``.

    Computes ``1 / sqrt(2*pi*sigma**2) * exp(-(x - mu)**2 / (2*sigma**2))``.

    Parameters
    ----------
    x : scalar or ndarray
        Point(s) at which the density is evaluated.
    mu : scalar
        Mean of the distribution.
    sigma : scalar
        Standard deviation of the distribution.

    Returns
    -------
    Same shape as ``x`` (after broadcasting against ``mu`` and ``sigma``).
    """
    variance = sigma**2
    scale = 1 / np.sqrt(2 * np.pi * variance) #normalising constant
    exponent = -0.5 / variance * (x - mu)**2
    return scale * np.exp(exponent)

def get_beta(epoch): #learning-rate schedule
    """Return the learning rate for ``epoch``: ``BETA0 * exp(-epoch / NUM_EPOCHS)``."""
    decay = np.exp(-epoch / NUM_EPOCHS) #equals 1 at epoch 0 and shrinks as epoch grows
    return BETA0 * decay

def get_sigma(epoch): #neighbourhood-width schedule
    """Return the Gaussian standard deviation for ``epoch``: ``SIGMA0 * exp(-epoch / NUM_EPOCHS)``."""
    decay = np.exp(-epoch / NUM_EPOCHS) #equals 1 at epoch 0 and shrinks as epoch grows
    return SIGMA0 * decay

def expand(x, shape): #replicate a 2-D array along a new trailing axis
    """Stack ``shape`` copies of the 2-D array ``x`` along a new last axis.

    Parameters
    ----------
    x : ndarray, 2-D
        Array to replicate (indexed as ``x[:, :, newaxis]``).
    shape : int
        Number of copies, i.e. the length of the new trailing axis.
        (Despite the name this is a single integer, not a shape tuple.)

    Returns
    -------
    ndarray of shape ``(x.shape[0], x.shape[1], shape)``.
    """
    with_depth_axis = x[:, :, newaxis] #(rows, cols) -> (rows, cols, 1)
    repetitions = (1, 1, shape) #only the new axis is repeated
    return np.tile(with_depth_axis, repetitions)

def update_neurons(grid, best_match_idx, w, epoch): #update the neurons
    """Move every neuron toward the input ``w``, weighted by its lattice distance to the winner.

    The update is ``grid += beta(epoch) * h * (w - grid)`` where ``h`` is the
    Gaussian neighbourhood weight of each neuron relative to the winning one.

    Defects fixed relative to the previous version:
    * the update is written into ``grid`` in place (the old code rebound a
      local name and returned nothing, so the caller's grid never changed);
    * neurons move toward the input (``w - grid``) instead of away from it;
    * the neighbourhood weight is evaluated once and broadcast over the
      feature axis, instead of being passed through ``normal`` a second time
      and tiled to a shape incompatible with ``grid``.

    Parameters
    ----------
    grid : ndarray, shape (rows, cols, n_features)
        Neuron weights. Must have a floating dtype, because it is updated in place.
    best_match_idx : tuple of two ints
        Lattice coordinates (row, col) of the winning neuron.
    w : array-like, shape (n_features,)
        Input sample the neurons are pulled toward.
    epoch : int
        Current epoch; drives the learning-rate and neighbourhood-width schedules.

    Returns
    -------
    ndarray
        The same ``grid`` object, after the in-place update (returned for convenience).
    """
    x0, y0 = best_match_idx #lattice coordinates of the best match
    rows, cols = ogrid[0:grid.shape[0], 0:grid.shape[1]] #open mesh of lattice coordinates, taken from the grid itself
    distance_to_best_idx = sqrt((rows - x0) ** 2 + (cols - y0) ** 2) #Euclidean lattice distance of every neuron to the best match
    # NOTE(review): `normal` is the normalised density, so the winner's weight is
    # 1 / (sigma * sqrt(2*pi)) rather than 1 -- confirm this scaling is intended.
    neighbourhood = normal(distance_to_best_idx, MU, get_sigma(epoch)) #shape (rows, cols)
    step = get_beta(epoch) * neighbourhood[:, :, newaxis] #shape (rows, cols, 1): broadcasts over features without copying
    grid += step * (w - grid) #in-place so the caller's array is the one that changes
    return grid

def find_winning_neuron(grid, x): #locate the best-matching neuron
    """Return the lattice index of the neuron whose weights are closest to ``x``.

    Parameters
    ----------
    grid : ndarray, shape (..., n_features)
        Neuron weights; the last axis holds the feature vector of each neuron.
    x : array-like, shape (n_features,)
        Input sample.

    Returns
    -------
    tuple of ints
        Index over the leading axes of ``grid`` of the neuron with the smallest
        Euclidean distance to ``x``. On ties the first neuron in flattened
        order is returned.
    """
    residuals = grid - x #broadcast the sample against every neuron
    per_neuron_distance = LA.norm(residuals, axis=-1) #collapse the feature axis
    flat_winner = argmin(per_neuron_distance) #index into the flattened lattice
    lattice_shape = grid.shape[0:-1]
    return unravel_index(flat_winner, lattice_shape)

def get_pipeline(scaler=None): #create a pipeline for the data
    """Build a single-step preprocessing pipeline around ``scaler``.

    Parameters
    ----------
    scaler : transformer or None, optional
        Scaling step to wrap. When omitted, a new ``StandardScaler`` is created
        for this call. (Previously the default instance was created once at
        function-definition time, so every pipeline built with the default
        shared -- and re-fitted -- the same scaler object.)

    Returns
    -------
    Pipeline
        A pipeline with one step named ``'scaler'``.
    """
    if scaler is None:
        scaler = StandardScaler() #fresh instance per call, never shared between pipelines
    return Pipeline([
        ('scaler', scaler)
    ])

## Import Dataset

In [5]:
from sklearn.datasets import fetch_rcv1 #loader for the RCV1 dataset

rcv1 = fetch_rcv1() #returns an object exposing .data and .target

X = rcv1.data #feature matrix
Y = rcv1.target #label matrix

## Data Preprocessing

In [6]:
#draw NUM_SAMPLES rows (with their labels) from the full dataset, with a fixed seed
X_resampled, Y_resampled = resample(
    X,
    Y,
    n_samples=NUM_SAMPLES,
    random_state=42,
)

#hold out 20% of the drawn rows, again with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X_resampled,
    Y_resampled,
    test_size=0.2,
    random_state=42,
)

#fit the scaling pipeline on the training rows only, then apply the fitted pipeline to the held-out rows;
#.toarray() converts each split to a dense array before it is scaled
pipeline = get_pipeline()
X_train_pipelined = pipeline.fit_transform(X_train.toarray())
X_test_pipelined = pipeline.transform(X_test.toarray())

## Self Organizing Map (SOM) Initialization

In [7]:
#seed numpy's global RNG (same seed as the resampling/split above) so the initial weights,
#and any np.random call executed after this cell, are reproducible on Restart & Run All
np.random.seed(42)

#one weight vector per lattice cell, drawn uniformly from [0, 1); last axis = number of input features
grid = np.random.rand(*GRID_SIZE, X_train_pipelined.shape[1])

pprint.pprint(f'Grid rectangle is of width and height of {GRID_SIZE}')
pprint.pprint(f'Neuron grid is of shape {grid.shape}')

'Grid rectangle is of width and height of (23, 23)'
'Neuron grid is of shape (23, 23, 47236)'


## Training Self Organizing Map (SOM)

In [8]:
n_train_rows = X_train_pipelined.shape[0] #number of training rows, constant across epochs
for epoch in tqdm(range(NUM_EPOCHS), desc='Epochs', leave=True): #one full pass over the training rows per epoch
    shuffle_idx = np.random.permutation(n_train_rows) #fresh random visiting order for this epoch
    for row_idx in shuffle_idx:
        x = X_train_pipelined[row_idx] #current training row
        best_match_idx = find_winning_neuron(grid, x) #lattice index of the neuron closest to this row
        update_neurons(grid, best_match_idx, x, epoch) #hand the row and its winner to the update step
        
        

In [9]:
#toy check of the winner lookup: a 5x3 lattice of 2-feature neurons and one 2-feature input
tt_grid = np.arange(30).reshape(5, 3, 2)
tt_w = [3, 4]

#distance of every toy neuron to the input, computed once and reused below
tt_norms = np.linalg.norm(tt_grid - tt_w, axis=-1)

pprint.pprint(tt_grid)
pprint.pprint(f'the norm of differences is = \n{tt_norms}')

#lattice coordinates of the smallest distance (displayed as the cell's output)
np.unravel_index(np.argmin(tt_norms), shape=tt_grid.shape[0:-1])


array([[[ 0,  1],
        [ 2,  3],
        [ 4,  5]],

       [[ 6,  7],
        [ 8,  9],
        [10, 11]],

       [[12, 13],
        [14, 15],
        [16, 17]],

       [[18, 19],
        [20, 21],
        [22, 23]],

       [[24, 25],
        [26, 27],
        [28, 29]]])
('the norm of differences is = \n'
 '[[ 4.24264069  1.41421356  1.41421356]\n'
 ' [ 4.24264069  7.07106781  9.89949494]\n'
 ' [12.72792206 15.55634919 18.38477631]\n'
 ' [21.21320344 24.04163056 26.87005769]\n'
 ' [29.69848481 32.52691193 35.35533906]]')


(0, 1)

In [33]:
#toy check of how np.tile repeats an array once a trailing axis has been added
xx = np.arange(6).reshape(2, 3)

pprint.pprint(xx)
pprint.pprint('############################')

xx = xx[:, :, newaxis] #append a length-1 trailing axis: (2, 3) -> (2, 3, 1)

#repeat 2x along axis 0, 3x along axis 1 and 2x along the new axis: (2, 3, 1) -> (4, 9, 2)
print(np.tile(xx, (2, 3, 2)))



array([[0, 1, 2],
       [3, 4, 5]])
'############################'
[[[0 0]
  [1 1]
  [2 2]
  [0 0]
  [1 1]
  [2 2]
  [0 0]
  [1 1]
  [2 2]]

 [[3 3]
  [4 4]
  [5 5]
  [3 3]
  [4 4]
  [5 5]
  [3 3]
  [4 4]
  [5 5]]

 [[0 0]
  [1 1]
  [2 2]
  [0 0]
  [1 1]
  [2 2]
  [0 0]
  [1 1]
  [2 2]]

 [[3 3]
  [4 4]
  [5 5]
  [3 3]
  [4 4]
  [5 5]
  [3 3]
  [4 4]
  [5 5]]]
