<div align="center">
    <h1>In The Name Of GOD</h1>
</div>

# RCV1 Dataset Visualization with Self Organizing Map

## Import Libraries

In [21]:
import numpy as np
from numpy.ma.core import ceil 
from sklearn.metrics import DistanceMetric #distance calculation
from sklearn.utils import resample #resampling
from sklearn.preprocessing import MinMaxScaler, StandardScaler #normalization
from sklearn.pipeline import Pipeline #pipeline
from sklearn.model_selection import train_test_split #split data
from sklearn.metrics import accuracy_score #scoring
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from matplotlib import animation, colors
from tqdm import tqdm
import pprint

## Hyperparameters

In [2]:
NUM_SAMPLES = 10000 # number of samples to use
NUM_NEURONS = 5 * np.sqrt(NUM_SAMPLES) # number of neurons in the SOM rectangular grid (5 * sqrt(N) rule of thumb)
# Side length of the square grid: smallest integer whose square is >= NUM_NEURONS.
# Computed once with np.ceil (public NumPy API) and converted to a plain Python
# int so GRID_SIZE is an ordinary tuple of ints usable as an array shape.
_GRID_SIDE = int(np.ceil(np.sqrt(NUM_NEURONS)))
GRID_SIZE = (_GRID_SIDE, _GRID_SIDE) # (rows, cols) of the neuron grid
NUM_EPOCHS = 1000 # number of epochs to train the SOM
BETA0 = 0.5 # initial learning rate
MU = 0 # initial mu for normal distribution
SIGMA0 = 5 # initial sigma for normal distribution

## Helper Functions

In [3]:
def normal(x, mu, sigma):
    """Evaluate the Gaussian density with mean ``mu`` and standard deviation ``sigma`` at ``x``.

    ``x`` may be a scalar or a NumPy array; the result follows NumPy
    broadcasting of the three arguments.
    """
    variance = sigma**2
    # Normalisation constant 1 / sqrt(2 * pi * sigma^2).
    scale = 1 / np.sqrt(2 * np.pi * variance)
    # Exponent -(x - mu)^2 / (2 * sigma^2).
    exponent = -0.5 / variance * (x - mu)**2
    return scale * np.exp(exponent)

def get_beta(epoch):
    """Return the learning rate for ``epoch``: ``BETA0`` scaled by ``exp(-epoch / NUM_EPOCHS)``."""
    decay = np.exp(-epoch / NUM_EPOCHS)
    return BETA0 * decay

def update_neurons(grid, best_match_idx, w, epoch): #update the neurons
    """Move every neuron toward the input ``w``, weighted by its distance to the best match.

    Implements the SOM update ``grid <- grid + beta(epoch) * h * (w - grid)``,
    where ``h`` is a Gaussian of the lattice distance between each neuron and
    the best-matching unit. The update is applied in place so callers that
    ignore the return value still see the change.

    Parameters
    ----------
    grid : ndarray
        Neuron weights. The first two axes index the neuron lattice and the
        last axis holds the weight vector. Must have a floating dtype, since
        it is updated in place.
    best_match_idx : sequence of int
        Lattice coordinates of the best-matching unit; only the first two
        entries (row, column) are used.
    w : ndarray
        Input vector, broadcastable against the last axis of ``grid``.
    epoch : int
        Current epoch; controls the learning rate (via ``get_beta``) and the
        neighbourhood width.

    Returns
    -------
    ndarray
        The same ``grid`` object, after the in-place update.
    """
    # Lattice coordinates come from the grid itself rather than the global
    # GRID_SIZE, so the function works for any grid passed in.
    rows, cols = np.ogrid[0:grid.shape[0], 0:grid.shape[1]]
    sq_dist = (rows - best_match_idx[0]) ** 2 + (cols - best_match_idx[1]) ** 2

    # Neighbourhood width shrinks from SIGMA0 with the same exponential
    # schedule that get_beta uses for the learning rate.
    sigma = SIGMA0 * np.exp(-epoch / NUM_EPOCHS)

    # Unnormalised Gaussian: equals 1 at the best-matching unit, so that neuron
    # moves by exactly beta * (w - weight).
    neighborhood = np.exp(-sq_dist / (2 * sigma ** 2))

    # The trailing axis lets the (rows, cols) neighbourhood broadcast over the
    # feature axis.
    grid += get_beta(epoch) * neighborhood[..., np.newaxis] * (w - grid)
    return grid

def get_pipeline(scaler=None):
    """Build the preprocessing pipeline, consisting of a single scaling step.

    Parameters
    ----------
    scaler : transformer, optional
        Scaler to use for the ``'scaler'`` step. When omitted, a fresh
        ``StandardScaler`` is created on each call.

    Returns
    -------
    Pipeline
        A pipeline with one step named ``'scaler'``.
    """
    # A default of ``StandardScaler()`` in the signature would be created once,
    # at definition time, and shared (fitted state included) by every pipeline
    # built with the default. Create a new one per call instead.
    if scaler is None:
        scaler = StandardScaler()
    return Pipeline([
        ('scaler', scaler)
    ])

## Import Dataset

In [4]:
from sklearn.datasets import fetch_rcv1

# Fetch the RCV1 dataset through scikit-learn.
rcv1 = fetch_rcv1()

# Feature matrix and target matrix of the fetched dataset.
X = rcv1.data
Y = rcv1.target

## Data Preprocessing

In [5]:
# Draw NUM_SAMPLES distinct rows. resample() samples WITH replacement by default,
# which can place the same document in the subsample more than once and then in
# both the train and the test split; replace=False prevents that.
X_resampled, Y_resampled = resample(X, Y, n_samples=NUM_SAMPLES, replace=False, random_state=42) #subsample data

X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=42) #split data into training and testing sets

pipeline = get_pipeline()
# Fit the scaler on the training split only, then reuse the fitted statistics
# for the test split. toarray() converts each split to a dense array first.
X_train_pipelined = pipeline.fit_transform(X_train.toarray()) #scale data
X_test_pipelined = pipeline.transform(X_test.toarray()) #scale data

## Self Organizing Map (SOM) Initialization

In [6]:
# Seeded generator so the initial weights are the same on every run, matching
# the fixed random_state used for resampling and splitting.
som_rng = np.random.default_rng(42)
# One weight vector per neuron, drawn uniformly from [0, 1); the last axis has
# one entry per input feature.
grid = som_rng.random((*GRID_SIZE, X_train_pipelined.shape[1])) #initialize the grid with random values

pprint.pprint(f'Grid rectangle is of width and height of {GRID_SIZE}')
pprint.pprint(f'Neuron grid is of shape {grid.shape}')

'Grid rectangle is of width and height of (23, 23)'
'Neuron grid is of shape (23, 23, 47236)'


## Training Self Organizing Map (SOM)

In [28]:
# NOTE(review): as written, this loop performs no learning. The best-matching-unit
# search and the update_neurons call inside it are commented out, so each
# iteration only reads one training sample into `x`; `grid` is never modified here.
for epoch in tqdm(range(NUM_EPOCHS), desc='Epochs', leave=True): #train the SOM
    shuffle_idx = np.random.permutation(X_train_pipelined.shape[0]) #shuffle the data
    for i in range(X_train_pipelined.shape[0]):
        x = X_train_pipelined[shuffle_idx[i]] #get the data
        # NOTE(review): if the next line is re-enabled, np.unravel_index must be given the
        # 2-D lattice shape (grid.shape[:2]). The norm is taken along axis=-1, so argmin
        # returns a flat index into the (rows, cols) distance map, not into the 3-D grid.
        # best_match_idx = np.unravel_index(np.argmin(np.linalg.norm(grid - x, axis=-1)), grid.shape) #find the index of the neuron with the smallest distance to the input
        # update_neurons(grid, best_match_idx, x, epoch) #update the neurons
        
        

Epochs: 100%|██████████| 1000/1000 [00:01<00:00, 729.10it/s]
