<div align="center">
    <h1>In The Name Of GOD</h1>
</div>

# RCV1 Dataset Visualization with Self Organizing Map

## Import Libraries

In [5]:
import numpy as np
from numpy.ma.core import ceil
from scipy.spatial import distance #distance calculation
from sklearn.utils import resample #shuffle data
from sklearn.preprocessing import MinMaxScaler, StandardScaler #normalization
from sklearn.pipeline import Pipeline #pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score #scoring
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from matplotlib import animation, colors
import pprint

## Helper Functions

In [6]:
def normal(x, mu, sigma):
    """Evaluate the Gaussian probability density N(mu, sigma**2) at x.

    Args:
        x: Point(s) at which the density is evaluated (scalar or NumPy array).
        mu: Mean of the distribution.
        sigma: Standard deviation; must be non-zero.

    Returns:
        The density value(s) at x.
    """
    variance = sigma**2
    # Normalisation constant 1 / sqrt(2 * pi * sigma^2).
    norm_const = 1 / np.sqrt(2 * np.pi * variance)
    deviation = x - mu
    return norm_const * np.exp(-0.5 / variance * deviation**2)

def euclidean_distance(x, y):
    """Return the Euclidean (L2) distance between the vectors x and y.

    Thin wrapper around scipy.spatial.distance.euclidean.
    """
    l2_distance = distance.euclidean(x, y)
    return l2_distance

def manhattan_distance(x, y):
    """Return the Manhattan (city-block, L1) distance between x and y.

    Thin wrapper around scipy.spatial.distance.cityblock.
    """
    l1_distance = distance.cityblock(x, y)
    return l1_distance

def get_pipeline(scaler=None):
    """Build a one-step preprocessing pipeline around a scaler.

    Args:
        scaler: Scaler/transformer instance used as the 'scaler' step.
            When omitted (None), a fresh StandardScaler is created for
            this call.

    Returns:
        A Pipeline with a single step named 'scaler'.
    """
    # A `StandardScaler()` default in the signature would be instantiated once,
    # at definition time, and shared by every pipeline built with the default;
    # fitting one pipeline would then silently change the others.
    if scaler is None:
        scaler = StandardScaler()
    return Pipeline([
        ('scaler', scaler)
    ])

## Hyperparameters

In [7]:
NUM_SAMPLES = 10000 # number of samples to use
NUM_NEURONS = (5 * np.sqrt(NUM_SAMPLES)) # number of neurons in the SOM rectangular grid (5 * sqrt(N) rule)
# Square grid: side = ceil(sqrt(NUM_NEURONS)), computed once and stored as plain
# Python ints so the tuple prints as (23, 23) regardless of the NumPy version.
GRID_SIZE = (int(ceil(np.sqrt(NUM_NEURONS))),) * 2 # size of the grid (rows, cols)
BETA0 = 0.5 # initial learning rate
MU = 0 # initial mu for normal distribution
SIGMA = 1 # initial sigma for normal distribution

## Import Dataset

In [8]:
from sklearn.datasets import fetch_rcv1

# Load the RCV1 corpus through scikit-learn's dataset fetcher.
rcv1 = fetch_rcv1()

# Feature matrix and label matrix; X is sparse (it is densified with
# .toarray() before scaling further down).
X = rcv1.data
Y = rcv1.target

## Data Preprocessing

In [10]:
# Draw NUM_SAMPLES rows *without* replacement. resample() bootstraps by default
# (replace=True), which can duplicate rows; a duplicated row could then end up
# in both the training and the test split and leak between them.
# With replace=False, n_samples must not exceed the number of rows in X.
X_resampled, Y_resampled = resample(X, Y, replace=False, n_samples=NUM_SAMPLES, random_state=42) #subsample data

# Hold out 20% of the subsample for testing.
X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=42) #split data into training and testing sets

# Fit the scaler on the training split only and reuse its statistics on the test split.
# The default StandardScaler centers the data, which is not supported on sparse
# input, hence the (memory-heavy) .toarray() densification.
pipeline = get_pipeline()
X_train_pipe_scaled = pipeline.fit_transform(X_train.toarray()) #fit + scale training data
X_test_pipe_scaled = pipeline.transform(X_test.toarray()) #scale test data with training statistics

## Self Organizing Map (SOM) Initialization

In [12]:
pprint.pprint(GRID_SIZE)

# Use a seeded generator so the initial map is identical on every run
# (42 matches the random_state used for resampling and splitting).
# NOTE(review): this allocates a single scalar per neuron; a SOM normally keeps
# one weight vector per neuron (shape GRID_SIZE + (n_features,)) - confirm
# against the training code that consumes `grid`.
grid = np.random.default_rng(42).random(GRID_SIZE) #initialize the grid with uniform random values in [0, 1)


(23, 23)
