# Self Organizing Maps  
Scenario:  
You want to analyze a dataset of customer spending across multiple categories and visualize it in 2D to identify patterns in purchasing behavior.

In [None]:
#!pip install minisom

In [22]:
from minisom import MiniSom
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
import plotly.io as pio
pio.renderers.default = 'vscode'

# Load the data
data = pd.DataFrame({
    "Electronics": [200, 250, 300, 400, 450, 100, 150, 90, 80, 600],
    "Groceries": [800, 700, 900, 200, 250, 100, 150, 800, 900, 50],
    "Clothing": [50, 60, 70, 80, 90, 40, 50, 30, 20, 100]
})

data

Unnamed: 0,Electronics,Groceries,Clothing
0,200,800,50
1,250,700,60
2,300,900,70
3,400,200,80
4,450,250,90
5,100,100,40
6,150,150,50
7,90,800,30
8,80,900,20
9,600,50,100


In [25]:
fig = px.scatter_3d(
    data,
    x="Electronics",
    y="Groceries",
    z="Clothing",
    title="Customer Purchasing Visualization",
    template="plotly_dark"
)
fig.update_traces(marker=dict(size=8))  # Adjust point size
fig.show()

In [2]:
# Normalize the data
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(data)
data_normalized

array([[0.23076923, 0.88235294, 0.375     ],
       [0.32692308, 0.76470588, 0.5       ],
       [0.42307692, 1.        , 0.625     ],
       [0.61538462, 0.17647059, 0.75      ],
       [0.71153846, 0.23529412, 0.875     ],
       [0.03846154, 0.05882353, 0.25      ],
       [0.13461538, 0.11764706, 0.375     ],
       [0.01923077, 0.88235294, 0.125     ],
       [0.        , 1.        , 0.        ],
       [1.        , 0.        , 1.        ]])

In [3]:
# Initialize and Train the SOM
som = MiniSom(x = 5, y = 5, input_len = 3, sigma = 1.0, learning_rate = 0.5)
som.random_weights_init(data_normalized)
som.train_random(data_normalized, num_iteration = 100)

som

<minisom.MiniSom at 0x1abee529a90>

In [4]:
# Dataframe for Visualization
som_results = []
for i, x in enumerate(data_normalized):
    w = som.winner(x)
    som_results.append({'Customer': i + 1, 'Neuron X': w[0], 'Neuron Y': w[1]})

results_df = pd.DataFrame(som_results)
results_df

Unnamed: 0,Customer,Neuron X,Neuron Y
0,1,4,3
1,2,4,4
2,3,0,4
3,4,4,0
4,5,0,0
5,6,1,3
6,7,1,3
7,8,3,4
8,9,3,2
9,10,2,0


In [5]:
# Plot SOM using Plotly
fig = px.scatter(
    results_df,
    x="Neuron X",
    y="Neuron Y",
    text="Customer",
    title="Self-Organizing Map (SOM) - Customer Clustering",
    labels={"Neuron X": "Neuron X", "Neuron Y": "Neuron Y"},
    template="plotly_dark",
    size_max=15
)
fig.update_traces(textposition='top center')
fig.update_layout(showlegend=False)
fig.show()

# Metrics

In [7]:
# Quantization Error
quantization_error = np.mean([np.linalg.norm(x - som.get_weights()[som.winner(x)]) for x in data_normalized])
print(f'Quantization Error: {quantization_error}')

Quantization Error: 0.0337910764705688


In [16]:
# Grid dimensions from SOM initialization
grid_x = 5  # Set to your SOM's x dimension
grid_y = 5  # Set to your SOM's y dimension

# Compute Topographic Error
topographic_error = 0
for x in data_normalized:
    # Find the BMU
    bmu = som.winner(x)
    
    # Compute distances to all neurons
    distances = []
    for i in range(grid_x):
        for j in range(grid_y):
            distance = np.linalg.norm(x - som.get_weights()[i, j])
            distances.append(((i, j), distance))
    
    # Sort distances to find BMU and second BMU
    distances = sorted(distances, key=lambda d: d[1])
    bmu, second_bmu = distances[0][0], distances[1][0]
    
    # Check if BMU and second BMU are adjacent
    if abs(bmu[0] - second_bmu[0]) > 1 or abs(bmu[1] - second_bmu[1]) > 1:
        topographic_error += 1

# Normalize by the total number of data points
topographic_error /= len(data_normalized)
print(f'Topographic Error: {topographic_error}')


Topographic Error: 0.3


In [18]:
from sklearn.metrics import silhouette_score

# Assign cluster labels based on BMUs
cluster_labels = [
    som.winner(x)[0] * grid_y + som.winner(x)[1]  # Convert (Neuron X, Neuron Y) to a unique cluster ID
    for x in data_normalized
]

# Compute Silhouette Score
silhouette = silhouette_score(data_normalized, cluster_labels)
print(f'Silhouette Score: {silhouette}')

Silhouette Score: 0.15073204623387298
