# Unsupervised ML - Clustering and Anomaly Detection

In [8]:
import numpy as np
from dataclasses import dataclass
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#from utils import generate_blobs_dataset, centroids_animation

np.random.seed(42)

# Clustering

## k-means

In [9]:
# 3.1) 

def pairwise_distance(X, centroids):
    """Return the Euclidean distance between every sample and every centroid.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data points.
    centroids : array-like of shape (n_centroids, n_features)
        Cluster centres.

    Returns
    -------
    np.ndarray of shape (n_samples, n_centroids)
        Entry ``[i, j]`` is the 2-norm of ``X[i] - centroids[j]``.
    """
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    # Flatten anything that is not already (n, n_features) so 1-D inputs
    # (one scalar feature per sample) keep working as they did with the loop.
    if X.ndim != 2:
        X = X.reshape(X.shape[0], -1)
    if centroids.ndim != 2:
        centroids = centroids.reshape(centroids.shape[0], -1)

    # Broadcast to (n_samples, n_centroids, n_features) and reduce over the
    # feature axis: one vectorized call instead of a Python double loop.
    diff = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return np.linalg.norm(diff, axis=2)

In [10]:
class KMeansClustering:
    """k-means clustering with centroids initialised from random data points.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (stored as ``self.k``).
    max_iters : int, default 100
        Maximum number of update/assign iterations performed by ``fit``.
    tol : float, default 1e-6
        ``fit`` stops as soon as every centroid coordinate moves by less than
        this amount between two consecutive iterations.

    Attributes
    ----------
    centroids : np.ndarray or None
        Current centroids, shape (k, n_features); ``None`` before ``fit``.
    labels_ : np.ndarray or None
        Cluster index of each training point; ``None`` before ``fit``.
    history : list of np.ndarray
        Copy of the centroids after the initialisation and after each iteration.
    labels_history : list of np.ndarray
        Copy of the labels after the initialisation and after each iteration.
    """

    def __init__(self, n_clusters: int, max_iters: int = 100, tol: float = 1e-6):
        self.k = n_clusters
        self.max_iters = max_iters
        self.tol = tol
        self.centroids = None
        self.labels_ = None         # labels of the training points
        self.history = []           # history of the centroids
        self.labels_history = []    # history of the labels (exists before fit too)

    def _assign_clusters(self, X):
        """Return, for each row of X, the index of its nearest centroid."""
        distances = pairwise_distance(X, self.centroids)
        return np.argmin(distances, axis=1)

    def _update_centroids(self, X, labels):
        """Return the mean of the points assigned to each cluster.

        A cluster that currently has no points keeps its previous centroid;
        averaging an empty slice would otherwise yield NaN and corrupt every
        following distance computation.
        """
        new_centroids = np.array(self.centroids, dtype=float)  # fresh copy
        for i in range(self.k):
            members = X[labels == i]
            if members.shape[0] > 0:
                new_centroids[i] = members.mean(axis=0)
        return new_centroids

    def fit(self, X):
        """Run k-means on X and return ``self``.

        Uses the global ``np.random`` state to pick the initial centroids, so
        results are reproducible only if the caller seeds it beforehand.
        Raises whatever ``np.random.choice`` raises when ``k`` exceeds the
        number of samples (sampling is done without replacement).
        """
        X = np.asarray(X)
        self.history = []
        self.labels_history = []

        # Centroids are initialised to k distinct, randomly chosen data points.
        self.centroids = X[np.random.choice(X.shape[0], self.k, replace=False)]
        labels = self._assign_clusters(X)

        self.history.append(self.centroids.copy())
        self.labels_history.append(labels.copy())

        for _ in range(self.max_iters):
            new_centroids = self._update_centroids(X, labels)
            # Converged: no coordinate of any centroid moved by tol or more.
            if np.all(np.abs(new_centroids - self.centroids) < self.tol):
                break

            self.centroids = new_centroids
            labels = self._assign_clusters(X)

            self.history.append(self.centroids.copy())
            self.labels_history.append(labels.copy())

        self.labels_ = labels
        return self

#### Determine the optimal number of clusters

In [None]:
# ELBOW METHOD

def Wk(X, labels, centroids):
    """Return the total within-cluster sum of squared deviations.

    For each centroid index ``i`` the squared differences between the points
    labelled ``i`` and ``centroids[i]`` are summed; the per-cluster totals are
    then added together.
    """
    per_cluster = np.array(
        [
            np.sum((X[labels == idx] - center) ** 2)
            for idx, center in enumerate(centroids)
        ],
        dtype=float,
    )
    return np.sum(per_cluster)

def Wk_means(X, k_range, wkmean_list):
    """Show the W_k curve next to a scatter plot of the data.

    Parameters
    ----------
    X : array indexable as ``X[:, 0]`` / ``X[:, 1]``
        Data; only the first two columns are drawn.
    k_range : sequence
        Values of k, used as x coordinates and as x ticks.
    wkmean_list : sequence
        W_k value for each entry of ``k_range``.
    """
    _, (elbow_ax, data_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: W_k as a function of k.
    elbow_ax.plot(k_range, wkmean_list, marker="o")
    elbow_ax.set(
        title="W_k for Different Values of k",
        xlabel="Number of Clusters (k)",
        ylabel="W_k",
    )
    elbow_ax.set_xticks(k_range)
    elbow_ax.grid()

    # Right panel: the raw data, first feature against second feature.
    data_ax.scatter(X[:, 0], X[:, 1], s=10, alpha=0.7)
    data_ax.set(
        title="Scatter Plot of the Data",
        xlabel="Feature 1",
        ylabel="Feature 2",
    )

    plt.tight_layout()
    plt.show()
    

In [None]:
# GAP STATISTIC

def generate_reference_dataset(n_samples, n_features, lower_bound, upper_bound):
    """Draw an (n_samples, n_features) array uniformly between the given bounds.

    ``lower_bound`` and ``upper_bound`` are passed straight to
    ``np.random.uniform`` (global random state), so they may be scalars or
    per-feature sequences that broadcast against the output shape.
    """
    shape = (n_samples, n_features)
    return np.random.uniform(low=lower_bound, high=upper_bound, size=shape)

def optimal_k_with_gap(X, klist, n_ref_dataset=10):
    """Choose the number of clusters with the gap statistic.

    For every k in ``klist`` the data and each uniform reference dataset are
    clustered with ``KMeansClustering``; the gap is the mean of
    ``log(W_k)`` over the reference datasets minus ``log(W_k)`` of the data.
    The selected k is the first one whose gap is at least the next k's gap
    minus the next k's standard deviation of the reference ``log(W_k)``.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)
        Data to cluster.
    klist : sequence of int
        Candidate numbers of clusters, in the order they should be tested.
    n_ref_dataset : int, default 10
        Number of uniform reference datasets to generate.

    Returns
    -------
    optimal_k : int
        Selected k; falls back to the last entry of ``klist`` (with a printed
        warning) when no k satisfies the criterion.
    gap_list : list of float
        Gap statistic for each k.
    std_logWk_ref : np.ndarray of shape (len(klist),)
        Standard deviation of the reference ``log(W_k)`` values for each k.

    Raises
    ------
    ValueError
        If ``klist`` is empty.

    Notes
    -----
    Reseeds the global ``np.random`` state inside the loop (see HACK below).
    """
    klist = list(klist)
    if not klist:
        raise ValueError("klist must contain at least one value of k")

    n_samples, n_features = X.shape
    lower_bounds = np.min(X, axis=0)
    upper_bounds = np.max(X, axis=0)

    # 1) Generate the reference datasets, uniform over the bounding box of X.
    ref_datasets = [
        generate_reference_dataset(n_samples, n_features, lower_bounds, upper_bounds)
        for _ in range(n_ref_dataset)
    ]

    gap_list = []
    std_per_k = []
    # 2) Iterate over k.
    for k in klist:
        # Step 2.1: fit k-means on our data using k clusters.
        # HACK: fixed seed to ensure 'proper' centroids, just for the sake of the example.
        np.random.seed(4)
        kmeans = KMeansClustering(n_clusters=k)
        kmeans.fit(X)
        wk_data = Wk(X, kmeans.labels_, kmeans.centroids)

        # Step 2.2: fit k-means on each reference dataset using k clusters.
        log_wk_ref = []
        for X_ref in ref_datasets:
            kmeans_ref = KMeansClustering(n_clusters=k)
            kmeans_ref.fit(X_ref)
            log_wk_ref.append(
                np.log(Wk(X_ref, kmeans_ref.labels_, kmeans_ref.centroids))
            )

        # Gap statistic and its spread across the reference datasets, per k.
        gap = np.mean(log_wk_ref) - np.log(wk_data)
        gap_list.append(gap)
        std_per_k.append(np.std(log_wk_ref))
        print(f"Gap statistic for k={k}: {gap}")

    # Step 3: select the optimal k.
    std_logWk_ref = np.array(std_per_k)                  # shape (len(klist),)
    gap_minus_std = np.array(gap_list) - std_logWk_ref   # gap(k) - std(k)

    # Take the first k for which gap(k) >= gap(k+1) - std(k+1).
    optimal_k = None
    for k, gap_k, next_threshold in zip(klist, gap_list, gap_minus_std[1:]):
        if gap_k >= next_threshold:
            optimal_k = k
            break
    if optimal_k is None:
        optimal_k = klist[-1] # fallback
        print(f"Warning: optimal k not found. Using k={optimal_k}")

    return optimal_k, gap_list, std_logWk_ref

# Anomaly Detection

## Hotelling T^2 Test

#### Univariate

In [None]:
# Univariate example: score one candidate point by its squared distance from
# the sample mean, scaled by the sample variance, then visualise it.
np.random.seed(0)  # fixed seed so the sample below is reproducible
X = np.random.normal(size=(100))  # 100 draws with np.random.normal's default parameters

mean = np.mean(X)
var = np.var(X)  # np.var default (ddof=0), i.e. the population variance

outlier = np.array([5])  # candidate point to score

# Squared deviation from the estimated mean, divided by the estimated variance.
distance = (outlier - mean)**2 / var
print(f"Distance of outlier from mean: {distance}")

# Plot the sample and the candidate on a single horizontal line (y = 0),
# together with the estimated mean and a KDE of the sample.
plt.scatter(X, [0]*len(X), alpha=0.3, label='Normal Data')
plt.scatter(outlier, [0], color='red', alpha=0.6, label='Outlier')
plt.axvline(x=mean, color='green', linestyle='--', label='Estimated Mean')
sns.kdeplot(X, bw_adjust=0.5, fill=True, alpha=0.2, label='KDE')
plt.ylim(-0.2, 0.5)  # keep the y = 0 points visible below the density curve
plt.legend()

#### Multivariate