# Clustering Analysis - K-Means

**Obiettivo:** Analisi non supervisionata per scoprire pattern negli incidenti di sicurezza

**Approccio:**
1. Caricamento dati
2. Determinazione numero ottimale di cluster (Elbow + Silhouette)
3. Training K-Means
4. Analisi dei cluster
5. Confronto con IncidentGrade
6. Salvataggio modello

## 1. Setup

In [1]:
import json
import os
import pickle
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter hides all library warnings, not just noisy ones — confirm that is intended.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')

print("Librerie importate con successo!")

Librerie importate con successo!


## 2. Caricamento e Preprocessing

In [2]:
# Clustering is run on the training split only: features from X_train.csv,
# and the IncidentGrade column of y_train.csv as the reference labels.
print("Caricamento dataset...")

X_train = pd.read_csv('../data/processed/X_train.csv')
labels_frame = pd.read_csv('../data/processed/y_train.csv')
y_train = labels_frame['IncidentGrade']

# Relative frequency of each grade, shown alongside the feature-matrix shape.
grade_share = y_train.value_counts(normalize=True)
print(f"X_train: {X_train.shape}")
print(f"\nDistribuzione IncidentGrade:\n{grade_share}")

Caricamento dataset...
X_train: (314230, 20)

Distribuzione IncidentGrade:
IncidentGrade
BenignPositive    0.485921
FalsePositive     0.301088
TruePositive      0.212990
Name: proportion, dtype: float64


In [3]:
# Standardize the features before clustering (important for K-Means, which
# works on distances between rows).
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)

# Sanity check: the global mean should be ~0 and the global std ~1.
overall_mean = X_scaled.mean()
overall_std = X_scaled.std()
print(f"Features standardizzate: {X_scaled.shape}")
print(f"Media features: {overall_mean:.2e}")
print(f"Std features: {overall_std:.2f}")

Features standardizzate: (314230, 20)
Media features: -6.25e-18
Std features: 1.00


## 3. Determinazione Numero Ottimale di Cluster

In [4]:
# Elbow method + silhouette score over a range of candidate k.
#
# The silhouette score needs pairwise distances, so its cost grows
# quadratically with the number of rows. On the full training matrix the
# recorded run of this cell was interrupted before a single k was printed.
# The score is therefore estimated on a fixed random subsample; the same seed
# is used for every k so the estimates are comparable across k.
# Inertia is still computed on all rows.
k_range = range(2, 11)
silhouette_sample_size = min(10_000, X_scaled.shape[0])
inertias = []
silhouette_scores = []

print("Calcolo metriche per diversi valori di k...\n")

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette = silhouette_score(
        X_scaled,
        kmeans.labels_,
        sample_size=silhouette_sample_size,
        random_state=42,
    )
    silhouette_scores.append(silhouette)
    print(f"k={k}: Inertia={kmeans.inertia_:.2f}, Silhouette={silhouette:.4f}")

Calcolo metriche per diversi valori di k...



KeyboardInterrupt: 

In [None]:
# Elbow + silhouette diagnostics, one panel each, sharing the same x axis (k).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (values, line format, y-axis label, title) for the left and right panel.
panels = [
    (inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (silhouette_scores, 'go-', 'Silhouette Score', 'Silhouette Score per k'),
]
for ax, (values, line_fmt, y_label, panel_title) in zip(axes, panels):
    ax.plot(k_range, values, line_fmt, linewidth=2, markersize=8)
    ax.set_xlabel('Numero di Cluster (k)')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# argmax returns a position in silhouette_scores; map it back to the actual k.
best_position = int(np.argmax(silhouette_scores))
best_k = k_range[best_position]
print(f"\nMiglior k secondo Silhouette Score: {best_k}")

## 4. Training K-Means

In [None]:
# Use k=3 so the clusters can be compared with the 3 IncidentGrade classes.
n_clusters = 3

print(f"Training K-Means con k={n_clusters}...\n")

kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
cluster_labels = kmeans.fit_predict(X_scaled)

# The silhouette score is quadratic in the number of rows, so it is estimated
# on a seeded random subsample instead of the full training matrix.
silhouette_sample_size = min(10_000, len(cluster_labels))
final_silhouette = silhouette_score(
    X_scaled,
    cluster_labels,
    sample_size=silhouette_sample_size,
    random_state=42,
)

print("Training completato!")
print(f"\nDistribuzione cluster:")
print(pd.Series(cluster_labels).value_counts().sort_index())
print(f"\nInertia: {kmeans.inertia_:.2f}")
print(f"Silhouette Score: {final_silhouette:.4f}")

## 5. Visualizzazione Cluster (PCA)

In [None]:
# Project the standardized features onto two principal components so the
# clusters can be drawn in 2D.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Share of total variance retained by the two plotted components.
explained_2d = pca.explained_variance_ratio_.sum()
print(f"Varianza spiegata dalle prime 2 componenti: {explained_2d:.2%}")

In [None]:
# Clusters vs. true grades in PCA space, drawn side by side on the same points.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_clusters, ax_grades = axes
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]

# Left panel: points coloured by K-Means cluster, with the centroids
# projected into the same 2D space and overlaid as red crosses.
centroids_2d = pca.transform(kmeans.cluster_centers_)
cluster_scatter = ax_clusters.scatter(pc1, pc2,
                                      c=cluster_labels, cmap='viridis',
                                      alpha=0.5, s=10)
ax_clusters.scatter(centroids_2d[:, 0], centroids_2d[:, 1],
                    c='red', marker='X', s=200, edgecolors='black', linewidths=2,
                    label='Centroidi')
ax_clusters.set_xlabel('PC1')
ax_clusters.set_ylabel('PC2')
ax_clusters.set_title('K-Means Clustering (PCA)')
ax_clusters.legend()
plt.colorbar(cluster_scatter, ax=ax_clusters)

# Right panel: the same points coloured by the real IncidentGrade.
# Grades are mapped to integers in order of first appearance in y_train.
unique_grades = y_train.unique()
grade_mapping = dict(zip(unique_grades, range(len(unique_grades))))
grade_numeric = y_train.map(grade_mapping)
grade_scatter = ax_grades.scatter(pc1, pc2,
                                  c=grade_numeric, cmap='coolwarm',
                                  alpha=0.5, s=10)
ax_grades.set_xlabel('PC1')
ax_grades.set_ylabel('PC2')
ax_grades.set_title('IncidentGrade Reale (PCA)')
grade_cbar = plt.colorbar(grade_scatter, ax=ax_grades)
grade_cbar.set_ticks(range(len(grade_mapping)))
grade_cbar.set_ticklabels(grade_mapping.keys())

plt.tight_layout()
plt.show()

## 6. Confronto Cluster vs IncidentGrade

In [None]:
# Contingency table: rows are K-Means clusters, columns are IncidentGrade values.
cluster_vs_grade = pd.crosstab(
    index=cluster_labels,
    columns=y_train,
    rownames=['Cluster'],
    colnames=['IncidentGrade'],
)

# Row-normalized version: each row sums to 1, giving the grade mix inside a cluster.
cluster_totals = cluster_vs_grade.sum(axis=1)
cluster_vs_grade_norm = cluster_vs_grade.div(cluster_totals, axis=0)

print("Cluster vs IncidentGrade (conteggi):")
print(cluster_vs_grade)
print("\nCluster vs IncidentGrade (percentuali per cluster):")
print(cluster_vs_grade_norm.round(3))

In [None]:
# Heatmap of the row-normalized crosstab (share of each grade within a cluster).
heat_fig, heat_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cluster_vs_grade_norm, annot=True, fmt='.2%', cmap='YlOrRd', ax=heat_ax)
heat_ax.set_title('Distribuzione IncidentGrade per Cluster')
heat_ax.set_ylabel('Cluster')
heat_ax.set_xlabel('IncidentGrade')
heat_fig.tight_layout()
plt.show()

In [None]:
# Agreement between the unsupervised clusters and the real IncidentGrade labels.
# LabelEncoder is imported in the notebook's top import cell rather than here,
# so all dependencies are declared in one place.
le = LabelEncoder()
y_encoded = le.fit_transform(y_train)

# Both scores compare partitions, so cluster ids do not have to line up
# numerically with the encoded grade ids.
ari = adjusted_rand_score(y_encoded, cluster_labels)
nmi = normalized_mutual_info_score(y_encoded, cluster_labels)

print("\nMetriche di accordo con IncidentGrade:")
print(f"  Adjusted Rand Index: {ari:.4f}")
print(f"  Normalized Mutual Information: {nmi:.4f}")
print("\n(Valori vicini a 1 = alta concordanza, 0 = casuale)")

## 7. Analisi Caratteristiche Cluster

In [None]:
# Attach the cluster id to a copy of the (unscaled) features.
X_with_clusters = X_train.assign(Cluster=cluster_labels)

# Per-cluster mean and std of every feature; columns are (feature, statistic) pairs.
cluster_stats = X_with_clusters.groupby('Cluster').agg(['mean', 'std'])

# The first 10 columns correspond to 5 features x 2 statistics each.
first_five_features = cluster_stats.iloc[:, :10]
print("Statistiche features per cluster (prime 5 features):")
print(first_five_features.round(2))

In [None]:
# Features that discriminate most between clusters.
#
# cluster_means stays in the original feature units for interpretation.
cluster_means = X_with_clusters.groupby('Cluster').mean()

# The ranking is computed on the standardized features instead. The variance
# of raw per-cluster means depends on each feature's unit/scale, so a feature
# with large raw values would rank first regardless of how well it separates
# the clusters. On standardized data all features are on a common scale.
scaled_cluster_means = (
    pd.DataFrame(X_scaled, columns=X_train.columns)
    .groupby(cluster_labels)
    .mean()
)
feature_variance = scaled_cluster_means.var(axis=0).sort_values(ascending=False)

print("Top 15 features più discriminanti tra cluster:")
print(feature_variance.head(15))

## 8. Salvataggio Modello

In [None]:
# Persist the fitted objects, the cluster assignments and the summary metrics
# under a dedicated K-Means folder.
model_dir = '../models/kmeans'
os.makedirs(model_dir, exist_ok=True)

# Fitted estimators (model, scaler, PCA), one pickle file each.
fitted_artifacts = {
    'model.pkl': kmeans,
    'scaler.pkl': scaler,
    'pca.pkl': pca,
}
for artifact_name, artifact in fitted_artifacts.items():
    with open(f'{model_dir}/{artifact_name}', 'wb') as f:
        pickle.dump(artifact, f)

# Cluster assignment of every training row next to its real grade.
pd.DataFrame({
    'cluster': cluster_labels,
    'incident_grade': y_train.values
}).to_csv(f'{model_dir}/cluster_assignments.csv', index=False)

# Feature discrimination scores (variance of the per-cluster means).
pd.DataFrame({
    'Feature': feature_variance.index,
    'Variance': feature_variance.values
}).to_csv(f'{model_dir}/feature_discrimination.csv', index=False)

# The silhouette score is quadratic in the number of rows, so it is estimated
# on a seeded random subsample; the sample size is stored with the metrics so
# the value can be interpreted later.
silhouette_sample_size = min(10_000, len(cluster_labels))
saved_silhouette = silhouette_score(
    X_scaled,
    cluster_labels,
    sample_size=silhouette_sample_size,
    random_state=42,
)

# Cast keys and values to built-in int so the mapping is JSON-serializable
# regardless of the integer types pandas returns.
cluster_size_counts = pd.Series(cluster_labels).value_counts().sort_index()
cluster_sizes = {
    int(cluster_id): int(size) for cluster_id, size in cluster_size_counts.items()
}

metrics = {
    'model_name': 'K-Means',
    'timestamp': datetime.now().isoformat(),
    'n_clusters': int(n_clusters),
    'inertia': float(kmeans.inertia_),
    'silhouette_score': float(saved_silhouette),
    'silhouette_sample_size': int(silhouette_sample_size),
    'adjusted_rand_index': float(ari),
    'normalized_mutual_info': float(nmi),
    'n_samples': int(len(X_train)),
    'n_features': int(X_train.shape[1]),
    'pca_variance_explained': float(pca.explained_variance_ratio_.sum()),
    'cluster_sizes': cluster_sizes
}

with open(f'{model_dir}/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"Modello salvato in {model_dir}/")
print("  - model.pkl")
print("  - scaler.pkl")
print("  - pca.pkl")
print("  - cluster_assignments.csv")
print("  - feature_discrimination.csv")
print("  - metrics.json")

## 9. Riepilogo

In [None]:
# Final text summary of the clustering run.
#
# The silhouette score is quadratic in the number of rows, so it is estimated
# on a seeded random subsample rather than recomputed on the full matrix.
silhouette_sample_size = min(10_000, len(cluster_labels))
summary_silhouette = silhouette_score(
    X_scaled,
    cluster_labels,
    sample_size=silhouette_sample_size,
    random_state=42,
)

print("=" * 70)
print("RIEPILOGO FINALE - K-MEANS CLUSTERING")
print("=" * 70)

print(f"\nDATASET:")
print(f"  Samples: {len(X_train):,}")
print(f"  Features: {X_train.shape[1]}")

print(f"\nCLUSTERING:")
print(f"  Numero cluster: {n_clusters}")
print(f"  Inertia: {kmeans.inertia_:.2f}")
print(f"  Silhouette Score: {summary_silhouette:.4f}")

print(f"\nCONFRONTO CON INCIDENTGRADE:")
print(f"  Adjusted Rand Index: {ari:.4f}")
print(f"  Normalized Mutual Info: {nmi:.4f}")

# Size of each cluster, absolute and as a percentage of all rows.
print(f"\nDISTRIBUZIONE CLUSTER:")
for cluster_id in range(n_clusters):
    count = (cluster_labels == cluster_id).sum()
    pct = count / len(cluster_labels) * 100
    print(f"  Cluster {cluster_id}: {count:,} ({pct:.1f}%)")

print("\n" + "=" * 70)