# Confronto Modelli su GUIDE Test Set

**Obiettivo:** Confrontare le performance di tutti i modelli addestrati sul dataset di test reale GUIDE_Test.csv

**Modelli:**
1. XGBoost
2. Random Forest
3. K-Means (confronto con ground truth)

**Pipeline:**
1. Caricamento e preprocessing GUIDE_Test.csv (stesso preprocessing del training)
2. Caricamento modelli salvati
3. Predizioni su test set reale
4. Confronto performance e feature importance

**Metriche:**
- Test Accuracy
- Test Macro F1-Score
- Per-class F1-Score
- Confusion Matrix
- Feature Importance (top features comuni)


## 1. Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    accuracy_score,
    silhouette_score,
    adjusted_rand_score,
    normalized_mutual_info_score
)
from sklearn.decomposition import PCA
import pickle
import json
import os
from pathlib import Path
from datetime import datetime
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' theme to all subsequent plots.
sns.set_style('whitegrid')

print("Librerie importate con successo!")


## 2. Preprocessing GUIDE_Test.csv


In [None]:
# Load the raw GUIDE test set from disk.
print("Caricamento GUIDE_Test.csv...")
df_test = pd.read_csv('../data/GUIDE_Test.csv')

print(f"Dataset test caricato: {len(df_test):,} righe, {len(df_test.columns)} colonne")

# Keep only the records that carry a target label.
df_test = df_test.dropna(subset=['IncidentGrade']).copy()

# Drop every column whose share of missing values exceeds 97%.
missing_pct = df_test.isnull().sum().div(len(df_test)).mul(100)
cols_to_drop = list(missing_pct.index[missing_pct > 97])
df_test = df_test.drop(columns=cols_to_drop)

# Columns listed for removal (the original note says these match the
# columns removed during training).
cols_to_remove = [
    # geography
    'State', 'City', 'CountryCode',
    # operating system
    'OSFamily', 'OSVersion',
    # device identifiers
    'DeviceId', 'DeviceName',
    # file identifiers
    'Sha256', 'FileName', 'FolderPath',
    # account identifiers
    'AccountObjectId', 'AccountName', 'AccountSid', 'AccountUpn',
    # network / e-mail identifiers
    'IpAddress', 'Url', 'NetworkMessageId', 'EmailClusterId',
    # registry
    'RegistryKey', 'RegistryValueName', 'RegistryValueData',
    # applications
    'ApplicationId', 'ApplicationName', 'OAuthApplicationId',
    # miscellaneous
    'ThreatFamily', 'ResourceIdName', 'ResourceType', 'Roles',
]

# Only the columns still present can be dropped (some may already be gone
# after the missing-value filter above).
cols_existing = [col for col in cols_to_remove if col in df_test.columns]
df_test = df_test.drop(columns=cols_existing)

# Remove duplicated evidence rows, keeping the first occurrence of each Id.
df_test = df_test.drop_duplicates(subset=['Id'], keep='first')

print(f"Dimensioni dopo pulizia: {df_test.shape}")
grade_distribution = df_test['IncidentGrade'].value_counts(normalize=True)
print(f"Distribuzione IncidentGrade:\n{grade_distribution}")


## 3. Caricamento Modelli e Predizioni


In [None]:
# Load the label encoders fitted during training.
# NOTE: pickle executes arbitrary code while loading; only open artifacts
# produced by this project.
with open('../data/processed/label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)

# Encode categorical columns with the training-time encoders.
categorical_cols = X_test.select_dtypes(include=['object', 'category']).columns.tolist()

for col in categorical_cols:
    if col in label_encoders:
        le = label_encoders[col]
        # A set gives O(1) membership tests; scanning the classes_ array for
        # every single value is O(n_classes) per row.
        known_values = set(le.classes_)
        # Values never seen by the encoder are collapsed into 'Other'.
        X_test[col] = X_test[col].apply(
            lambda x, known=known_values: x if x in known else 'Other'
        )
        # Register 'Other' on the encoder when it does not know it yet, so
        # that transform() does not raise on the replaced values.
        if 'Other' not in known_values:
            le.classes_ = np.append(le.classes_, 'Other')
        X_test[col] = le.transform(X_test[col].astype(str))
    else:
        # No encoder exists for this column: neutralise it with a constant.
        X_test[col] = 0

# Replace remaining missing values with the -999 sentinel.
X_test = X_test.fillna(-999)

# Only the header of the training matrix is needed to recover the expected
# column names and order, so read zero data rows instead of the whole file.
X_train_sample = pd.read_csv('../data/processed/X_train.csv', nrows=0)
training_cols = X_train_sample.columns.tolist()

# Add training columns missing from the test set (filled with 0), drop the
# extra ones and enforce the training column order, all in one step.
X_test = X_test.reindex(columns=training_cols, fill_value=0)

print(f"X_test finale: {X_test.shape}")
print(f"Match con training: {list(X_test.columns) == training_cols}")


In [None]:
# Incident duration in seconds: last timestamp minus first timestamp.
incident_agg['Duration_seconds'] = (
    incident_agg['Timestamp_max'].pipe(pd.to_datetime)
    .sub(incident_agg['Timestamp_min'].pipe(pd.to_datetime))
    .dt.total_seconds()
)

# Map the flattened "<column>_<aggregation>" names to readable feature names.
rename_map = {
    'IncidentGrade_first': 'IncidentGrade',
    'AlertId_nunique': 'NumAlerts',
    'Id_count': 'NumEvidences',
    'EntityType_nunique': 'NumEntityTypes',
    'EvidenceRole_nunique': 'NumEvidenceRoles',
    'Hour_min': 'Hour_First',
    'Hour_max': 'Hour_Last',
    'Hour_mean': 'Hour_Avg',
    'SuspicionLevel_<lambda>': 'NumWithSuspicion',
    'LastVerdict_<lambda>': 'NumWithVerdict',
}

# Rename, then discard the raw timestamp bounds (ignored when absent).
incident_agg = (
    incident_agg
    .rename(columns=rename_map)
    .drop(columns=['Timestamp_min', 'Timestamp_max'], errors='ignore')
)

# Split the target from the features; the incident identifier is not a feature.
y_test = incident_agg['IncidentGrade']
X_test = incident_agg.drop(columns=['IncidentId', 'IncidentGrade'])

print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")


In [None]:
# Aggregazione a livello Incident
def get_mode(x):
    """Return the most frequent value of the Series *x*.

    Falls back to the first element when ``x.mode()`` is empty (for example
    when every value is missing) and to ``None`` when *x* itself is empty.
    """
    mode = x.mode()
    if len(mode) > 0:
        return mode[0]
    if len(x) > 0:
        return x.iloc[0]
    return None

# Per-column aggregation used to collapse evidence rows into one row per
# incident. The two lambdas are kept anonymous on purpose: the flattened
# column names derived from them end in "_<lambda>".
agg_dict = {
    'IncidentGrade': 'first',
    'AlertId': 'nunique',
    'Id': 'count',
    'EntityType': 'nunique',
    'EvidenceRole': 'nunique',
    'Category': get_mode,
    'Hour': ['min', 'max', 'mean'],
    'DayOfWeek': get_mode,
    'IsWeekend': 'max',
    'Timestamp': ['min', 'max'],
    'SuspicionLevel': lambda values: values.notna().sum(),
    'LastVerdict': lambda values: values.notna().sum(),
}

# Every MITRE_* indicator column is summed within the incident.
mitre_cols = [col for col in df_test.columns if col.startswith('MITRE_')]
agg_dict.update(dict.fromkeys(mitre_cols, 'sum'))

incident_agg = df_test.groupby('IncidentId').agg(agg_dict).reset_index()

# Flatten the two-level column index into "<column>_<aggregation>" names;
# the trailing underscore left by an empty second level is stripped.
incident_agg.columns = [
    '_'.join(col).strip('_') if isinstance(col, tuple) else col
    for col in incident_agg.columns
]

print(f"Dataset aggregato: {incident_agg.shape}")


In [None]:
# Temporal features derived from the event timestamp.
df_test['Timestamp'] = df_test['Timestamp'].pipe(pd.to_datetime)
df_test['Hour'] = df_test['Timestamp'].dt.hour
df_test['DayOfWeek'] = df_test['Timestamp'].dt.weekday
# Days 5 and 6 (Saturday, Sunday) are flagged as weekend.
df_test['IsWeekend'] = df_test['DayOfWeek'].ge(5).astype(int)

print("Features temporali create")


In [None]:
# One-hot encoding MITRE
def encode_mitre(techniques_str, frequent_techs):
    """One-hot encode a ';'-separated string of MITRE technique codes.

    Args:
        techniques_str: Technique codes joined by ';'.
        frequent_techs: Codes that receive a dedicated ``MITRE_<code>`` flag.

    Returns:
        A dict with one 0/1 entry per frequent technique, a ``MITRE_unknown``
        flag, and ``MITRE_n_rare``: the number of distinct codes that are
        neither frequent nor 'unknown', capped at 5.
    """
    observed = set(techniques_str.split(';'))

    features = dict.fromkeys((f'MITRE_{tech}' for tech in frequent_techs), 0)
    features['MITRE_unknown'] = int('unknown' in observed)
    features['MITRE_n_rare'] = 0

    for tech in observed:
        if tech in frequent_techs:
            features[f'MITRE_{tech}'] = 1
            continue
        if tech != 'unknown':
            features['MITRE_n_rare'] += 1

    # Cap the rare-technique counter at 5.
    if features['MITRE_n_rare'] > 5:
        features['MITRE_n_rare'] = 5
    return features

# Build one row of MITRE indicators per evidence row.
# The frame must reuse df_test's index: df_test has been filtered and
# de-duplicated, so its index is no longer 0..n-1, and pd.concat(axis=1)
# aligns on index labels. With a fresh RangeIndex the indicators would be
# attached to the wrong rows and extra all-NaN rows would be created.
mitre_encoded = pd.DataFrame(
    [
        encode_mitre(tech, frequent_techniques)
        for tech in df_test['MitreTechniques_normalized']
    ],
    index=df_test.index,
)

df_test = pd.concat([df_test, mitre_encoded], axis=1)

print(f"Features MITRE create: {mitre_encoded.shape[1]}")


In [None]:
# Normalizza codici MITRE (stessa funzione del training)
def normalize_mitre(technique):
    """Normalize a ';'-separated list of MITRE technique codes.

    Missing input yields 'unknown'. Otherwise each code is stripped of
    surrounding whitespace, prefixed with 'T' when the prefix is absent, and
    truncated at the first '.' (dropping the sub-technique part); the literal
    'unknown' is passed through untouched. The result is the sorted,
    de-duplicated codes joined by ';'.
    """
    if pd.isna(technique):
        return 'unknown'

    normalized = set()
    for raw_code in str(technique).split(';'):
        code = raw_code.strip()
        if code != 'unknown':
            if not code.startswith('T'):
                code = 'T' + code
            if '.' in code:
                code = code.partition('.')[0]
        normalized.add(code)
    return ';'.join(sorted(normalized))

df_test['MitreTechniques_normalized'] = df_test['MitreTechniques'].apply(normalize_mitre)

# Count how often each technique code occurs across all rows.
all_techniques = [
    tech
    for techniques in df_test['MitreTechniques_normalized']
    for tech in techniques.split(';')
]
technique_counts = Counter(all_techniques)

# A technique is "frequent" when it occurs in at least 0.5% of the rows.
# NOTE(review): the frequent set is derived from the test data itself, so it
# may differ from the set selected on the training data - confirm this is
# intended.
min_occurrences = 0.005 * len(df_test)
frequent_techniques = [
    tech
    for tech, count in technique_counts.items()
    if tech != 'unknown' and count >= min_occurrences
]

print(f"Tecniche frequenti selezionate: {len(frequent_techniques)}")


## 4. Valutazione Performance su Test Set


In [None]:
models_dir = Path('../models')

# Per-model results, keyed by display name. A model whose model.pkl file is
# missing is simply skipped.
models_results = {}

# --- XGBoost ---------------------------------------------------------------
xgb_path = models_dir / 'xgboost' / 'model.pkl'
if xgb_path.exists():
    xgb_model = pickle.loads(xgb_path.read_bytes())
    xgb_le = pickle.loads((models_dir / 'xgboost' / 'label_encoder.pkl').read_bytes())

    # Encode the ground truth with the model's own label encoder.
    y_test_xgb = xgb_le.transform(y_test)
    y_pred_xgb = xgb_model.predict(X_test)

    models_results['XGBoost'] = dict(
        y_true=y_test_xgb,
        y_pred=y_pred_xgb,
        label_encoder=xgb_le,
        model=xgb_model,
    )
    print("✓ XGBoost caricato e predizioni completate")

# --- Random Forest ---------------------------------------------------------
rf_path = models_dir / 'random_forest' / 'model.pkl'
if rf_path.exists():
    rf_model = pickle.loads(rf_path.read_bytes())
    rf_le = pickle.loads((models_dir / 'random_forest' / 'label_encoder.pkl').read_bytes())

    y_test_rf = rf_le.transform(y_test)
    y_pred_rf = rf_model.predict(X_test)

    models_results['Random Forest'] = dict(
        y_true=y_test_rf,
        y_pred=y_pred_rf,
        label_encoder=rf_le,
        model=rf_model,
    )
    print("✓ Random Forest caricato e predizioni completate")

# --- K-Means ---------------------------------------------------------------
kmeans_path = models_dir / 'kmeans' / 'model.pkl'
if kmeans_path.exists():
    kmeans_model = pickle.loads(kmeans_path.read_bytes())
    kmeans_scaler = pickle.loads((models_dir / 'kmeans' / 'scaler.pkl').read_bytes())

    # Scale with the saved scaler before assigning clusters.
    X_test_scaled = kmeans_scaler.transform(X_test)
    y_pred_kmeans = kmeans_model.predict(X_test_scaled)

    # Integer-encode the ground truth so clusters can be compared against it.
    kmeans_le = LabelEncoder()
    y_test_kmeans = kmeans_le.fit_transform(y_test)

    models_results['K-Means'] = dict(
        y_true=y_test_kmeans,
        y_pred=y_pred_kmeans,
        label_encoder=kmeans_le,
        model=kmeans_model,
        X_scaled=X_test_scaled,
    )
    print("✓ K-Means caricato e predizioni completate")

print(f"\nModelli valutati: {list(models_results.keys())}")


In [None]:
# Compute the test metrics of the supervised models.
comparison_data = []

for model_name in ['XGBoost', 'Random Forest']:
    if model_name not in models_results:
        continue
    result = models_results[model_name]
    y_true, y_pred = result['y_true'], result['y_pred']

    # Passing the full list of encoded class ids guarantees one F1 value per
    # class of the encoder, even when a class is absent from both y_true and
    # y_pred. Without `labels`, the returned array would be shorter and
    # positional indexing would raise IndexError.
    class_ids = np.arange(len(result['label_encoder'].classes_))
    f1_per_class = f1_score(y_true, y_pred, labels=class_ids, average=None)

    row = {
        'Model': model_name,
        'Test Accuracy': accuracy_score(y_true, y_pred),
        'Test Macro F1': f1_score(y_true, y_pred, average='macro'),
    }
    # One "F1 Class <i>" column per class instead of three hard-coded ones.
    row.update({f'F1 Class {i}': score for i, score in enumerate(f1_per_class)})
    row['N Test'] = len(y_true)
    comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data)

print("=" * 90)
print("CONFRONTO MODELLI SU GUIDE_TEST.CSV")
print("=" * 90)
if comparison_df.empty:
    # No supervised model was loaded: sorting or reading the first row
    # would raise, so report the situation instead.
    print("Nessun modello supervisionato disponibile: nessuna metrica calcolata.")
else:
    # Best model first.
    comparison_df = comparison_df.sort_values('Test Macro F1', ascending=False)
    print(comparison_df.to_string(index=False))
    print(f"\n⭐ Best Model: {comparison_df.iloc[0]['Model']}")
    print(f"   Macro F1: {comparison_df.iloc[0]['Test Macro F1']:.4f}")


## 5. Classification Reports e Confusion Matrices


In [None]:
# Print a per-class classification report for each supervised model.
for model_name in ['XGBoost', 'Random Forest']:
    if model_name not in models_results:
        continue
    result = models_results[model_name]

    print("=" * 70)
    print(f"{model_name} - Classification Report")
    print("=" * 70)
    report_text = classification_report(
        result['y_true'],
        result['y_pred'],
        target_names=result['label_encoder'].classes_,
        digits=4,
    )
    print(report_text)
    print()


In [None]:
# Per-class F1-score, XGBoost vs Random Forest.
# The number of classes comes from the label encoder rather than being
# hard-coded to 3, and the figure is only created when both models are
# available (previously an empty figure was left behind otherwise).
if 'XGBoost' in models_results and 'Random Forest' in models_results:
    class_names = models_results['XGBoost']['label_encoder'].classes_
    n_classes = len(class_names)
    f1_columns = [f'F1 Class {i}' for i in range(n_classes)]

    # One row per model, one column per class.
    f1_by_model = comparison_df.set_index('Model')[f1_columns]
    xgb_f1 = f1_by_model.loc['XGBoost'].tolist()
    rf_f1 = f1_by_model.loc['Random Forest'].tolist()

    width = 0.35
    x = np.arange(n_classes)

    fig, ax = plt.subplots(figsize=(10, 6))
    # The two bar groups are shifted by half a bar width around each tick.
    ax.bar(x - width/2, xgb_f1, width, label='XGBoost', color='steelblue', edgecolor='black')
    ax.bar(x + width/2, rf_f1, width, label='Random Forest', color='forestgreen', edgecolor='black')

    ax.set_ylabel('F1-Score')
    ax.set_title('F1-Score per Classe')
    ax.set_xticks(x)
    ax.set_xticklabels(class_names)
    ax.set_ylim(0, 1)
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()


In [None]:
# Save the comparison report for the real test set.
# The best model is derived here from comparison_df (highest macro F1)
# instead of relying on a `best_model` variable created by another cell, so
# this cell cannot fail with a NameError or report a stale value.
if len(comparison_df):
    best_row = comparison_df.loc[comparison_df['Test Macro F1'].idxmax()]
else:
    best_row = None

report = {
    'timestamp': datetime.now().isoformat(),
    'test_dataset': 'GUIDE_Test.csv',
    'n_test_samples': int(len(y_test)),
    'n_features': int(X_test.shape[1]),
    'best_model': None if best_row is None else best_row['Model'],
    'best_test_f1': None if best_row is None else float(best_row['Test Macro F1']),
    'best_test_accuracy': None if best_row is None else float(best_row['Test Accuracy']),
    'models_comparison': comparison_df.to_dict(orient='records'),
}

# ROC AUC scores (per class and micro-average) for each model.
if roc_results:
    report['roc_auc_comparison'] = {}
    for model_name, roc_data in roc_results.items():
        class_names = roc_data['label_encoder'].classes_
        report['roc_auc_comparison'][model_name] = {
            'per_class': {
                # str()/float() make the values plain Python, JSON-safe types.
                str(class_name): float(roc_data['roc_auc'][i])
                for i, class_name in enumerate(class_names)
            },
            'micro_average': float(roc_data['roc_auc']['micro']),
        }

# Mean normalised importance of the ten highest-ranked shared features.
if len(feature_importance_dict) >= 2:
    report['top_features'] = top_features.head(10)['Mean'].to_dict()

# Clustering quality of K-Means against the ground truth.
if 'K-Means' in models_results:
    result = models_results['K-Means']
    ari = adjusted_rand_score(result['y_true'], result['y_pred'])
    nmi = normalized_mutual_info_score(result['y_true'], result['y_pred'])
    silhouette = silhouette_score(result['X_scaled'], result['y_pred'])

    report['kmeans_metrics'] = {
        'silhouette': float(silhouette),
        'ari': float(ari),
        'nmi': float(nmi)
    }

# Detailed classification report for each supervised model.
for model_name in ['XGBoost', 'Random Forest']:
    if model_name in models_results:
        result = models_results[model_name]
        report[f'{model_name.lower().replace(" ", "_")}_report'] = classification_report(
            result['y_true'],
            result['y_pred'],
            target_names=result['label_encoder'].classes_.tolist(),
            output_dict=True
        )

with open('../models/test_comparison_report.json', 'w') as f:
    json.dump(report, f, indent=2)

# Also save the comparison table as CSV.
comparison_df.to_csv('../models/test_models_comparison.csv', index=False)

# Save the ROC AUC scores in a separate, long-format CSV.
if roc_results:
    roc_df_data = []
    for model_name, roc_data in roc_results.items():
        for i, class_name in enumerate(roc_data['label_encoder'].classes_):
            roc_df_data.append({
                'Model': model_name,
                'Class': class_name,
                'AUC': roc_data['roc_auc'][i]
            })
        roc_df_data.append({
            'Model': model_name,
            'Class': 'Micro-average',
            'AUC': roc_data['roc_auc']['micro']
        })

    roc_df = pd.DataFrame(roc_df_data)
    roc_df.to_csv('../models/test_roc_auc_scores.csv', index=False)
    print("ROC AUC scores salvati in: ../models/test_roc_auc_scores.csv")

print("Report salvati:")
print("  - ../models/test_comparison_report.json")
print("  - ../models/test_models_comparison.csv")


In [None]:
# Micro-average ROC curve of every model on a single set of axes.
plt.figure(figsize=(10, 8))

colors_models = {'XGBoost': 'blue', 'Random Forest': 'green'}

for model_name, roc_data in roc_results.items():
    # Models without a dedicated colour are drawn in black.
    color = colors_models.get(model_name, 'black')
    micro_auc = roc_data['roc_auc']['micro']
    plt.plot(
        roc_data['fpr']['micro'],
        roc_data['tpr']['micro'],
        color=color,
        lw=2,
        label=f"{model_name} (AUC={micro_auc:.4f})",
    )

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', lw=1, label='Random Classifier')

plt.gca().set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve Comparison - Micro-average (All Classes)',
)
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Per-class ROC curves, one panel per class, models overlaid.
# The panel count follows the label encoder instead of assuming 3 classes.
if len(roc_results) >= 2:
    colors_models = {'XGBoost': 'blue', 'Random Forest': 'green'}

    # Class names are taken from the first model in roc_results.
    first_model = list(roc_results.keys())[0]
    class_names = roc_results[first_model]['label_encoder'].classes_
    n_classes = len(class_names)

    # squeeze=False always returns a 2-D array of axes, so flattening it
    # also covers the single-class case.
    fig, axes = plt.subplots(1, n_classes, figsize=(6 * n_classes, 5), squeeze=False)
    axes = axes.ravel()

    for class_idx in range(n_classes):
        ax = axes[class_idx]

        for model_name, roc_data in roc_results.items():
            color = colors_models.get(model_name, 'black')
            ax.plot(roc_data['fpr'][class_idx], roc_data['tpr'][class_idx],
                   color=color, lw=2,
                   label=f"{model_name} (AUC={roc_data['roc_auc'][class_idx]:.4f})")

        # Chance-level diagonal for reference.
        ax.plot([0, 1], [0, 1], 'k--', lw=1, alpha=0.5)

        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'ROC Curve - {class_names[class_idx]}')
        ax.legend(loc="lower right")
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# One-vs-rest ROC curves and AUC for every supervised model.
# Result layout: roc_results[model] = {'fpr', 'tpr', 'roc_auc', 'label_encoder'},
# where fpr/tpr/roc_auc are dicts keyed by class index plus the key "micro".
roc_results = {}

for model_name in ['XGBoost', 'Random Forest']:
    if model_name not in models_results:
        continue
    result = models_results[model_name]

    # Class ids come from the encoder instead of a hard-coded [0, 1, 2].
    # NOTE(review): assumes predict_proba columns follow the encoder's class
    # order (0..n-1) - confirm against the training notebooks.
    class_ids = np.arange(len(result['label_encoder'].classes_))

    # Predicted class probabilities.
    y_pred_proba = result['model'].predict_proba(X_test)

    # One indicator column per class.
    y_test_bin = label_binarize(result['y_true'], classes=class_ids)
    if len(class_ids) == 2 and y_test_bin.shape[1] == 1:
        # For two classes label_binarize returns a single column; expand it
        # so that it lines up with the two probability columns.
        y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
    n_classes = y_test_bin.shape[1]

    fpr = {}
    tpr = {}
    roc_auc = {}

    # Per-class curves.
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_pred_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    roc_results[model_name] = {
        'fpr': fpr,
        'tpr': tpr,
        'roc_auc': roc_auc,
        'label_encoder': result['label_encoder']
    }

print("ROC AUC Scores per modello:")
for model_name, roc_data in roc_results.items():
    print(f"\n{model_name}:")
    for i, class_name in enumerate(roc_data['label_encoder'].classes_):
        print(f"  {class_name}: {roc_data['roc_auc'][i]:.4f}")
    print(f"  Micro-average: {roc_data['roc_auc']['micro']:.4f}")


## 6.1 ROC Curves Comparison


In [None]:
# Side-by-side bar charts: test accuracy (left) and macro F1 (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

models = comparison_df['Model'].values
x = np.arange(len(models))

# (axes, metric column, bar colour, y-axis label, title) for each panel.
panels = [
    (axes[0], 'Test Accuracy', 'skyblue', 'Accuracy',
     'Test Accuracy su GUIDE_Test.csv'),
    (axes[1], 'Test Macro F1', 'coral', 'Macro F1-Score',
     'Test Macro F1-Score su GUIDE_Test.csv'),
]

for panel, metric, bar_color, y_label, panel_title in panels:
    panel.bar(x, comparison_df[metric], color=bar_color, edgecolor='black')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.set_xticks(x)
    panel.set_xticklabels(models)
    panel.set_ylim(0, 1)
    panel.grid(axis='y', alpha=0.3)
    # Print the value just above each bar.
    for i, v in enumerate(comparison_df[metric]):
        panel.text(i, v + 0.02, f'{v:.4f}', ha='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()


## 6. Visualizzazione Performance


In [None]:
# Row-normalised confusion matrices, one panel per supervised model.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for idx, model_name in enumerate(['XGBoost', 'Random Forest']):
    if model_name not in models_results:
        continue
    result = models_results[model_name]
    class_names = result['label_encoder'].classes_

    # Pin the label set to the encoder's classes so the matrix always has
    # one row/column per tick label, even when a class is missing from both
    # y_true and y_pred.
    cm = confusion_matrix(
        result['y_true'],
        result['y_pred'],
        labels=np.arange(len(class_names)),
    )

    # Normalise each row by its support. Rows with zero support stay at 0
    # instead of producing NaN through a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    sns.heatmap(cm_norm, annot=True, fmt='.2%',
               cmap='Blues' if idx == 0 else 'Greens',
               xticklabels=class_names,
               yticklabels=class_names,
               ax=axes[idx])
    axes[idx].set_title(f'{model_name} - Confusion Matrix')
    axes[idx].set_ylabel('True Label')
    axes[idx].set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()


## 7. Feature Importance Comparison


In [None]:
# Load the saved feature-importance table of each supervised model.
# NOTE(review): `supervised_models` is not defined in any cell visible in
# this notebook; its keys are used as sub-directory names of models_dir -
# confirm where it is created.
feature_importance_dict = {}

for model_name in supervised_models.keys():
    fi_file = models_dir / model_name / 'feature_importance.csv'
    if not fi_file.exists():
        # Models without a saved importance file are skipped.
        continue
    fi_df = pd.read_csv(fi_file)
    # Keep a Series of importances indexed by feature name.
    feature_importance_dict[model_name] = fi_df.set_index('Feature')['Importance']

loaded_models = list(feature_importance_dict.keys())
print(f"Feature importance caricata per: {loaded_models}")

In [None]:
# Rank the features shared across models (the top 15 are kept).
if len(feature_importance_dict) >= 2:
    # One column per model, one row per feature.
    combined_fi = pd.DataFrame(feature_importance_dict)

    # Rescale each model's importances to sum to 1, then average per feature.
    column_totals = combined_fi.sum(axis=0)
    combined_fi_norm = combined_fi / column_totals
    combined_fi_norm['Mean'] = combined_fi_norm.mean(axis=1)
    top_features = combined_fi_norm.nlargest(15, 'Mean')

    print("Top 15 Features (media normalizzata):")
    print(top_features[['Mean']].round(4))

In [None]:
# Per-class F1-score, XGBoost vs Random Forest.
fig, ax = plt.subplots(figsize=(10, 6))

width = 0.35

if 'XGBoost' in models_results and 'Random Forest' in models_results:
    # The class count follows the label encoder.
    class_names = models_results['XGBoost']['label_encoder'].classes_
    n_classes = len(class_names)
    x = np.arange(n_classes)

    # Read each model's per-class scores from its row of comparison_df.
    xgb_row = comparison_df[comparison_df['Model'] == 'XGBoost']
    rf_row = comparison_df[comparison_df['Model'] == 'Random Forest']
    xgb_f1 = []
    rf_f1 = []
    for i in range(n_classes):
        xgb_f1.append(xgb_row[f'F1 Class {i}'].values[0])
        rf_f1.append(rf_row[f'F1 Class {i}'].values[0])

    # The two bar groups are shifted by half a bar width around each tick.
    ax.bar(x - width/2, xgb_f1, width, label='XGBoost', color='steelblue', edgecolor='black')
    ax.bar(x + width/2, rf_f1, width, label='Random Forest', color='forestgreen', edgecolor='black')

    ax.set_ylabel('F1-Score')
    ax.set_title('F1-Score per Classe')
    ax.set_xticks(x)
    ax.set_xticklabels(class_names)
    ax.set_ylim(0, 1)
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()


## 8. K-Means Analysis su Test Set


In [None]:
if 'K-Means' in models_results:
    result = models_results['K-Means']
    cluster_labels = result['y_pred']

    # Internal quality (silhouette) and agreement with the ground truth
    # (adjusted Rand index, normalised mutual information).
    silhouette = silhouette_score(result['X_scaled'], cluster_labels)
    ari = adjusted_rand_score(result['y_true'], cluster_labels)
    nmi = normalized_mutual_info_score(result['y_true'], cluster_labels)

    print("=" * 80)
    print("K-MEANS CLUSTERING SU GUIDE_TEST.CSV")
    print("=" * 80)
    print(f"Numero cluster: {result['model'].n_clusters}")
    print(f"Silhouette Score: {silhouette:.4f}")
    print(f"Inertia: {result['model'].inertia_:.2f}")
    print("\nConfronto con IncidentGrade (ground truth):")
    print(f"  Adjusted Rand Index: {ari:.4f}")
    print(f"  Normalized Mutual Info: {nmi:.4f}")

    # Size of each cluster, absolute and as a percentage of all samples.
    unique, counts = np.unique(cluster_labels, return_counts=True)
    print("\nDistribuzione cluster:")
    for cluster_id, count in zip(unique, counts):
        pct = count / len(cluster_labels) * 100
        print(f"  Cluster {cluster_id}: {count:,} ({pct:.1f}%)")

    # Contingency table cluster x IncidentGrade, normalised per cluster so
    # that each row sums to 1.
    cluster_vs_grade = pd.crosstab(
        cluster_labels, y_test,
        rownames=['Cluster'],
        colnames=['IncidentGrade']
    )
    cluster_sizes = cluster_vs_grade.sum(axis=1)
    cluster_vs_grade_norm = cluster_vs_grade.div(cluster_sizes, axis=0)

    print("\nDistribuzione IncidentGrade per Cluster:")
    print(cluster_vs_grade_norm.round(3))

    # Heatmap of the normalised table.
    plt.figure(figsize=(10, 6))
    sns.heatmap(cluster_vs_grade_norm, annot=True, fmt='.2%', cmap='YlOrRd')
    plt.title('Distribuzione IncidentGrade per Cluster (GUIDE_Test)')
    plt.ylabel('Cluster')
    plt.xlabel('IncidentGrade')
    plt.tight_layout()
    plt.show()


## 9. Riepilogo e Raccomandazioni


In [None]:
# Per-class ROC curves, one panel per class, models overlaid.
if len(roc_results) >= 2:
    # Class names (and their count) are taken from the first model.
    first_model = list(roc_results.keys())[0]
    class_names = roc_results[first_model]['label_encoder'].classes_
    n_classes = len(class_names)

    fig, axes = plt.subplots(1, n_classes, figsize=(6 * n_classes, 5))

    # With a single panel plt.subplots returns a bare Axes, not an array.
    if n_classes == 1:
        axes = [axes]

    colors_models = {'XGBoost': 'blue', 'Random Forest': 'green'}

    for class_idx, (ax, class_name) in enumerate(zip(axes, class_names)):
        for model_name, roc_data in roc_results.items():
            class_auc = roc_data['roc_auc'][class_idx]
            ax.plot(
                roc_data['fpr'][class_idx],
                roc_data['tpr'][class_idx],
                color=colors_models.get(model_name, 'black'),
                lw=2,
                label=f"{model_name} (AUC={class_auc:.4f})",
            )

        # Chance-level diagonal for reference.
        ax.plot([0, 1], [0, 1], 'k--', lw=1, alpha=0.5)

        ax.set(
            xlim=(0.0, 1.0),
            ylim=(0.0, 1.05),
            xlabel='False Positive Rate',
            ylabel='True Positive Rate',
            title=f'ROC Curve - {class_name}',
        )
        ax.legend(loc="lower right")
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()


## 10. Salva Report


In [None]:
# Bar charts comparing ROC AUC: per class (left) and micro-average (right).
if len(roc_results) >= 2:
    models_list = list(roc_results)
    class_names = roc_results[models_list[0]]['label_encoder'].classes_
    n_classes = len(class_names)

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    per_class_ax, micro_ax = axes

    # --- Left panel: AUC per class, one bar group per model ---
    x = np.arange(n_classes)
    width = 0.35

    for idx, model_name in enumerate(models_list):
        model_auc = roc_results[model_name]['roc_auc']
        auc_scores = [model_auc[i] for i in range(n_classes)]
        # Shift the bars by half a width to the left (first model) or to the
        # right (second model) of each tick.
        offset = width * (idx - 0.5)
        color = 'forestgreen' if idx != 0 else 'steelblue'
        per_class_ax.bar(x + offset, auc_scores, width, label=model_name,
                         color=color, edgecolor='black')

    per_class_ax.set_ylabel('AUC Score')
    per_class_ax.set_title('ROC AUC per Classe')
    per_class_ax.set_xticks(x)
    per_class_ax.set_xticklabels(class_names)
    per_class_ax.set_ylim([0, 1])
    per_class_ax.legend()
    per_class_ax.grid(axis='y', alpha=0.3)

    # --- Right panel: micro-average AUC per model ---
    micro_aucs = [roc_results[model]['roc_auc']['micro'] for model in models_list]
    colors_bar = ['steelblue', 'forestgreen']
    positions = range(len(models_list))
    micro_ax.bar(positions, micro_aucs, color=colors_bar, edgecolor='black')
    micro_ax.set_ylabel('Micro-average AUC')
    micro_ax.set_title('ROC AUC Micro-average')
    micro_ax.set_xticks(positions)
    micro_ax.set_xticklabels(models_list)
    micro_ax.set_ylim([0, 1])
    micro_ax.grid(axis='y', alpha=0.3)

    # Print the value just above each bar.
    for i, v in enumerate(micro_aucs):
        micro_ax.text(i, v + 0.02, f'{v:.4f}', ha='center', fontweight='bold')

    plt.tight_layout()
    plt.show()
