# Feature Engineering v2 - GUIDE Dataset

**Obiettivo:** Preparare il dataset GUIDE per ML con gestione integrata delle MITRE Techniques.

**Pipeline:**
1. Caricamento e pulizia
2. Processing MITRE Techniques (normalizzazione + one-hot encoding)
3. Features temporali
4. Aggregazione Evidence → Incident
5. Train/Test split e salvataggio

## 1. Setup

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

print("Librerie importate con successo!")

Librerie importate con successo!


## 2. Caricamento e Pulizia

In [2]:
print("Caricamento dataset...")
df = pd.read_csv('../data/GUIDE_Train.csv')

print(f"Dataset caricato: {df.shape[0]:,} righe, {df.shape[1]} colonne")

# Rimuovi record senza target
df = df[df['IncidentGrade'].notna()].copy()

# Rimuovi colonne con >97% missing
missing_pct = (df.isnull().sum() / len(df)) * 100
cols_to_drop = missing_pct[missing_pct > 97].index.tolist()
df = df.drop(columns=cols_to_drop)

# Rimuovi colonne geografiche, tecniche e di dettaglio non utili
# (alta cardinalità, dati anonimizzati o troppo specifici)
cols_to_remove = [
    'State', 'City', 'CountryCode',
    'OSFamily', 'OSVersion', 
    'DeviceId', 'DeviceName',
    'Sha256', 'FileName', 'FolderPath',
    'AccountObjectId', 'AccountName', 'AccountSid', 'AccountUpn',
    'IpAddress', 'Url', 'NetworkMessageId', 'EmailClusterId',
    'RegistryKey', 'RegistryValueName', 'RegistryValueData',
    'ApplicationId', 'ApplicationName', 'OAuthApplicationId',
    'ThreatFamily', 'ResourceIdName', 'ResourceType', 'Roles'
]

# Rimuovi solo quelle che esistono nel dataframe
cols_existing = [col for col in cols_to_remove if col in df.columns]
if cols_existing:
    df = df.drop(columns=cols_existing)
    print(f"Colonne dettaglio rimosse: {len(cols_existing)}")

# Rimuovi duplicati
df = df.drop_duplicates(subset=['Id'], keep='first')

print(f"Dimensioni dopo pulizia: {df.shape}")

Caricamento dataset...
Dataset caricato: 9,516,837 righe, 45 colonne
Colonne dettaglio rimosse: 24
Dimensioni dopo pulizia: (707108, 14)


## 3. Processing MITRE Techniques

In [3]:
# Normalizza codici MITRE
def normalize_mitre(technique):
    if pd.isna(technique):
        return 'unknown'
    techniques = str(technique).split(';')
    normalized = []
    for t in techniques:
        t = t.strip()
        if not t.startswith('T') and t != 'unknown':
            t = 'T' + t
        if '.' in t and t != 'unknown':
            t = t.split('.')[0]
        normalized.append(t)
    # Rimuovi duplicati e ordina
    return ';'.join(sorted(set(normalized)))

df['MitreTechniques_normalized'] = df['MitreTechniques'].apply(normalize_mitre)

# Conta occorrenze tecniche
all_techniques = []
for techniques in df['MitreTechniques_normalized']:
    all_techniques.extend(techniques.split(';'))
technique_counts = Counter(all_techniques)

# Seleziona tecniche frequenti (>0.5% del dataset)
min_occurrences = len(df) * 0.005
frequent_techniques = [tech for tech, count in technique_counts.items() 
                      if count >= min_occurrences and tech != 'unknown']

print(f"Tecniche frequenti selezionate: {len(frequent_techniques)}")
print(f"Top 10: {sorted(technique_counts.items(), key=lambda x: x[1], reverse=True)[:10]}")

Tecniche frequenti selezionate: 5
Top 10: [('unknown', 461504), ('T1078', 99734), ('T1566', 92722), ('T1110', 11447), ('T1059', 4983), ('T1003', 3739), ('T1559', 3376), ('T1106', 3312), ('T1087', 3104), ('T1485', 3050)]


In [4]:
# One-hot encoding delle tecniche MITRE a livello Evidence
def encode_mitre(techniques_str, frequent_techs):
    techniques = set(techniques_str.split(';'))
    features = {f'MITRE_{tech}': 0 for tech in frequent_techs}
    features['MITRE_unknown'] = 1 if 'unknown' in techniques else 0
    features['MITRE_n_rare'] = 0
    
    for tech in techniques:
        if tech in frequent_techs:
            features[f'MITRE_{tech}'] = 1
        elif tech != 'unknown':
            features['MITRE_n_rare'] += 1
    
    features['MITRE_n_rare'] = min(features['MITRE_n_rare'], 5)
    return features

mitre_encoded = pd.DataFrame([
    encode_mitre(tech, frequent_techniques) 
    for tech in df['MitreTechniques_normalized']
])

# Aggiungi al dataframe principale
df = pd.concat([df, mitre_encoded], axis=1)

print(f"Features MITRE create: {mitre_encoded.shape[1]}")
print(f"Shape dataset: {df.shape}")

Features MITRE create: 7
Shape dataset: (1173396, 22)


## 4. Features Temporali

In [5]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Hour'] = df['Timestamp'].dt.hour
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek
df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)

print("Features temporali create")

Features temporali create


## 5. Aggregazione a Livello Incident

In [6]:
def get_mode(x):
    mode = x.mode()
    return mode[0] if len(mode) > 0 else x.iloc[0] if len(x) > 0 else None

# Prepara aggregazioni
agg_dict = {
    'IncidentGrade': 'first',
    'AlertId': 'nunique',
    'Id': 'count',
    'EntityType': 'nunique',
    'EvidenceRole': 'nunique',
    'Category': get_mode,
    'Hour': ['min', 'max', 'mean'],
    'DayOfWeek': get_mode,
    'IsWeekend': 'max',
    'Timestamp': ['min', 'max'],
    'SuspicionLevel': lambda x: x.notna().sum(),
    'LastVerdict': lambda x: x.notna().sum(),
}

# Aggiungi colonne MITRE alle aggregazioni (somma per incident)
mitre_cols = [col for col in df.columns if col.startswith('MITRE_')]
for col in mitre_cols:
    agg_dict[col] = 'sum'

# Esegui aggregazione
incident_agg = df.groupby('IncidentId').agg(agg_dict).reset_index()

# Flatten colonne multi-livello
incident_agg.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col 
                        for col in incident_agg.columns.values]

print(f"Dataset aggregato: {incident_agg.shape}")


Dataset aggregato: (448901, 23)


In [7]:
# Calcola durata e rinomina colonne
incident_agg['Duration_seconds'] = (
    pd.to_datetime(incident_agg['Timestamp_max']) - 
    pd.to_datetime(incident_agg['Timestamp_min'])
).dt.total_seconds()

rename_map = {
    'AlertId_nunique': 'NumAlerts',
    'Id_count': 'NumEvidences',
    'EntityType_nunique': 'NumEntityTypes',
    'EvidenceRole_nunique': 'NumEvidenceRoles',
    'Hour_min': 'Hour_First',
    'Hour_max': 'Hour_Last',
    'Hour_mean': 'Hour_Avg',
    'SuspicionLevel_<lambda>': 'NumWithSuspicion',
    'LastVerdict_<lambda>': 'NumWithVerdict',
    'IncidentGrade_first': 'IncidentGrade',
}

incident_agg = incident_agg.rename(columns=rename_map)
incident_agg = incident_agg.drop(columns=['Timestamp_min', 'Timestamp_max'], errors='ignore')

print(f"Features finali: {incident_agg.shape[1] - 2}")  # -2 per ID e target

Features finali: 20


## 6. Encoding Categorici e Split

In [8]:
# Separa features e target
X = incident_agg.drop(columns=['IncidentId', 'IncidentGrade'])
y = incident_agg['IncidentGrade']

# Identifica colonne categoriche
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Riduzione cardinalità alte
for col in categorical_cols:
    if X[col].nunique() > 100:
        top_values = X[col].value_counts().head(50).index
        X[col] = X[col].apply(lambda x: x if x in top_values else 'Other')

# Label encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Gestisci missing
X = X.fillna(-999)

print(f"Features finali: {X.shape}")
print(f"Target: {y.shape}")
print(f"\nDistribuzione target:\n{y.value_counts(normalize=True)}")

Features finali: (448901, 20)
Target: (448901,)

Distribuzione target:
IncidentGrade
BenignPositive    0.485922
FalsePositive     0.301086
TruePositive      0.212991
Name: proportion, dtype: float64


In [9]:
# Split stratificato
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"\nDistribuzione y_train:\n{y_train.value_counts(normalize=True)}")

X_train: (314230, 20)
X_test: (134671, 20)

Distribuzione y_train:
IncidentGrade
BenignPositive    0.485921
FalsePositive     0.301088
TruePositive      0.212990
Name: proportion, dtype: float64


## 7. Salvataggio

In [10]:
os.makedirs('../data/processed', exist_ok=True)

X_train.to_csv('../data/processed/X_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False, header=['IncidentGrade'])
y_test.to_csv('../data/processed/y_test.csv', index=False, header=['IncidentGrade'])

with open('../data/processed/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

incident_agg.to_csv('../data/processed/incident_features.csv', index=False)

print("Dataset salvati in ../data/processed/")
print(f"  - X_train.csv: {X_train.shape}")
print(f"  - X_test.csv: {X_test.shape}")
print(f"  - Features totali: {X_train.shape[1]}")

Dataset salvati in ../data/processed/
  - X_train.csv: (314230, 20)
  - X_test.csv: (134671, 20)
  - Features totali: 20
