In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix
)

# -----------------------------
# Carregar dataset
# -----------------------------
df = pd.read_csv('Dataset/df_sample.csv')
print("LIMPEZA E PRÉ-PROCESSAMENTO DO DATASET")
print(f"\nDataset original: {df.shape}")
print(f"Duplicatas: {df.duplicated().sum()}")

# -----------------------------
# Separar features e target
# -----------------------------
X = df.drop(columns=['Label'])
y = df['Label']

# -----------------------------
# Verificar distribuição de classes
# -----------------------------
print(f"\nDistribuição de classes:")
print(f"   Label 0 (Normal): {(y == 0).sum()} ({100*(y == 0).sum()/len(y):.1f}%)")
print(f"   Label 1 (Ataque): {(y == 1).sum()} ({100*(y == 1).sum()/len(y):.1f}%)")

print("\n" + "=" * 70)
print("CROSS-VALIDATION - RANDOM FOREST")
print("=" * 70)

# -----------------------------
# Funções de métricas
# -----------------------------
def if_precision(y_true, y_pred): return precision_score(y_true, y_pred)
def if_recall(y_true, y_pred): return recall_score(y_true, y_pred)
def if_f1(y_true, y_pred): return f1_score(y_true, y_pred)
def if_roc_auc(y_true, y_pred): return roc_auc_score(y_true, y_pred)
def if_pr_auc(y_true, y_pred): return average_precision_score(y_true, y_pred)

scoring = {
    'accuracy': 'accuracy',
    'precision': make_scorer(if_precision),
    'recall': make_scorer(if_recall),
    'f1': make_scorer(if_f1),
    'roc_auc': make_scorer(if_roc_auc),
    'pr_auc': make_scorer(if_pr_auc)
}

# -----------------------------
# Random Forest com StratifiedKFold
# -----------------------------
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_scores = cross_validate(rf, X, y, cv=cv, scoring=scoring)

print("\nResultados Random Forest:\n")
for metric in scoring.keys():
    mean = np.mean(rf_scores[f'test_{metric}'])
    std = np.std(rf_scores[f'test_{metric}'])
    print(f"{metric.upper():<12} Média = {mean:.4f} | Desvio padrão = {std:.4f}")

# -----------------------------
# Matriz de Confusão - Random Forest
# -----------------------------
rf_preds = np.zeros(len(y))
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    rf.fit(X_train, y_train)
    rf_preds[test_idx] = rf.predict(X_test)

rf_cm = confusion_matrix(y, rf_preds)
print("\nMatriz de Confusão - Random Forest:")
print(rf_cm)

# =====================================================================
# ======================  ISOLATION FOREST ============================
# =====================================================================

print("\n" + "=" * 70)
print("CROSS-VALIDATION - ISOLATION FOREST (APERFEIÇOADO)")
print("=" * 70)

# Treinar apenas em dados benignos
X_benign = X[y == 0]

# Padronizar 
scaler = StandardScaler()
X_benign_scaled = scaler.fit_transform(X_benign)
X_scaled = scaler.transform(X)

# Isolation Forest
iso = IsolationForest(
    n_estimators=500,
    max_samples=0.6,
    bootstrap=True,
    contamination='auto',   
    random_state=42,
    n_jobs=-1
)

iso_metrics = {m: [] for m in scoring.keys()}
iso_preds = np.zeros(len(y))

# Cross-validation manual (estratificada)
for train_idx, test_idx in cv.split(X_scaled, y):
    X_train = X_scaled[train_idx]
    y_train = y.iloc[train_idx]

    # Treina apenas com benignos do fold
    X_train_benign = X_train[y_train == 0]
    iso.fit(X_train_benign)

    X_test = X_scaled[test_idx]
    y_test = y.iloc[test_idx]

    pred = iso.predict(X_test)
    pred = np.where(pred == -1, 1, 0)  # -1 → ataque
    iso_preds[test_idx] = pred

    # Métricas
    iso_metrics['accuracy'].append(np.mean(pred == y_test))
    iso_metrics['precision'].append(precision_score(y_test, pred, zero_division=0))
    iso_metrics['recall'].append(recall_score(y_test, pred, zero_division=0))
    iso_metrics['f1'].append(f1_score(y_test, pred, zero_division=0))
    iso_metrics['roc_auc'].append(roc_auc_score(y_test, pred))
    iso_metrics['pr_auc'].append(average_precision_score(y_test, pred))

# -----------------------------
# RELATÓRIO FINAL - ISOLATION FOREST
# -----------------------------
print("\nResultados Isolation Forest:\n")
for metric in scoring.keys():
    mean = np.mean(iso_metrics[metric])
    std = np.std(iso_metrics[metric])
    print(f"{metric.upper():<12} Média = {mean:.4f} | Desvio padrão = {std:.4f}")

# -----------------------------
# Matriz de Confusão - Isolation Forest
# -----------------------------
iso_cm = confusion_matrix(y, iso_preds)
print("\nMatriz de Confusão - Isolation Forest:")
print(iso_cm)


LIMPEZA E PRÉ-PROCESSAMENTO DO DATASET

Dataset original: (584227, 20)
Duplicatas: 0

Distribuição de classes:
   Label 0 (Normal): 535925 (91.7%)
   Label 1 (Ataque): 48302 (8.3%)

CROSS-VALIDATION - RANDOM FOREST

Resultados Random Forest:

ACCURACY     Média = 0.9998 | Desvio padrão = 0.0000
PRECISION    Média = 0.9986 | Desvio padrão = 0.0003
RECALL       Média = 0.9983 | Desvio padrão = 0.0002
F1           Média = 0.9985 | Desvio padrão = 0.0002
ROC_AUC      Média = 0.9991 | Desvio padrão = 0.0001
PR_AUC       Média = 0.9971 | Desvio padrão = 0.0004

Matriz de Confusão - Random Forest:
[[535859     66]
 [    80  48222]]

CROSS-VALIDATION - ISOLATION FOREST (APERFEIÇOADO)

Resultados Isolation Forest:

ACCURACY     Média = 0.9271 | Desvio padrão = 0.0005
PRECISION    Média = 0.6312 | Desvio padrão = 0.0080
RECALL       Média = 0.2836 | Desvio padrão = 0.0024
F1           Média = 0.3914 | Desvio padrão = 0.0035
ROC_AUC      Média = 0.6343 | Desvio padrão = 0.0013
PR_AUC       Média 