In [1]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy.stats import boxcox
from sklearn.preprocessing import RobustScaler, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    classification_report, confusion_matrix,
    precision_recall_curve, roc_curve, auc, f1_score
)
from imblearn.over_sampling import SMOTE

# =========================
# CONFIGURATION
# =========================
TRAIN_PATH = r'X:\Dissertacao\python_projects\dataset\ISCX-Bot-2014\ISCX_csv\Testing_file.csv'
TEST_PATH  = r'X:\Dissertacao\python_projects\dataset\ISCX-Bot-2014\ISCX_csv\Training_file.csv'
SAMPLE_SIZE = 300_000
RANDOM_STATE = 42
OUTPUT_SUMMARY = "semi_supervised_ensemble_summary.json"

# =========================
# HELPER FUNCTIONS
# =========================
def safe_boxcox(series):
    arr = np.asarray(series.fillna(0.0).astype(float) + 1e-6)
    try:
        if np.any(arr <= 0):
            arr = arr - arr.min() + 1e-6
        out, _ = boxcox(arr + 1e-6)
        return out
    except Exception:
        return np.log1p(arr)

def safe_sample(df, n, seed=RANDOM_STATE):
    if n >= len(df):
        return df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    return df.sample(n=n, random_state=seed).reset_index(drop=True)

def ensure_numeric(df, cols):
    for c in cols:
        if c not in df.columns:
            df[c] = 0.0
        df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0.0)
    return df

def combined_map(train, test, col):
    train[col] = train.get(col, "").astype(str)
    test[col] = test.get(col, "").astype(str)
    uniques = pd.unique(pd.concat([train[col], test[col]], axis=0))
    mapping = {v: i for i, v in enumerate(uniques)}
    train[col + "_enc"] = train[col].map(mapping).fillna(-1).astype(int)
    test[col + "_enc"] = test[col].map(mapping).fillna(-1).astype(int)
    return mapping

# =========================
# LOAD DATA
# =========================
df_train = pd.read_csv(TRAIN_PATH, low_memory=False, encoding='ISO-8859-1')
df_test  = pd.read_csv(TEST_PATH,  low_memory=False, encoding='ISO-8859-1')

for df in (df_train, df_test):
    if 'Info' in df.columns:
        df['Info'] = df['Info'].fillna("Unknown")
    for col in ['Time', 'Source', 'Destination', 'Length']:
        if col in df.columns:
            df.dropna(subset=[col], inplace=True)

combined_map(df_train, df_test, 'Protocol')
combined_map(df_train, df_test, 'Source')
combined_map(df_train, df_test, 'Destination')

# =========================
# FEATURE ENGINEERING
# =========================
def build_features(df):
    df = df.copy()
    src, dst = 'Source_enc', 'Destination_enc'
    df['Time'] = pd.to_numeric(df.get('Time', 0), errors='coerce').fillna(0)
    df['Length'] = pd.to_numeric(df.get('Length', 0), errors='coerce').fillna(0)
    
    df['Time_Diff'] = df.groupby(src)['Time'].diff().fillna(0)
    df['Inter_Arrival_Time'] = df.groupby(src)['Time_Diff'].transform(lambda s: s.rolling(10, min_periods=1).mean()).fillna(0)
    df['Burst_Rate'] = np.where(df['Inter_Arrival_Time'] > 1e-6, 1.0 / df['Inter_Arrival_Time'], 0)
    df['Pkt_Per_Src'] = df.groupby(src)['Length'].transform('count').fillna(0)
    df['Session_Dur_Src'] = df.groupby(src)['Time'].transform(lambda x: x.max() - x.min()).fillna(0)
    df['Bytes_Per_Session_Src'] = df.groupby(src)['Length'].transform('sum').fillna(0)
    df['Packets_Per_Session_Src'] = df['Pkt_Per_Src']
    df['Length_Mean'] = df.groupby(src)['Length'].transform('mean').fillna(0)
    df['Length_Std'] = df.groupby(src)['Length'].transform('std').fillna(0)
    
    df['BoxCox_Length'] = safe_boxcox(df['Length'])
    df['Log_BRate'] = np.log1p(df['Burst_Rate'])
    df['Log_IATime'] = np.log1p(df['Inter_Arrival_Time'])
    df['Rate_to_Length'] = df['Burst_Rate'] / (df['BoxCox_Length'] + 1e-6)
    df['IAT_to_Session'] = df['Inter_Arrival_Time'] / (df['Session_Dur_Src'] + 1e-6)
    df['Burst_Variability'] = df['Burst_Rate'].rolling(5, min_periods=1).std().fillna(0)
    df['Pkt_Rate_Change'] = df['Pkt_Per_Src'].diff().fillna(0)
    
    features = [
        'Time_Diff', 'Log_BRate', 'Log_IATime', 'BoxCox_Length', 'Length_Mean', 'Length_Std',
        'Pkt_Per_Src', 'Session_Dur_Src', 'Rate_to_Length', 'IAT_to_Session', 
        'Burst_Variability', 'Pkt_Rate_Change', 'Bytes_Per_Session_Src', 'Packets_Per_Session_Src'
    ]
    df = ensure_numeric(df, features)
    return df, features

df_train, features_train = build_features(df_train)
df_test, features_test   = build_features(df_test)
NUM_FEATURES = list(dict.fromkeys(features_train + features_test))

df_train_sample = safe_sample(df_train, SAMPLE_SIZE)
df_test_sample  = safe_sample(df_test, SAMPLE_SIZE)

scaler = RobustScaler()
scaler.fit(df_train_sample[NUM_FEATURES])
X_train = pd.DataFrame(scaler.transform(df_train_sample[NUM_FEATURES]), columns=NUM_FEATURES)
X_test = pd.DataFrame(scaler.transform(df_test_sample[NUM_FEATURES]), columns=NUM_FEATURES)

# =========================
# PSEUDO-LABELS (using heuristic thresholds)
# =========================
thresholds = {
    "Neris": lambda df: (df["Burst_Rate"] > df["Burst_Rate"].quantile(0.9)) & (df["Pkt_Per_Src"] > df["Pkt_Per_Src"].quantile(0.8)),
    "Virut": lambda df: (df["BoxCox_Length"] > df["BoxCox_Length"].quantile(0.8)),
    "Zeus": lambda df: (df["Inter_Arrival_Time"].between(df["Inter_Arrival_Time"].quantile(0.05),
                                                         df["Inter_Arrival_Time"].quantile(0.6))),
}

def label_profiles(df):
    df["Pred_Botnet_Profile"] = "Normal"
    for label, rule in thresholds.items():
        mask = rule(df)
        df.loc[mask, "Pred_Botnet_Profile"] = label
    return df

df_train_sample = label_profiles(df_train_sample)
df_test_sample = label_profiles(df_test_sample)

# =========================
# SUPERVISED MODEL TRAINING
# =========================
X_train_sup, X_test_sup = X_train, X_test
y_train_sup, y_test_sup = df_train_sample["Pred_Botnet_Profile"], df_test_sample["Pred_Botnet_Profile"]

try:
    sm = SMOTE(random_state=RANDOM_STATE)
    X_bal, y_bal = sm.fit_resample(X_train_sup, y_train_sup)
except Exception:
    X_bal, y_bal = X_train_sup, y_train_sup

X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_bal, y_bal, test_size=0.2, random_state=RANDOM_STATE, stratify=y_bal)

rf = RandomForestClassifier(n_estimators=300, class_weight="balanced_subsample", random_state=RANDOM_STATE, n_jobs=-1)
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, subsample=0.8, random_state=RANDOM_STATE)
lsvc_base = LinearSVC(C=0.5, class_weight='balanced', max_iter=5000, random_state=RANDOM_STATE)
lsvc = CalibratedClassifierCV(lsvc_base, cv=3)
sgd_base = SGDClassifier(loss='log_loss', penalty='elasticnet', alpha=1e-4, max_iter=1500, tol=1e-3, learning_rate='adaptive', random_state=RANDOM_STATE)
sgd = CalibratedClassifierCV(sgd_base, cv=3)

models = {"rf": rf, "gb": gb, "lsvc": lsvc, "sgd": sgd}

for name, model in models.items():
    model.fit(X_train_sub, y_train_sub)

weights = {}
for name, model in models.items():
    preds = model.predict(X_val)
    weights[name] = max(f1_score(y_val, preds, average="weighted"), 1e-6)
total = sum(weights.values())
weights = {k: v/total for k, v in weights.items()}

ensemble = VotingClassifier(
    estimators=[(k, v) for k, v in models.items()],
    voting="soft",
    weights=list(weights.values()),
    n_jobs=-1
)
ensemble.fit(X_bal, y_bal)

# =========================
# EVALUATION
# =========================
y_pred_ens = ensemble.predict(X_test_sup)
print("=== Ensemble Report ===")
print(classification_report(y_test_sup, y_pred_ens, zero_division=0))

cm = confusion_matrix(y_test_sup, y_pred_ens, labels=ensemble.classes_)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=ensemble.classes_, yticklabels=ensemble.classes_, cmap="Blues")
plt.title("Confusion Matrix - Ensemble")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()

# =========================
# PRECISION-RECALL / ROC
# =========================
labels = sorted(list(pd.unique(y_test_sup)))
y_test_bin = label_binarize(y_test_sup, classes=labels)
y_score = ensemble.predict_proba(X_test_sup)

plt.figure(figsize=(8,6))
for i, lab in enumerate(labels):
    prec, rec, _ = precision_recall_curve(y_test_bin[:, i], y_score[:, i])
    plt.plot(rec, prec, lw=2, label=f"{lab} (AUPR={auc(rec, prec):.3f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curves")
plt.legend()
plt.show()

plt.figure(figsize=(8,6))
for i, lab in enumerate(labels):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    plt.plot(fpr, tpr, lw=2, label=f"{lab} (AUC={auc(fpr, tpr):.3f})")
plt.plot([0,1],[0,1],'k--')
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC Curves")
plt.legend()
plt.show()

summary = {
    "train_class_distribution": dict(Counter(y_bal)),
    "test_class_distribution": dict(Counter(y_test_sup)),
    "ensemble_weights": weights,
    "ensemble_report": classification_report(y_test_sup, y_pred_ens, output_dict=True, zero_division=0),
    "features_used": NUM_FEATURES,
}
with open(OUTPUT_SUMMARY, "w") as f:
    json.dump(summary, f, indent=2)
print(f"Summary saved to {OUTPUT_SUMMARY}")


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


KeyboardInterrupt: 