In [3]:
import pandas as pd
from collections import Counter

# Load the train and test splits separately
train = pd.read_csv("/kaggle/input/cicds-unsw-alligned-dataset/CICIDS_aligned_train.csv")
test  = pd.read_csv("/kaggle/input/cicds-unsw-alligned-dataset/CICIDS_aligned_test.csv")

print("Train:", train.shape, " Test:", test.shape)

# Drop alternative label columns so that "attack_cat" is the only target column left.
# errors="ignore" already skips names that are not present, so no per-column
# membership test is needed. The operations below stay in-place so the (large)
# frames are not rebound to full copies.
for df in [train, test]:
    df.drop(
        columns=["Label", "raw_label", "Attack", "attack_label", "class", "Category"],
        inplace=True,
        errors="ignore",
    )

# Clean attack_cat safely: anything that cannot be parsed as a number becomes NaN,
# those rows are removed, and the remaining labels are stored as integers.
for df in [train, test]:
    df["attack_cat"] = pd.to_numeric(df["attack_cat"], errors="coerce")
    df.dropna(subset=["attack_cat"], inplace=True)
    df["attack_cat"] = df["attack_cat"].astype(int)

print("✅ Labels cleaned")

# ---------------------------
# Controlled Class Sampling
# ---------------------------

from collections import Counter

def balance(df, large_cap=80000, medium_cap=40000, random_state=42):
    """Down-sample over-represented ``attack_cat`` classes and shuffle the rows.

    Per class, with ``cnt`` rows:
      * ``cnt > large_cap``   -> a random sample of ``large_cap`` rows is kept
      * ``cnt > medium_cap``  -> a random sample of ``medium_cap`` rows is kept
      * otherwise             -> every row is kept (small classes are never cut)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``attack_cat`` column.
    large_cap : int, default 80000
        Size threshold and kept sample size for very large classes.
    medium_cap : int, default 40000
        Size threshold and kept sample size for medium classes.
    random_state : int, default 42
        Seed used for every per-class sample and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        A new, shuffled frame with a fresh 0..n-1 index. ``df`` is not modified.

    Prints the class counts before and after sampling.
    """
    counts = Counter(df["attack_cat"])
    print("Before:", counts)

    balanced = []

    # One grouping pass instead of one full boolean scan per class.
    # sort=False keeps classes in first-appearance order, which is the order
    # Counter iterates in, so the concatenation order is unchanged.
    for _, group in df.groupby("attack_cat", sort=False):
        cnt = len(group)

        # Very large classes → keep large_cap rows
        if cnt > large_cap:
            group = group.sample(large_cap, random_state=random_state)

        # Medium classes → keep medium_cap rows
        elif cnt > medium_cap:
            group = group.sample(medium_cap, random_state=random_state)

        # Small classes → keep all (important!)
        balanced.append(group)

    if not balanced:
        # pd.concat([]) raises ValueError; an empty input is returned as an empty copy.
        print("After:", Counter())
        return df.reset_index(drop=True)

    df_bal = (
        pd.concat(balanced)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    print("After:", Counter(df_bal["attack_cat"]))
    return df_bal
train_bal = balance(train)
# NOTE(review): the test split is capped with the same rule as the training split,
# so metrics computed on it do not reflect the original class proportions of the
# test data — confirm this is intended.
test_bal  = balance(test)

# Save balanced sets (written to the current working directory)
train_bal.to_csv("CICIDS2018_train_balanced_alligned.csv", index=False)
test_bal.to_csv("CICIDS2018_test_balanced_alligned.csv", index=False)

print("✅ Saved:")
print("Train:", train_bal.shape, " Test:", test_bal.shape)

Train: (5327625, 133)  Test: (1331907, 133)
✅ Labels cleaned
Before: Counter({5: 4263206, 2: 620764, 3: 157254, 0: 115628, 4: 94786, 8: 75238, 1: 638, 6: 68, 7: 43})
After: Counter({0: 80000, 3: 80000, 4: 80000, 5: 80000, 2: 80000, 8: 40000, 1: 638, 6: 68, 7: 43})
Before: Counter({5: 1065802, 2: 155191, 3: 39314, 0: 28907, 4: 23697, 8: 18810, 1: 159, 6: 17, 7: 10})
After: Counter({2: 80000, 5: 80000, 3: 39314, 0: 28907, 4: 23697, 8: 18810, 1: 159, 6: 17, 7: 10})
✅ Saved:
Train: (440749, 132)  Test: (270914, 132)


In [4]:
import pandas as pd
import joblib
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# ===== LOAD =====
train = pd.read_csv("/kaggle/working/CICIDS2018_train_balanced_alligned.csv")
test  = pd.read_csv("/kaggle/working/CICIDS2018_test_balanced_alligned.csv")

X_train = train.drop(columns=["attack_cat"])
y_train = train["attack_cat"]

X_test  = test.drop(columns=["attack_cat"])
y_test  = test["attack_cat"]

print("Train:", train.shape, "| Test:", test.shape)
print("Class distribution:", Counter(y_train))

# ===== SCALE =====
# The scaler is fitted on the training features only and then applied to the
# test features; it is persisted so the same transform can be reused later.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)
joblib.dump(scaler, "scaler.pkl")

# ===== CLASS WEIGHTS =====
# "balanced" weights, one per class, in sorted-label order.
classes = sorted(y_train.unique())
cw = compute_class_weight("balanced", classes=classes, y=y_train)
class_weights = {cls: float(w) for cls, w in zip(classes, cw)}

# ===== MODELS =====
model_lgb = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight=class_weights,
    random_state=42
)

model_xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=len(classes),
    tree_method="hist",
    n_estimators=300,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="mlogloss",
    random_state=42
)

model_cat = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    class_weights=list(cw),   # positional: same sorted-label order as `classes`
    verbose=False,
    loss_function="MultiClass"
)

model_rf = RandomForestClassifier(
    n_estimators=260,
    max_depth=14,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

print("\nTraining models...")
model_lgb.fit(X_train, y_train)
# XGBClassifier is configured without a class-weight argument above, so the same
# balanced weights are supplied per row; otherwise it would be the only ensemble
# member trained without class weighting.
xgb_sample_weight = y_train.map(class_weights).to_numpy()
model_xgb.fit(X_train, y_train, sample_weight=xgb_sample_weight)
model_cat.fit(X_train, y_train)
model_rf.fit(X_train, y_train)

joblib.dump(model_lgb, "lgb.pkl")
joblib.dump(model_xgb, "xgb.pkl")
joblib.dump(model_cat, "cat.pkl")
joblib.dump(model_rf, "rf.pkl")

print("\nEvaluating ensemble...")
p1 = model_lgb.predict_proba(X_test)
p2 = model_xgb.predict_proba(X_test)
p3 = model_cat.predict_proba(X_test)
p4 = model_rf.predict_proba(X_test)

# Soft voting: unweighted mean of the four probability matrices.
p_avg = (p1 + p2 + p3 + p4) / 4.0
# argmax yields a column index, not a label. Translate it through the fitted
# class order so the prediction stays correct even if labels are not 0..K-1.
y_pred = model_rf.classes_[p_avg.argmax(axis=1)]

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("🎯 Macro F1:", f1_score(y_test, y_pred, average="macro"))
print("\n", classification_report(y_test, y_pred, zero_division=0))

Train: (440749, 132) | Test: (270914, 132)
Class distribution: Counter({0: 80000, 3: 80000, 4: 80000, 5: 80000, 2: 80000, 8: 40000, 1: 638, 6: 68, 7: 43})

Training models...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.057808 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13708
[LightGBM] [Info] Number of data points in the train set: 440749, number of used features: 69
[LightGBM] [Info] Start training from score -2.197225
[LightGBM] [Info] Start training from score -2.197225
[LightGBM] [Info] Start training from score -2.197225
[LightGBM] [Info] Start training from score -2.197225
[LightGBM] [Info] Start training from score -2.197225
[LightGBM] [Info] Start training from score -2.197225
[LightGBM] [Info] Start training from score -2.197225
[LightGBM] [Info] Start training from score -2.197225
[LightGBM] [Info] Start training from 

In [6]:
import pandas as pd
import joblib

# Persist the ordered feature names (every column except the target) so other
# code can re-create the exact column layout used for training.
# NOTE(review): only the header is needed for this; if `cic` is not used by any
# later cell, pd.read_csv(..., nrows=0) would avoid loading the whole file.
cic = pd.read_csv("/kaggle/working/CICIDS2018_train_balanced_alligned.csv")

feature_list = [c for c in cic.columns if c != "attack_cat"]
joblib.dump(feature_list, "aligned_feature_list.pkl")

print("✅ Saved aligned feature list:", len(feature_list))

✅ Saved aligned feature list: 131
