In [6]:
# AdÄ±m 1: Kurulum, veri yÃ¼kleme, temizlik (Kaggle iÃ§in)
import os, glob, warnings, numpy as np, pandas as pd
warnings.filterwarnings("ignore")
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_DIR = "/kaggle/input/cicddos2019"
TARGET   = "__label__"

def infer_label(fname):  # Syn-training.parquet -> Syn
    return os.path.basename(fname).split("-")[0]
def infer_split(fname):  # train/test bilgisi
    b = os.path.basename(fname).lower()
    return "train" if "train" in b else ("test" if "test" in b else "unknown")

# 1.1 Parquetleri birleÅŸtir
frames=[]
for f in sorted(glob.glob(os.path.join(DATA_DIR,"*.parquet"))):
    df = pd.read_parquet(f)
    df["__split__"] = infer_split(f)
    df[TARGET]      = infer_label(f)
    frames.append(df)
raw = pd.concat(frames, ignore_index=True)

# 1.2 Temizlik: inf->NaN, kategorik/sabit kolonlarÄ± at
raw = raw.replace([np.inf,-np.inf], np.nan)
META = [TARGET,"__split__"]
features = [c for c in raw.columns if c not in META]
num_cols = [c for c in features if pd.api.types.is_numeric_dtype(raw[c])]
df = raw.drop(columns=[c for c in features if c not in num_cols]).copy()
constant = df[num_cols].nunique(dropna=False)
const_cols = constant[constant<=1].index.tolist()
if const_cols:
    df.drop(columns=const_cols, inplace=True)

# 1.3 Train/Test ayÄ±r
train_df = df[df["__split__"]=="train"].copy()
test_df  = df[df["__split__"]=="test"].copy()
X_train, y_train = train_df.drop(columns=META), train_df[TARGET].astype(str)
X_test,  y_test  = test_df.drop(columns=META),  test_df[TARGET].astype(str)

print("YÃ¼klendi âœ“",
      "\nTrain:", X_train.shape, "| Test:", X_test.shape,
      "\nTrain sÄ±nÄ±flarÄ±:", sorted(y_train.unique()),
      "\nTest  sÄ±nÄ±flarÄ±:", sorted(y_test.unique()))

YÃ¼klendi âœ“ 
Train: (125170, 65) | Test: (306201, 65) 
Train sÄ±nÄ±flarÄ±: ['LDAP', 'MSSQL', 'NetBIOS', 'Portmap', 'Syn', 'UDP', 'UDPLag'] 
Test  sÄ±nÄ±flarÄ±: ['DNS', 'LDAP', 'MSSQL', 'NTP', 'NetBIOS', 'SNMP', 'Syn', 'TFTP', 'UDP', 'UDPLag']


In [7]:
# AdÄ±m 2: Ortak sÄ±nÄ±flar (closed-set)
common = sorted(set(y_train.unique()).intersection(set(y_test.unique())))
train_mask = y_train.isin(common)
test_mask  = y_test.isin(common)

X_tr, y_tr = X_train[train_mask].copy(), y_train[train_mask].copy()
X_te, y_te = X_test[test_mask].copy(),  y_test[test_mask].copy()

print("Ortak sÄ±nÄ±flar:", common)
print("Yeni Train/Test:", X_tr.shape, X_te.shape)

Ortak sÄ±nÄ±flar: ['LDAP', 'MSSQL', 'NetBIOS', 'Syn', 'UDP', 'UDPLag']
Yeni Train/Test: (120065, 65) (38973, 65)


In [8]:
# AdÄ±m 3: Manuel undersampling (imblearn yok)
from collections import Counter
tmp = X_tr.copy(); tmp[TARGET]=y_tr.values
min_count = tmp[TARGET].value_counts().min()
balanced = (tmp.groupby(TARGET, group_keys=False)
              .apply(lambda x: x.sample(min_count, random_state=RANDOM_STATE)))
y_tr_bal = balanced[TARGET].astype(str)
X_tr_bal = balanced.drop(columns=TARGET)

print("Dengeleme Ã¶nce:", Counter(y_tr))
print("Dengeleme sonra:", Counter(y_tr_bal))
print("Yeni eÄŸitim boyutu:", X_tr_bal.shape)

Dengeleme Ã¶nce: Counter({'Syn': 70336, 'UDP': 17770, 'UDPLag': 12639, 'MSSQL': 10974, 'LDAP': 6715, 'NetBIOS': 1631})
Dengeleme sonra: Counter({'LDAP': 1631, 'MSSQL': 1631, 'NetBIOS': 1631, 'Syn': 1631, 'UDP': 1631, 'UDPLag': 1631})
Yeni eÄŸitim boyutu: (9786, 65)


In [9]:
# --- AdÄ±m 4 (final patch): mutual_info_score ile manuel Ã¶zellik seÃ§imi + LGBM ---
import numpy as np, pandas as pd
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from lightgbm import LGBMClassifier

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

def mi_scores(X, y):
    """Her sÃ¼tun iÃ§in mutual information puanÄ± hesapla (saf sklearn.metrics)"""
    scores = {}
    y_enc = pd.factorize(y)[0]
    for col in X.columns:
        # SÃ¼rekli deÄŸiÅŸkenleri kategoriye bÃ¶lmek iÃ§in kÃ¼Ã§Ã¼k quantile binning
        x_bin = pd.qcut(X[col].rank(method="first"), q=10, duplicates="drop")
        x_enc = pd.factorize(x_bin)[0]
        mi = mutual_info_score(x_enc, y_enc)
        scores[col] = mi
    return pd.Series(scores).sort_values(ascending=False)

# MI puanlarÄ±nÄ± hesapla
mi = mi_scores(X_tr_bal, y_tr_bal)
print("En yÃ¼ksek bilgiye sahip 10 Ã¶zellik:")
print(mi.head(10))

# k deÄŸerleri listesi
k_list = [15, 25, 35, 50]
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

best_k, best_cv, selected_cols = None, -1.0, None

for k in k_list:
    top_cols = mi.head(k).index
    X_sel = X_tr_bal[top_cols]
    clf = LGBMClassifier(
        n_estimators=200, learning_rate=0.1,
        num_leaves=31, subsample=0.8, colsample_bytree=0.8,
        random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
    )
    acc = cross_val_score(clf, X_sel, y_tr_bal, cv=cv, scoring="accuracy").mean()
    print(f"k={k:2d} | CV Acc={acc:.4f}")
    if acc > best_cv:
        best_cv, best_k, selected_cols = acc, k, top_cols

print(f"\nâœ… En iyi k={best_k} (CV Acc={best_cv:.4f})")
print("SeÃ§ilen ilk 10 Ã¶zellik:", list(selected_cols[:10]))

# seÃ§ilen kolonlarla train/test setleri
X_tr_sel = X_tr_bal[selected_cols].copy()
X_te_sel = X_te[selected_cols].copy()

En yÃ¼ksek bilgiye sahip 10 Ã¶zellik:
SYN Flag Count           1.524883
CWE Flag Count           1.405789
RST Flag Count           1.385206
Fwd PSH Flags            1.385206
Bwd Packet Length Std    1.377581
ACK Flag Count           1.339930
Active Min               1.300542
Protocol                 1.294937
Active Std               1.292313
Idle Std                 1.291538
dtype: float64
k=15 | CV Acc=0.3732
k=25 | CV Acc=0.5504
k=35 | CV Acc=0.6403
k=50 | CV Acc=0.6605

âœ… En iyi k=50 (CV Acc=0.6605)
SeÃ§ilen ilk 10 Ã¶zellik: ['SYN Flag Count', 'CWE Flag Count', 'RST Flag Count', 'Fwd PSH Flags', 'Bwd Packet Length Std', 'ACK Flag Count', 'Active Min', 'Protocol', 'Active Std', 'Idle Std']


In [10]:
# --- AdÄ±m 7: MBO ile LightGBM hiperparametre optimizasyonu ---
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# Arama alanÄ± (parametre sÄ±nÄ±rlarÄ±)
param_bounds = {
    "learning_rate": (0.01, 0.2),
    "num_leaves": (20, 150),
    "max_depth": (3, 20),
    "subsample": (0.5, 1.0),
    "colsample_bytree": (0.5, 1.0),
    "min_child_samples": (5, 40)
}

# --- YardÄ±mcÄ± fonksiyonlar ---
def sample_params():
    """Bir birey iÃ§in rastgele parametre seti oluÅŸtur."""
    return {
        "learning_rate": np.random.uniform(*param_bounds["learning_rate"]),
        "num_leaves": int(np.random.uniform(*param_bounds["num_leaves"])),
        "max_depth": int(np.random.uniform(*param_bounds["max_depth"])),
        "subsample": np.random.uniform(*param_bounds["subsample"]),
        "colsample_bytree": np.random.uniform(*param_bounds["colsample_bytree"]),
        "min_child_samples": int(np.random.uniform(*param_bounds["min_child_samples"]))
    }

def evaluate_params(params):
    """3-katlÄ± CV doÄŸruluk deÄŸeri (fitness)."""
    model = LGBMClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=1, **params)
    return cross_val_score(model, X_tr_sel, y_tr_bal, cv=cv, scoring="accuracy").mean()

# --- MBO parametreleri ---
n_males = 4
n_females = 4
pop_size = n_males + n_females
n_gen = 10
attraction_coeff = 0.3
flight_coeff = 0.1
mutation_prob = 0.1




In [11]:
# --- PopÃ¼lasyon baÅŸlat ---
positions = [sample_params() for _ in range(pop_size)]
fitness = np.array([evaluate_params(p) for p in positions])
best_idx = np.argmax(fitness)
best_params = positions[best_idx].copy()
best_fit = fitness[best_idx]

print(f"BaÅŸlangÄ±Ã§ doÄŸruluk: {best_fit:.4f}")

# --- Ana MBO dÃ¶ngÃ¼sÃ¼ ---
for gen in range(n_gen):
    new_positions = []
    for i in range(pop_size):
        new_p = positions[i].copy()
        # Rastgele kÃ¼Ã§Ã¼k mutasyon (uÃ§uÅŸ hareketi)
        for k, v in param_bounds.items():
            if np.random.rand() < mutation_prob:
                step = np.random.uniform(-0.1, 0.1) * (v[1] - v[0])
                new_val = np.clip(new_p[k] + step, *v)
                new_p[k] = int(new_val) if k in ["num_leaves", "max_depth", "min_child_samples"] else new_val
        new_positions.append(new_p)

    # DeÄŸerlendir
    new_fitness = np.array([evaluate_params(p) for p in new_positions])

    # Attraction: en iyiye yaklaÅŸ
    for i in range(pop_size):
        for k in param_bounds.keys():
            delta = best_params[k] - new_positions[i][k]
            new_positions[i][k] += attraction_coeff * delta + np.random.uniform(-flight_coeff, flight_coeff)
            low, high = param_bounds[k]
            new_positions[i][k] = max(low, min(high, new_positions[i][k]))

    # Yeni fitness hesapla
    fitness = np.array([evaluate_params(p) for p in new_positions])
    gen_best_idx = np.argmax(fitness)
    if fitness[gen_best_idx] > best_fit:
        best_fit = fitness[gen_best_idx]
        best_params = new_positions[gen_best_idx].copy()

    print(f"Nesil {gen+1}/{n_gen} | En iyi CV doÄŸruluk: {best_fit:.4f}")

print("\nâœ… MBO tamamlandÄ±.")
print("En iyi parametre seti:")
for k, v in best_params.items():
    print(f" - {k}: {v:.4f}" if isinstance(v, float) else f" - {k}: {v}")

# --- Nihai test ---
final_model = LGBMClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1, **best_params)
final_model.fit(X_tr_sel, y_tr_bal)
y_pred = final_model.predict(X_te_sel)

acc = accuracy_score(y_te, y_pred)
f1m = f1_score(y_te, y_pred, average="macro")

print(f"\nðŸ“Š TEST Sonucu (MBO-optimize LightGBM):")
print(f"Accuracy = {acc:.4f}")
print(f"Macro-F1 = {f1m:.4f}")

from sklearn.metrics import classification_report
print("\nSÄ±nÄ±f bazlÄ± rapor:")
print(classification_report(y_te, y_pred, digits=4))

BaÅŸlangÄ±Ã§ doÄŸruluk: 0.6733


[LightGBM] [Fatal] Parameter num_leaves should be of type int, got "141.44985844582976"
[LightGBM] [Fatal] Parameter num_leaves should be of type int, got "141.44985844582976"
[LightGBM] [Fatal] Parameter num_leaves should be of type int, got "141.44985844582976"


ValueError: 
All the 3 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/sklearn.py", line 1560, in fit
    super().fit(
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/sklearn.py", line 1049, in fit
    self._Booster = train(
                    ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/engine.py", line 297, in train
    booster = Booster(params=params, train_set=train_set)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 3656, in __init__
    train_set.construct()
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 2590, in construct
    self._lazy_init(
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 2187, in _lazy_init
    self.__init_from_np2d(data, params_str, ref_dataset)
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 2318, in __init_from_np2d
    _safe_call(
  File "/usr/local/lib/python3.11/dist-packages/lightgbm/basic.py", line 313, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8"))
lightgbm.basic.LightGBMError: Parameter num_leaves should be of type int, got "141.44985844582976"
