In [6]:
# AdÄ±m 1: Kurulum, veri yÃ¼kleme, temizlik (Kaggle iÃ§in)
import os, glob, warnings, numpy as np, pandas as pd
warnings.filterwarnings("ignore")
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_DIR = "/kaggle/input/cicddos2019"
TARGET   = "__label__"

def infer_label(fname):  # Syn-training.parquet -> Syn
    return os.path.basename(fname).split("-")[0]
def infer_split(fname):  # train/test bilgisi
    b = os.path.basename(fname).lower()
    return "train" if "train" in b else ("test" if "test" in b else "unknown")

# 1.1 Parquetleri birleÅŸtir
frames=[]
for f in sorted(glob.glob(os.path.join(DATA_DIR,"*.parquet"))):
    df = pd.read_parquet(f)
    df["__split__"] = infer_split(f)
    df[TARGET]      = infer_label(f)
    frames.append(df)
raw = pd.concat(frames, ignore_index=True)

# 1.2 Temizlik: inf->NaN, kategorik/sabit kolonlarÄ± at
raw = raw.replace([np.inf,-np.inf], np.nan)
META = [TARGET,"__split__"]
features = [c for c in raw.columns if c not in META]
num_cols = [c for c in features if pd.api.types.is_numeric_dtype(raw[c])]
df = raw.drop(columns=[c for c in features if c not in num_cols]).copy()
constant = df[num_cols].nunique(dropna=False)
const_cols = constant[constant<=1].index.tolist()
if const_cols:
    df.drop(columns=const_cols, inplace=True)

# 1.3 Train/Test ayÄ±r
train_df = df[df["__split__"]=="train"].copy()
test_df  = df[df["__split__"]=="test"].copy()
X_train, y_train = train_df.drop(columns=META), train_df[TARGET].astype(str)
X_test,  y_test  = test_df.drop(columns=META),  test_df[TARGET].astype(str)

print("YÃ¼klendi âœ“",
      "\nTrain:", X_train.shape, "| Test:", X_test.shape,
      "\nTrain sÄ±nÄ±flarÄ±:", sorted(y_train.unique()),
      "\nTest  sÄ±nÄ±flarÄ±:", sorted(y_test.unique()))

YÃ¼klendi âœ“ 
Train: (125170, 65) | Test: (306201, 65) 
Train sÄ±nÄ±flarÄ±: ['LDAP', 'MSSQL', 'NetBIOS', 'Portmap', 'Syn', 'UDP', 'UDPLag'] 
Test  sÄ±nÄ±flarÄ±: ['DNS', 'LDAP', 'MSSQL', 'NTP', 'NetBIOS', 'SNMP', 'Syn', 'TFTP', 'UDP', 'UDPLag']


In [7]:
# AdÄ±m 2: Ortak sÄ±nÄ±flar (closed-set)
common = sorted(set(y_train.unique()).intersection(set(y_test.unique())))
train_mask = y_train.isin(common)
test_mask  = y_test.isin(common)

X_tr, y_tr = X_train[train_mask].copy(), y_train[train_mask].copy()
X_te, y_te = X_test[test_mask].copy(),  y_test[test_mask].copy()

print("Ortak sÄ±nÄ±flar:", common)
print("Yeni Train/Test:", X_tr.shape, X_te.shape)

Ortak sÄ±nÄ±flar: ['LDAP', 'MSSQL', 'NetBIOS', 'Syn', 'UDP', 'UDPLag']
Yeni Train/Test: (120065, 65) (38973, 65)


In [8]:
# AdÄ±m 3: Manuel undersampling (imblearn yok)
from collections import Counter
tmp = X_tr.copy(); tmp[TARGET]=y_tr.values
min_count = tmp[TARGET].value_counts().min()
balanced = (tmp.groupby(TARGET, group_keys=False)
              .apply(lambda x: x.sample(min_count, random_state=RANDOM_STATE)))
y_tr_bal = balanced[TARGET].astype(str)
X_tr_bal = balanced.drop(columns=TARGET)

print("Dengeleme Ã¶nce:", Counter(y_tr))
print("Dengeleme sonra:", Counter(y_tr_bal))
print("Yeni eÄŸitim boyutu:", X_tr_bal.shape)

Dengeleme Ã¶nce: Counter({'Syn': 70336, 'UDP': 17770, 'UDPLag': 12639, 'MSSQL': 10974, 'LDAP': 6715, 'NetBIOS': 1631})
Dengeleme sonra: Counter({'LDAP': 1631, 'MSSQL': 1631, 'NetBIOS': 1631, 'Syn': 1631, 'UDP': 1631, 'UDPLag': 1631})
Yeni eÄŸitim boyutu: (9786, 65)


In [9]:
# --- AdÄ±m 4 (final patch): mutual_info_score ile manuel Ã¶zellik seÃ§imi + LGBM ---
import numpy as np, pandas as pd
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from lightgbm import LGBMClassifier

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

def mi_scores(X, y):
    """Her sÃ¼tun iÃ§in mutual information puanÄ± hesapla (saf sklearn.metrics)"""
    scores = {}
    y_enc = pd.factorize(y)[0]
    for col in X.columns:
        # SÃ¼rekli deÄŸiÅŸkenleri kategoriye bÃ¶lmek iÃ§in kÃ¼Ã§Ã¼k quantile binning
        x_bin = pd.qcut(X[col].rank(method="first"), q=10, duplicates="drop")
        x_enc = pd.factorize(x_bin)[0]
        mi = mutual_info_score(x_enc, y_enc)
        scores[col] = mi
    return pd.Series(scores).sort_values(ascending=False)

# MI puanlarÄ±nÄ± hesapla
mi = mi_scores(X_tr_bal, y_tr_bal)
print("En yÃ¼ksek bilgiye sahip 10 Ã¶zellik:")
print(mi.head(10))

# k deÄŸerleri listesi
k_list = [15, 25, 35, 50]
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

best_k, best_cv, selected_cols = None, -1.0, None

for k in k_list:
    top_cols = mi.head(k).index
    X_sel = X_tr_bal[top_cols]
    clf = LGBMClassifier(
        n_estimators=200, learning_rate=0.1,
        num_leaves=31, subsample=0.8, colsample_bytree=0.8,
        random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
    )
    acc = cross_val_score(clf, X_sel, y_tr_bal, cv=cv, scoring="accuracy").mean()
    print(f"k={k:2d} | CV Acc={acc:.4f}")
    if acc > best_cv:
        best_cv, best_k, selected_cols = acc, k, top_cols

print(f"\nâœ… En iyi k={best_k} (CV Acc={best_cv:.4f})")
print("SeÃ§ilen ilk 10 Ã¶zellik:", list(selected_cols[:10]))

# seÃ§ilen kolonlarla train/test setleri
X_tr_sel = X_tr_bal[selected_cols].copy()
X_te_sel = X_te[selected_cols].copy()

En yÃ¼ksek bilgiye sahip 10 Ã¶zellik:
SYN Flag Count           1.524883
CWE Flag Count           1.405789
RST Flag Count           1.385206
Fwd PSH Flags            1.385206
Bwd Packet Length Std    1.377581
ACK Flag Count           1.339930
Active Min               1.300542
Protocol                 1.294937
Active Std               1.292313
Idle Std                 1.291538
dtype: float64
k=15 | CV Acc=0.3732
k=25 | CV Acc=0.5504
k=35 | CV Acc=0.6403
k=50 | CV Acc=0.6605

âœ… En iyi k=50 (CV Acc=0.6605)
SeÃ§ilen ilk 10 Ã¶zellik: ['SYN Flag Count', 'CWE Flag Count', 'RST Flag Count', 'Fwd PSH Flags', 'Bwd Packet Length Std', 'ACK Flag Count', 'Active Min', 'Protocol', 'Active Std', 'Idle Std']


In [5]:
# === TAM VERÄ° HAZIRLIK (CIC-DDoS + dengeleme + Ã¶zellik seÃ§imi) ===
import os, glob, warnings, numpy as np, pandas as pd
from collections import Counter
from sklearn.metrics import mutual_info_score
warnings.filterwarnings("ignore")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
DATA_DIR = "/kaggle/input/cicddos2019"
TARGET = "__label__"

# --- 1. Veri yÃ¼kleme ---
def infer_label(fname):  # Syn-training.parquet -> Syn
    return os.path.basename(fname).split("-")[0]

def infer_split(fname):  # train/test bilgisi
    b = os.path.basename(fname).lower()
    return "train" if "train" in b else ("test" if "test" in b else "unknown")

frames = []
for f in sorted(glob.glob(os.path.join(DATA_DIR, "*.parquet"))):
    df = pd.read_parquet(f)
    df["__split__"] = infer_split(f)
    df[TARGET] = infer_label(f)
    frames.append(df)

raw = pd.concat(frames, ignore_index=True)
raw = raw.replace([np.inf, -np.inf], np.nan)

# --- 2. SÃ¼tun temizliÄŸi ---
META = [TARGET, "__split__"]
features = [c for c in raw.columns if c not in META]
num_cols = [c for c in features if pd.api.types.is_numeric_dtype(raw[c])]
df = raw.drop(columns=[c for c in features if c not in num_cols]).copy()

constant = df[num_cols].nunique(dropna=False)
const_cols = constant[constant <= 1].index.tolist()
if const_cols:
    df.drop(columns=const_cols, inplace=True)

# --- 3. Train/Test ayÄ±r ---
train_df = df[df["__split__"] == "train"].copy()
test_df = df[df["__split__"] == "test"].copy()
X_train, y_train = train_df.drop(columns=META), train_df[TARGET].astype(str)
X_test, y_test = test_df.drop(columns=META), test_df[TARGET].astype(str)

print("YÃ¼klendi âœ“")
print("Train:", X_train.shape, "| Test:", X_test.shape)

# --- 4. Closed-set filtreleme ---
common = sorted(set(y_train.unique()).intersection(set(y_test.unique())))
train_mask = y_train.isin(common)
test_mask = y_test.isin(common)
X_tr = X_train[train_mask].copy()
y_tr = y_train[train_mask].copy()
X_te = X_test[test_mask].copy()
y_te = y_test[test_mask].copy()
print("Ortak sÄ±nÄ±flar:", common)

# --- 5. Manuel dengeleme (undersampling) ---
tmp = X_tr.copy()
tmp["__y__"] = y_tr.values
min_count = tmp["__y__"].value_counts().min()
balanced = tmp.groupby("__y__", group_keys=False).apply(
    lambda x: x.sample(min_count, random_state=RANDOM_STATE)
)
y_tr_bal = balanced["__y__"].astype(str)
X_tr_bal = balanced.drop(columns="__y__")
print("Denge sonrasÄ± boyut:", X_tr_bal.shape)

# --- 6. Ã–zellik seÃ§imi (Mutual Information ile) ---
def mi_scores(X, y):
    scores = {}
    y_enc = pd.factorize(y)[0]
    for col in X.columns:
        x_bin = pd.qcut(X[col].rank(method="first"), q=10, duplicates="drop")
        x_enc = pd.factorize(x_bin)[0]
        scores[col] = mutual_info_score(x_enc, y_enc)
    return pd.Series(scores).sort_values(ascending=False)

mi = mi_scores(X_tr_bal, y_tr_bal)
selected_cols = mi.head(25).index
X_tr_sel = X_tr_bal[selected_cols].copy()
X_te_sel = X_te[selected_cols].copy()

print("\nâœ… Veri tamamen hazÄ±r!")
print("EÄŸitim:", X_tr_sel.shape, "| Test:", X_te_sel.shape)


YÃ¼klendi âœ“
Train: (125170, 65) | Test: (306201, 65)
Ortak sÄ±nÄ±flar: ['LDAP', 'MSSQL', 'NetBIOS', 'Syn', 'UDP', 'UDPLag']
Denge sonrasÄ± boyut: (9786, 65)

âœ… Veri tamamen hazÄ±r!
EÄŸitim: (9786, 25) | Test: (38973, 25)


In [11]:
# === GeliÅŸtirilmiÅŸ MBO: Acc-F1 fitness + k (Ã¶zellik sayÄ±sÄ±) optimizasyonu ===
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, mutual_info_score
from lightgbm import LGBMClassifier
from sklearn.utils import check_random_state

RANDOM_STATE = 42
rng = check_random_state(RANDOM_STATE)

# --- MI puanlarÄ±nÄ± hesapla (Ã¶zellik sayÄ±sÄ± k'yÄ± MBO seÃ§ecek) ---
def mi_scores(X, y, q=10):
    scores = {}
    y_enc = pd.factorize(y)[0]
    for col in X.columns:
        x_bin = pd.qcut(X[col].rank(method="first"), q=q, duplicates="drop")
        x_enc = pd.factorize(x_bin)[0]
        scores[col] = mutual_info_score(x_enc, y_enc)
    return pd.Series(scores).sort_values(ascending=False)

mi_series = mi_scores(X_tr_bal, y_tr_bal)

# --- Arama alanlarÄ± (LightGBM + k) ---
bounds = {
    "learning_rate": (0.02, 0.2),
    "num_leaves": (31, 255),
    "max_depth": (6, 32),
    "subsample": (0.6, 1.0),
    "colsample_bytree": (0.6, 1.0),
    "min_child_samples": (10, 100),
    "reg_alpha": (0.0, 1.0),
    "reg_lambda": (0.0, 2.0),
    "n_estimators": (200, 1200),
    "k_feats": (20, 60)
}
int_keys = ["num_leaves", "max_depth", "min_child_samples", "n_estimators", "k_feats"]

def clamp_cast(p):
    pp = {}
    for k,(lo,hi) in bounds.items():
        v = max(lo, min(hi, p[k]))
        if k in int_keys:
            v = int(round(v))
        pp[k] = v
    return pp

def sample_params():
    p = {k: rng.uniform(lo, hi) for k,(lo,hi) in bounds.items()}
    return clamp_cast(p)

# --- Fitness fonksiyonu: birleÅŸik skor (0.5*Acc + 0.5*F1) ---
def fitness_score(params):
    params = clamp_cast(params)
    k = params.pop("k_feats")
    cols = mi_series.head(k).index
    X = X_tr_bal[cols].values
    y = y_tr_bal.values

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    accs, f1s = [], []

    for tr_idx, va_idx in cv.split(X, y):
        Xtr, Xva = X[tr_idx], X[va_idx]
        ytr, yva = y[tr_idx], y[va_idx]

        clf = LGBMClassifier(
            random_state=RANDOM_STATE,
            class_weight="balanced",
            verbose=-1,
            n_jobs=1,
            **params
        )

        clf.fit(Xtr, ytr)
        yhat = clf.predict(Xva)

        accs.append(accuracy_score(yva, yhat))
        f1s.append(f1_score(yva, yhat, average="macro"))

    acc = float(np.mean(accs))
    f1m = float(np.mean(f1s))
    return 0.5*acc + 0.5*f1m, acc, f1m, cols

# --- MBO parametreleri ---
pop_size = 12
n_gen = 8
attract = 0.35
flight = 0.08
mut_p = 0.15

# --- BaÅŸlat ---
population = [sample_params() for _ in range(pop_size)]
scores, details = [], []

for p in population:
    s, a, f, cols = fitness_score(p)
    scores.append(s)
    details.append((a, f, cols))

scores = np.array(scores)
best_idx = int(np.argmax(scores))
best_p = population[best_idx].copy()
best_s = float(scores[best_idx])
best_acc, best_f1, best_cols = details[best_idx]

print(f"BaÅŸlangÄ±Ã§ | Score={best_s:.4f} (Acc={best_acc:.4f}, F1={best_f1:.4f}), k={len(best_cols)}")

# --- Ana dÃ¶ngÃ¼ ---
for g in range(n_gen):
    new_pop = []
    for i in range(pop_size):
        p = population[i].copy()
        for k,(lo,hi) in bounds.items():
            if rng.rand() < mut_p:
                p[k] = p[k] + rng.uniform(-0.1,0.1)*(hi-lo)
        new_pop.append(clamp_cast(p))

    for i in range(pop_size):
        for k,(lo,hi) in bounds.items():
            delta = best_p[k] - new_pop[i][k]
            new_pop[i][k] = new_pop[i][k] + attract*delta + rng.uniform(-flight, flight)
        new_pop[i] = clamp_cast(new_pop[i])

    scores, details = [], []
    for p in new_pop:
        s, a, f, cols = fitness_score(p)
        scores.append(s)
        details.append((a, f, cols))

    scores = np.array(scores)
    gen_best = int(np.argmax(scores))
    if scores[gen_best] > best_s:
        best_s = float(scores[gen_best])
        best_p = new_pop[gen_best].copy()
        best_acc, best_f1, best_cols = details[gen_best]

    population = new_pop
    print(f"Nesil {g+1}/{n_gen} | Score={best_s:.4f} (Acc={best_acc:.4f}, F1={best_f1:.4f}), k={len(best_cols)}")

print("\nâœ… MBO bitti.")
print("En iyi parametreler:")
for k,v in best_p.items():
    print(f" - {k}: {v}")

# --- Nihai test ---
X_tr_final = X_tr_bal[best_cols].values
X_te_final = X_te[best_cols].values

final = LGBMClassifier(
    random_state=RANDOM_STATE,
    class_weight="balanced",
    verbose=-1,
    n_jobs=-1,
    **{kk:vv for kk,vv in best_p.items() if kk!='k_feats'}
).fit(X_tr_final, y_tr_bal)

y_pred = final.predict(X_te_final)
acc = accuracy_score(y_te, y_pred)
f1m = f1_score(y_te, y_pred, average="macro")

print(f"\nðŸ“Š TEST (MBO+Balanced+best_k={len(best_cols)})")
print(f"Accuracy = {acc:.4f}")
print(f"Macro-F1 = {f1m:.4f}")


BaÅŸlangÄ±Ã§ | Score=0.6515 (Acc=0.6544, F1=0.6486), k=42
Nesil 1/8 | Score=0.6607 (Acc=0.6642, F1=0.6572), k=51
Nesil 2/8 | Score=0.6643 (Acc=0.6680, F1=0.6606), k=51
Nesil 3/8 | Score=0.6643 (Acc=0.6680, F1=0.6606), k=51
Nesil 4/8 | Score=0.6646 (Acc=0.6681, F1=0.6610), k=48
Nesil 5/8 | Score=0.6646 (Acc=0.6681, F1=0.6610), k=48
Nesil 6/8 | Score=0.6648 (Acc=0.6684, F1=0.6613), k=47
Nesil 7/8 | Score=0.6659 (Acc=0.6696, F1=0.6622), k=49
Nesil 8/8 | Score=0.6659 (Acc=0.6696, F1=0.6622), k=49

âœ… MBO bitti.
En iyi parametreler:
 - learning_rate: 0.02
 - num_leaves: 87
 - max_depth: 27
 - subsample: 0.9595193455783716
 - colsample_bytree: 0.6638216465257722
 - min_child_samples: 72
 - reg_alpha: 0.3811980548222325
 - reg_lambda: 0.9057365751040158
 - n_estimators: 633
 - k_feats: 49

ðŸ“Š TEST (MBO+Balanced+best_k=49)
Accuracy = 0.5247
Macro-F1 = 0.4560


In [13]:
# === AdÄ±m 8: MBO + XGBoost hiperparametre optimizasyonu (LabelEncoder + dÃ¼zgÃ¼n hizalÄ± sÃ¼rÃ¼m) ===
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.utils import check_random_state

RANDOM_STATE = 42
rng = check_random_state(RANDOM_STATE)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# --- Etiketleri sayÄ±sallaÅŸtÄ±r (XGBoost bunu ister) ---
label_enc = LabelEncoder()
y_tr_bal_enc = label_enc.fit_transform(y_tr_bal)
y_te_enc = label_enc.transform(y_te)

# --- Parametre aralÄ±klarÄ± (XGBoost) ---
bounds = {
    "learning_rate": (0.01, 0.3),
    "max_depth": (3, 15),
    "min_child_weight": (1, 10),
    "subsample": (0.6, 1.0),
    "colsample_bytree": (0.6, 1.0),
    "gamma": (0, 5),
    "reg_lambda": (0.5, 3),
    "n_estimators": (200, 1000),
    "k_feats": (20, 60)
}
int_keys = ["max_depth", "min_child_weight", "n_estimators", "k_feats"]

def clamp_cast(p):
    pp = {}
    for k, (lo, hi) in bounds.items():
        v = max(lo, min(hi, p[k]))
        if k in int_keys:
            v = int(round(v))
        pp[k] = v
    return pp

def sample_params():
    p = {k: rng.uniform(lo, hi) for k, (lo, hi) in bounds.items()}
    return clamp_cast(p)

# --- Fitness fonksiyonu (Accuracy + F1) / 2 ---
def fitness_score(params):
    params = clamp_cast(params)
    k = params.pop("k_feats")
    cols = mi_series.head(k).index
    X = X_tr_bal[cols].values
    y = y_tr_bal_enc

    accs, f1s = [], []
    for tr_idx, va_idx in cv.split(X, y):
        Xtr, Xva = X[tr_idx], X[va_idx]
        ytr, yva = y[tr_idx], y[va_idx]

        clf = XGBClassifier(
            random_state=RANDOM_STATE,
            n_jobs=1,
            tree_method="hist",
            objective="multi:softmax",
            num_class=len(np.unique(y_tr_bal_enc)),
            eval_metric="mlogloss",
            verbosity=0,
            **params
        )

        clf.fit(Xtr, ytr)
        yhat = clf.predict(Xva)

        accs.append(accuracy_score(yva, yhat))
        f1s.append(f1_score(yva, yhat, average="macro"))

    acc = float(np.mean(accs))
    f1m = float(np.mean(f1s))
    return 0.5 * acc + 0.5 * f1m, acc, f1m, cols

# --- MBO ayarlarÄ± ---
pop_size = 12
n_gen = 8
attract = 0.35
flight = 0.08
mut_p = 0.15

# --- PopÃ¼lasyon baÅŸlat ---
population = [sample_params() for _ in range(pop_size)]
scores, details = [], []

for p in population:
    s, a, f, cols = fitness_score(p)
    scores.append(s)
    details.append((a, f, cols))

scores = np.array(scores)
best_idx = int(np.argmax(scores))
best_p = population[best_idx].copy()
best_s = float(scores[best_idx])
best_acc, best_f1, best_cols = details[best_idx]

print(f"BaÅŸlangÄ±Ã§ | Score={best_s:.4f} (Acc={best_acc:.4f}, F1={best_f1:.4f}), k={len(best_cols)}")

# --- MBO ana dÃ¶ngÃ¼sÃ¼ ---
for g in range(n_gen):
    new_pop = []
    for i in range(pop_size):
        p = population[i].copy()
        for k, (lo, hi) in bounds.items():
            if rng.rand() < mut_p:
                p[k] = p[k] + rng.uniform(-0.1, 0.1) * (hi - lo)
        new_pop.append(clamp_cast(p))

    for i in range(pop_size):
        for k, (lo, hi) in bounds.items():
            delta = best_p[k] - new_pop[i][k]
            new_pop[i][k] = new_pop[i][k] + attract * delta + rng.uniform(-flight, flight)
        new_pop[i] = clamp_cast(new_pop[i])

    scores, details = [], []
    for p in new_pop:
        s, a, f, cols = fitness_score(p)
        scores.append(s)
        details.append((a, f, cols))

    scores = np.array(scores)
    gen_best = int(np.argmax(scores))
    if scores[gen_best] > best_s:
        best_s = float(scores[gen_best])
        best_p = new_pop[gen_best].copy()
        best_acc, best_f1, best_cols = details[gen_best]

    population = new_pop
    print(f"Nesil {g+1}/{n_gen} | Score={best_s:.4f} (Acc={best_acc:.4f}, F1={best_f1:.4f}), k={len(best_cols)}")

print("\nâœ… MBO + XGBoost tamamlandÄ±.")
print("En iyi parametreler:")
for k, v in best_p.items():
    print(f" - {k}: {v}")

# --- Nihai test ---
X_tr_final = X_tr_bal[best_cols].values
X_te_final = X_te[best_cols].values

final = XGBClassifier(
    random_state=RANDOM_STATE,
    n_jobs=-1,
    tree_method="hist",
    objective="multi:softmax",
    num_class=len(np.unique(y_tr_bal_enc)),
    eval_metric="mlogloss",
    verbosity=0,
    **{kk: vv for kk, vv in best_p.items() if kk != "k_feats"}
).fit(X_tr_final, y_tr_bal_enc)

y_pred_enc = final.predict(X_te_final)
y_pred = label_enc.inverse_transform(y_pred_enc)

acc = accuracy_score(y_te, y_pred)
f1m = f1_score(y_te, y_pred, average="macro")

print(f"\nðŸ“Š TEST (MBO + XGBoost + best_k={len(best_cols)})")
print(f"Accuracy = {acc:.4f}")
print(f"Macro-F1 = {f1m:.4f}")


BaÅŸlangÄ±Ã§ | Score=0.6674 (Acc=0.6712, F1=0.6637), k=44
Nesil 1/8 | Score=0.6674 (Acc=0.6712, F1=0.6637), k=44
Nesil 2/8 | Score=0.6697 (Acc=0.6740, F1=0.6653), k=49
Nesil 3/8 | Score=0.6697 (Acc=0.6740, F1=0.6653), k=49
Nesil 4/8 | Score=0.6697 (Acc=0.6740, F1=0.6653), k=49
Nesil 5/8 | Score=0.6697 (Acc=0.6740, F1=0.6653), k=49
Nesil 6/8 | Score=0.6697 (Acc=0.6740, F1=0.6653), k=49
Nesil 7/8 | Score=0.6697 (Acc=0.6740, F1=0.6653), k=49
Nesil 8/8 | Score=0.6697 (Acc=0.6740, F1=0.6653), k=49

âœ… MBO + XGBoost tamamlandÄ±.
En iyi parametreler:
 - learning_rate: 0.13708630408502848
 - max_depth: 14
 - min_child_weight: 5
 - subsample: 0.8020186825666225
 - colsample_bytree: 0.6719067711227534
 - gamma: 1.2511997872425664
 - reg_lambda: 1.0593203229409496
 - n_estimators: 681
 - k_feats: 49

ðŸ“Š TEST (MBO + XGBoost + best_k=49)
Accuracy = 0.5229
Macro-F1 = 0.4542
