In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, precision_score
import lightgbm as lgb


In [9]:
# Load features
X = np.load("X_mgtab_large.npy")
y = np.load("y_mgtab_large.npy")

print("X shape:", X.shape)
print("Bot ratio:", y.mean())


X shape: (10199, 790)
Bot ratio: 0.26943818021374644


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [11]:
iso = IsolationForest(
    n_estimators=200,
    contamination=0.1,
    random_state=42,
    n_jobs=-1
)

iso.fit(X_train)

# Anomaly score (higher = more suspicious)
anomaly_train = -iso.score_samples(X_train)
anomaly_test = -iso.score_samples(X_test)

# Normalize to [0,1]
anomaly_train = (anomaly_train - anomaly_train.min()) / (anomaly_train.max() - anomaly_train.min())
anomaly_test = (anomaly_test - anomaly_test.min()) / (anomaly_test.max() - anomaly_test.min())


In [12]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

lgb_model.fit(X_train, y_train)

# Bot probability
bot_prob_train = lgb_model.predict_proba(X_train)[:,1]
bot_prob_test = lgb_model.predict_proba(X_test)[:,1]


[LightGBM] [Info] Number of positive: 2061, number of negative: 5588
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.111126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 198400
[LightGBM] [Info] Number of data points in the train set: 7649, number of used features: 790
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269447 -> initscore=-0.997430
[LightGBM] [Info] Start training from score -0.997430




In [13]:
degree_train = X_train[:, -1]
degree_test = X_test[:, -1]

# Normalize
network_risk_train = (degree_train - degree_train.min()) / (degree_train.max() - degree_train.min() + 1e-6)
network_risk_test = (degree_test - degree_test.min()) / (degree_test.max() - degree_test.min() + 1e-6)


In [14]:
def fuse_risk(bot_prob, anomaly, network):
    return (
        0.40 * bot_prob +
        0.35 * anomaly +
        0.25 * network
    )


In [15]:
risk_train = fuse_risk(bot_prob_train, anomaly_train, network_risk_train)
risk_test = fuse_risk(bot_prob_test, anomaly_test, network_risk_test)


In [16]:
auc = roc_auc_score(y_test, risk_test)
precision = precision_score(y_test, risk_test > 0.6)

print("ROC-AUC:", round(auc, 4))
print("Precision@0.6:", round(precision, 4))


ROC-AUC: 0.9532
Precision@0.6: 0.9126


In [20]:
import joblib

joblib.dump(lgb_model, "mgtab_lightgbm.pkl")
print("[SAVED] LightGBM model")


[SAVED] LightGBM model


In [21]:
joblib.dump(iso, "mgtab_isolation_forest.pkl")
print("[SAVED] Isolation Forest model")


[SAVED] Isolation Forest model
