In [1]:
!pip install xgboost



In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from collections import Counter

In [3]:
df_standard = pd.read_csv('df.eneemr')

https://xgboost.readthedocs.io/en/latest/index.html
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

KLASSENIMBALANZ BERÃœCKSICHTIGT

In [4]:
leakage_features = [
    "summonerLevel",
    "winrate",
    "smurf_score",
    "smurf_ratio"
]

df_model = df_standard.copy()

X = (
    df_model
    .select_dtypes(include=["float64", "int64"])
    .drop(columns=["smurf_flag"] + leakage_features, errors="ignore")
)

y = df_model["smurf_flag"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

In [5]:
counter = Counter(y_train)
scale_pos_weight = counter[0] / counter[1]

print("Class distribution:", counter)
print("scale_pos_weight:", scale_pos_weight)

Class distribution: Counter({0: 19272, 1: 498})
scale_pos_weight: 38.69879518072289


In [6]:
# Scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# PCA
pca = PCA(n_components=0.45, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [7]:
# Random Forest (balanced)

rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    class_weight="balanced"
)

rf.fit(X_train_pca, y_train)
y_pred_rf = rf.predict(X_test_pca)
y_prob_rf = rf.predict_proba(X_test_pca)[:, 1]

print("\nRandom Forest:")
print(classification_report(y_test, y_pred_rf, zero_division=0))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))
print("PR-AUC:", average_precision_score(y_test, y_prob_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# XGBoost (balanced)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    scale_pos_weight=scale_pos_weight,
    eval_metric="logloss",
    random_state=42
)

xgb.fit(X_train_pca, y_train)
y_pred_xgb = xgb.predict(X_test_pca)
y_prob_xgb = xgb.predict_proba(X_test_pca)[:, 1]

print("\nXGBoost:")
print(classification_report(y_test, y_pred_xgb, zero_division=0))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_xgb))
print("PR-AUC:", average_precision_score(y_test, y_prob_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))


Random Forest:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      8260
           1       1.00      0.72      0.84       214

    accuracy                           0.99      8474
   macro avg       1.00      0.86      0.92      8474
weighted avg       0.99      0.99      0.99      8474

ROC-AUC: 0.9400126722635831
PR-AUC: 0.8072138394261024
Confusion Matrix:
 [[8260    0]
 [  60  154]]

XGBoost:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8260
           1       0.66      0.74      0.70       214

    accuracy                           0.98      8474
   macro avg       0.83      0.86      0.84      8474
weighted avg       0.98      0.98      0.98      8474

ROC-AUC: 0.9161939082618633
PR-AUC: 0.7729001836773248
Confusion Matrix:
 [[8179   81]
 [  56  158]]


In [1]:
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(X_test_pca[:, 0], X_test_pca[:, 1], X_test_pca[:, 2], c=y_test, cmap='coolwarm', alpha=0.6)
ax.set_xlabel('PCA 1')
ax.set_ylabel('PCA 2')
ax.set_zlabel('PCA 3')
ax.set_title('PCA 3D-Plot with smurf_flag')
legend1 = ax.legend(*scatter.legend_elements(), title="smurf_flag")
ax.add_artist(legend1)
plt.show()

NameError: name 'plt' is not defined