In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
# --- Configuration ---
DATA_PATH_FEATURES = '../data/elliptic_txs_features.csv'
DATA_PATH_CLASSES = '../data/elliptic_txs_classes.csv'
# Raw label strings in the classes file -> binary target (1 = fraud, 0 = licit).
# Any other value (e.g. 'unknown') is treated as unlabeled and excluded.
CLASS_LABEL_MAP = {'1': 1, '2': 0}
TEST_SIZE = 0.3
SEED = 42

# --- Load ---
print("Loading data...")
df_classes = pd.read_csv(DATA_PATH_CLASSES)
# The features file is read without a header row; only the first two columns
# are given names, the remaining columns keep their integer positions as names.
df_features = pd.read_csv(DATA_PATH_FEATURES, header=None).rename(
    columns={0: 'txId', 1: 'time_step'}
)
df_merged = pd.merge(df_features, df_classes, on='txId', how='left')

# --- Label ---
# Map first, then filter on the mapped result: this drops 'unknown' rows *and*
# rows whose class is missing after the left merge (or holds an unexpected
# value), so no NaN targets can reach the model.
mapped_labels = df_merged['class'].map(CLASS_LABEL_MAP)
df_clean = df_merged.loc[mapped_labels.notna()].copy()
df_clean['class'] = mapped_labels[mapped_labels.notna()].astype(int)

# Identifier, target and time_step columns are not used as model inputs.
X = df_clean.drop(columns=['txId', 'class', 'time_step'])
y = df_clean['class']

# --- Split ---
# Stratify so the minority (fraud) share is the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# --- Class weighting ---
num_neg = (y_train == 0).sum()
num_pos = (y_train == 1).sum()
if num_pos == 0:
    raise ValueError("No positive (fraud) samples in the training split; cannot compute scale_pos_weight.")
# Ratio of negatives to positives, passed to XGBoost to up-weight the positive class.
scale_weight = num_neg / num_pos

# --- Train ---
print(f"Training XGBoost with scale_pos_weight={scale_weight:.2f}...")
# `use_label_encoder` is intentionally not passed: XGBoost reports it as an
# unused parameter and emits a warning when it is supplied.
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_weight,
    eval_metric='logloss',
    random_state=SEED
)

model.fit(X_train, y_train)

# --- Evaluate ---
print("\n--- XGBoost Evaluation ---")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Licit (0)', 'Fraud (1)']))

Loading data...
Training XGBoost with scale_pos_weight=9.31...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- XGBoost Evaluation ---
              precision    recall  f1-score   support

   Licit (0)       0.99      0.99      0.99     12587
   Fraud (1)       0.93      0.95      0.94      1383

    accuracy                           0.99     13970
   macro avg       0.96      0.97      0.97     13970
weighted avg       0.99      0.99      0.99     13970

