# 0. Import modules

In [13]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, average_precision_score, make_scorer
import pandas as pd
import numpy as np

# 1. Load and prepare data

In [9]:
# Load data
data = pd.read_csv("AlarmGrundlag_ModelParametre_Merged1.1.csv", delimiter=";")
data = data.drop(columns=["Customer_Refnr", "RUN_DATE", "CASE_CLOSE_DATE", "SCENARIO_NAME", "ALERT_ID", "CASE_ID", "Customer_Risk_Profile_Current"])
data['CASE_STATUS_CODE'] = data['CASE_STATUS_CODE'].replace({'C': 0, 'R': 1})
data.dropna(subset=['Customer_Risk_Profile_BeforeAlert'], inplace=True)
data = pd.get_dummies(data, columns=['Customer_Risk_Profile_BeforeAlert'], prefix='RiskGroup')
# Replace infinities with NaN for easier handling
data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with any NaNs that might have been infinities initially
data.dropna(inplace=True)

  data['CASE_STATUS_CODE'] = data['CASE_STATUS_CODE'].replace({'C': 0, 'R': 1})


## 1.1 Log1p Transformation

In [10]:
columns_to_log = ['Express_Ratio_SumDKK', 'Express_Ratio_Count', 'MobilePay_Count_DebitCreditRatio', 'MobilePay_Sum_DebitCreditRatio']
for column in columns_to_log:
    data[column] = np.log1p(data[column])

## 1.2 Split into X and y

In [11]:
y = data['CASE_STATUS_CODE']
X = data.drop('CASE_STATUS_CODE', axis=1)

## 1.3 Split into Train and Test (Validation is handled in the cross-validation)

In [12]:
# Assuming X_train, X_test, y_train, y_test are already defined from:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## 1.4 No PCA for Random Forest

# 2. Modelling

In [14]:
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2, 0.3]
}

xgb_clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='aucpr', seed=42)

# Custom scorer for AUPRC
auprc_scorer = make_scorer(average_precision_score, needs_proba=True)

# Setting up GridSearchCV with the AUPRC scorer
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring=auprc_scorer, n_jobs=-1, cv=3, verbose=3)
grid_search.fit(X_train, y_train)



Fitting 3 folds for each of 1296 candidates, totalling 3888 fits


In [15]:
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred_prob = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Evaluation metrics
print("Best parameters found: ", best_params)
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_prob))

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
pr_auc = auc(recall, precision)
print("AU-PRC:", pr_auc)
# Prepare a DMatrix if needed for separate direct XGBoost CV (not needed for GridSearchCV)
data_dmatrix = xgb.DMatrix(data=X, label=y)

# Cross-validation
cv_results = xgb.cv(dtrain=data_dmatrix, params=best_params, nfold=5,
                    num_boost_round=100, early_stopping_rounds=10,
                    metrics="aucpr", as_pandas=True, seed=42)

print(f"Mean AUPRC of the test set: {cv_results['test-aucpr-mean'].iloc[-1]}")  # mean AUPRC score of the test set

Best parameters found:  {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      1900
           1       0.86      0.69      0.76       420

    accuracy                           0.92      2320
   macro avg       0.90      0.83      0.86      2320
weighted avg       0.92      0.92      0.92      2320

ROC-AUC Score: 0.9328984962406015
AU-PRC: 0.837808632065747


Parameters: { "n_estimators" } are not used.



Mean AUPRC of the test set: 0.7913416991748637
