In [4]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
import xgboost as xgb

In [5]:
df = pd.read_parquet('data-val.parquet')

In [6]:
# Dropping columns
df = df.drop(columns=['Age_x','CIF_CLSCUS', 'COB_DATE', 'DATE_TIME', 'BRN_OPN_CIF', 'MA_PHONG_GIAO_DICH_VCB', 'CIF_MASK', 'IS_TM', 'Unnamed: 0', 'SUM_CBALQ_LH_6m', 'SUM_CBALQ_LH_3m', 'AVG_GR_SUM_CBALQ_LH'])

In [7]:
# Replacing inf values with nan
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [8]:
# Model

# Split features and target
X = df.drop(columns=['IS_BANCAS'])
y = df['IS_BANCAS']


# Handle categorical variables
# For simplicity, using one-hot encoding, though other methods (target encoding) could be considered
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', missing=np.nan)

# Fit the model
xgb_clf.fit(X_train, y_train)
# Make predictions
y_pred = xgb_clf.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



In [9]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
accuracy = round(accuracy, 4)
print("Model Accuracy:", str(accuracy*100) + "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model Accuracy: 95.65%

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.99      0.98     25995
         1.0       0.66      0.24      0.35      1347

    accuracy                           0.96     27342
   macro avg       0.81      0.61      0.66     27342
weighted avg       0.95      0.96      0.95     27342



In [10]:
roc_auc = roc_auc_score(y_test, y_pred)
gini_index = 2 * roc_auc - 1

print("ROC AUC Score:", roc_auc)
print("Gini Index:", gini_index)

ROC AUC Score: 0.6149433397119799
Gini Index: 0.2298866794239598
