In [None]:
from sklearn.metrics import accuracy_score, auc, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
import xgboost as xgb

In [None]:
# Read the Parquet file 'data-val.parquet' (path relative to the working directory) into a DataFrame
df = pd.read_parquet(path='data-val.parquet')

In [None]:
# Remove the listed columns from df. Labels are passed positionally with axis=1,
# which targets columns; pandas raises KeyError if any label is missing.
df = df.drop(
    [
        'Age_x',
        'CIF_CLSCUS',
        'COB_DATE',
        'DATE_TIME',
        'BRN_OPN_CIF',
        'MA_PHONG_GIAO_DICH_VCB',
        'CIF_MASK',
        'IS_TM',
        'Unnamed: 0',
        'SUM_CBALQ_LH_6m',
        'SUM_CBALQ_LH_3m',
        'AVG_GR_SUM_CBALQ_LH',
    ],
    axis=1,
)

In [None]:
# Swap +inf and -inf for NaN, modifying df in place
df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

In [None]:
# Model

# Separate the predictors from the target column
X = df.drop('IS_BANCAS', axis=1)
y = df['IS_BANCAS']

# One-hot encode the categorical predictors, dropping the first level of each.
# Kept simple on purpose; target encoding would be an alternative.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows as a test set, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base XGBoost classifier; NaN is the marker for missing values
xgb_clf = xgb.XGBClassifier(missing=np.nan, eval_metric='logloss')

# Candidate values for each hyper-parameter explored by the search
param_grid = dict(
    n_estimators=[100, 200, 300, 500],      # number of trees
    learning_rate=[0.01, 0.05, 0.1, 0.2],   # step size shrinkage
    max_depth=[3, 5, 7, 10],                # maximum tree depth
    subsample=[0.6, 0.8, 1.0],              # fraction of samples used for training
    colsample_bytree=[0.6, 0.8, 1.0],       # fraction of features used per tree
    gamma=[0, 0.1, 0.5, 1.0],               # minimum loss reduction for a split
    reg_lambda=[0.1, 1.0, 10],              # L2 regularization
    reg_alpha=[0, 0.1, 1.0],                # L1 regularization
)

# Randomized search: 50 sampled settings, 3-fold cross-validation, scored by
# accuracy, seeded for reproducibility, using all available cores
random_search = RandomizedSearchCV(
    xgb_clf,
    param_grid,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

# Run the search (fit returns the search object itself), keep the refitted
# best estimator and predict class labels for the held-out rows
best_model = random_search.fit(X_train, y_train).best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Evaluate the model on the held-out test set
accuracy = round(accuracy_score(y_test, y_pred), 4)

# Format with the percent spec instead of str(accuracy * 100) + "%": multiplying
# the rounded float by 100 can expose binary floating-point noise
# (e.g. 0.57 * 100 prints as 56.99999999999999).
print(f"Model Accuracy: {accuracy:.2%}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


In [None]:
# ROC AUC and Gini index on the held-out test set.
#
# The AUC is computed from the predicted probability of the positive class
# rather than from the hard labels in y_pred: scoring hard labels reduces the
# ROC curve to a single operating point and misstates ranking quality.
# Assumes IS_BANCAS is a binary target, so column 1 of predict_proba is the
# positive class -- TODO confirm.
y_score = best_model.predict_proba(X_test)[:, 1]

# Store the result under its own name. Rebinding `roc_auc_score` would shadow
# the imported sklearn function and break this cell on a second run, and the
# Gini formula needs a value defined in this cell, not in a later one.
roc_auc = roc_auc_score(y_test, y_score)
gini_index = 2 * roc_auc - 1

print("ROC AUC Score:", round(float(roc_auc), 2))
print("Gini Index:", round(float(gini_index), 2))

In [None]:

# Step 4: Evaluate predictions (ROC curve)
#
# The curve is built from the predicted probability of the positive class.
# Passing the hard labels in y_pred would give roc_curve only one usable
# threshold and therefore a degenerate three-point curve.
# Assumes IS_BANCAS is a binary target, so column 1 of predict_proba is the
# positive class -- TODO confirm.
y_score = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
# roc_curve already starts the curve at (0, 0), so no origin point is prepended.

# Colour palette for the dark theme
background_color = '#181926'  # near-black figure/axes background
text_colour = "#cad3f5"       # labels, title and legend text
axis_colour = "#b8c0e0"       # ticks, spines and grid
roc_colour = '#f5a97f'        # ROC line
auc_colour = "#eed49f"        # shaded area under the curve
guess_colour = '#8aadf4'      # diagonal reference line

fig, ax = plt.subplots(figsize=(8, 6), facecolor=background_color)
ax.set_facecolor(background_color)

# Colour the ticks on both axes and all four spines
ax.tick_params(axis='both', colors=axis_colour)
for spine in ax.spines.values():
    spine.set_color(axis_colour)

# Shade the area under the ROC curve
ax.fill_between(fpr, tpr, color=auc_colour, alpha=0.3, label='AUC Region')

# ROC curve and the diagonal of a random classifier
ax.plot(fpr, tpr, color=roc_colour, label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color=guess_colour, linestyle='--', label='Random Guess')

# Light text for contrast against the dark background
ax.set_xlabel('False Positive Rate', color=text_colour)
ax.set_ylabel('True Positive Rate', color=text_colour)
ax.set_title('Receiver Operating Characteristic (ROC) Curve', color=text_colour)

# Legend labels default to the global text colour, which is unreadable on the
# dark legend face, so recolour them explicitly.
legend = ax.legend(loc='lower right', facecolor=background_color, edgecolor=text_colour)
for legend_text in legend.get_texts():
    legend_text.set_color(text_colour)

ax.grid(alpha=0.1, color=axis_colour)  # faint grid in the axis colour

# Both rates are bounded to [0, 1]
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)

plt.show()