In [9]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score

In [10]:
data_df = pd.read_csv('data.csv')

In [11]:
# Dropping columns
data_df = data_df.drop(columns=['Age_x', 'Unnamed: 0.1', 'Unnamed: 0', 'CIF_CLSCUS', 'COB_DATE', 'DATE_TIME', 'BRN_OPN_CIF', 'MA_PHONG_GIAO_DICH_VCB', 'CIF_MASK'])
data_df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [12]:
target = 'IS_BANCAS'  # Replace with your target column
features = [col for col in data_df.columns if col != target]

# Check for categorical featuresC
categorical_features = data_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Split data into train and test
X = data_df[features]
y = data_df[target]

# Replace NaN in categorical features with a placeholder
for cat_col in categorical_features:
    X.loc[:, cat_col] = X[cat_col].astype(str).fillna("missing")


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for cat_col in categorical_features:
    X_train.loc[:, cat_col] = X_train[cat_col].astype(str).fillna("missing")
    X_test.loc[:, cat_col] = X_test[cat_col].astype(str).fillna("missing")

# Create CatBoost Pool objects
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)

# Initialize and train the CatBoost model
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    cat_features=categorical_features,
    loss_function='Logloss',
    verbose=100
)

model.fit(train_pool)

# Evaluate the model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_pred)
gini_index = 2 * roc_auc - 1

print("ROC AUC Score:", roc_auc)
print("Gini Index:", gini_index)

# Save the model
# model.save_model("catboost_model.cbm")

0:	learn: 0.5313152	total: 1.12s	remaining: 18m 43s
100:	learn: 0.1226616	total: 1m 41s	remaining: 15m 6s
200:	learn: 0.1202277	total: 3m 20s	remaining: 13m 17s
300:	learn: 0.1186528	total: 4m 57s	remaining: 11m 31s
400:	learn: 0.1172788	total: 6m 35s	remaining: 9m 50s
500:	learn: 0.1161064	total: 8m 13s	remaining: 8m 11s
600:	learn: 0.1150516	total: 9m 50s	remaining: 6m 31s
700:	learn: 0.1140403	total: 11m 28s	remaining: 4m 53s
800:	learn: 0.1130598	total: 13m 6s	remaining: 3m 15s
900:	learn: 0.1121365	total: 14m 43s	remaining: 1m 37s
999:	learn: 0.1112909	total: 16m 19s	remaining: 0us
Accuracy: 0.962263024194845
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98    207338
         1.0       0.73      0.33      0.45     10353

    accuracy                           0.96    217691
   macro avg       0.85      0.66      0.72    217691
weighted avg       0.96      0.96      0.96    217691

ROC AUC Score: 0.660380250418389
Gini Index: 0.3207