In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier


In [44]:
def normalized_gini(y_true, y_pred_proba):
    auc = roc_auc_score(y_true, y_pred_proba)
    return 2 * auc - 1


In [45]:
df = pd.read_csv("training_data.csv")

In [46]:
target_col = "target"

X = df.drop(columns=[target_col])
y = df[target_col]


In [47]:
cat_cols = X.select_dtypes(include=["object"]).columns
X = X.drop(columns=cat_cols)


In [48]:
imputer = SimpleImputer(strategy="median")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)


In [49]:
neg = (y == 0).sum()
pos = (y == 1).sum()

scale_pos_weight = neg/pos
print("scale_pos_weight:", scale_pos_weight)


scale_pos_weight: 26.436992221261885


In [50]:
# from sklearn.model_selection import StratifiedKFold

# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


In [51]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score

# auc_scores = []

# for train_idx, val_idx in skf.split(X, y):

#     X_tr, X_va = X.iloc[train_idx], X.iloc[val_idx]
#     y_tr, y_va = y.iloc[train_idx], y.iloc[val_idx]

#     scaler = StandardScaler()
#     X_tr = scaler.fit_transform(X_tr)
#     X_va = scaler.transform(X_va)

#     model = LogisticRegression(
#         class_weight="balanced",
#         max_iter=1000
#     )

#     model.fit(X_tr, y_tr)

#     preds = model.predict_proba(X_va)[:, 1]
#     auc_scores.append(roc_auc_score(y_va, preds))


In [52]:
# mean_auc = np.mean(auc_scores)
# gini = 2 * mean_auc - 1

# print("AUC:", mean_auc)
# print("Normalized Gini:", gini)


In [53]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []
gini_scores = []


In [56]:
for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold+1}")

    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

CatBoostClassifier(
    iterations=4000,
    depth=8,
    learning_rate=0.02,
    l2_leaf_reg=8,
    eval_metric="AUC",
    auto_class_weights="Balanced",
    subsample=0.8,
    colsample_bylevel=0.8,
    random_seed=42
)


model.fit(
    X_tr, y_tr,
    eval_set=(X_va, y_va)
    )

y_va_pred = model.predict_proba(X_va)[:, 1]

auc = roc_auc_score(y_va, y_va_pred)
gini = 2 * auc - 1

auc_scores.append(auc)
gini_scores.append(gini)

print(f"AUC: {auc:.4f} | Gini: {gini:.4f}")
print("\n==== FINAL CV RESULTS ====")
print("Mean AUC:", np.mean(auc_scores))
print("Mean Gini:", np.mean(gini_scores))



Fold 1

Fold 2

Fold 3

Fold 4

Fold 5
AUC: 0.6434 | Gini: 0.2867

==== FINAL CV RESULTS ====
Mean AUC: 0.6433646369290827
Mean Gini: 0.2867292738581655


In [57]:
train_auc_scores = []
val_auc_scores = []

# TRAIN predictions
y_tr_pred = model.predict_proba(X_tr)[:, 1]
train_auc = roc_auc_score(y_tr, y_tr_pred)

# VALIDATION predictions
y_va_pred = model.predict_proba(X_va)[:, 1]
val_auc = roc_auc_score(y_va, y_va_pred)

train_auc_scores.append(train_auc)
val_auc_scores.append(val_auc)

print(f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f}")


Train AUC: 0.6825 | Val AUC: 0.6434
