In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier


In [3]:
# Print the installed CatBoost version so the run can be reproduced later.
import catboost
print(catboost.__version__)


1.2.8


In [15]:
def normalized_gini(y_true, y_pred_proba):
    """Return the normalized Gini coefficient of a set of scores.

    The value is derived from the ROC AUC as ``2 * AUC - 1``.

    Args:
        y_true: Ground-truth labels, passed straight to ``roc_auc_score``.
        y_pred_proba: Predicted scores, passed straight to ``roc_auc_score``.

    Returns:
        ``2 * roc_auc_score(y_true, y_pred_proba) - 1``.
    """
    return 2 * roc_auc_score(y_true, y_pred_proba) - 1


In [16]:
# Load the raw training table from the working directory.
df = pd.read_csv("training_data.csv")

In [17]:
# Name of the label column in the training table.
target_col = "target"

# Labels first, then every remaining column as candidate features.
y = df[target_col]
X = df.drop(columns=[target_col])


In [18]:
# Columns with dtype "object" are discarded rather than encoded;
# `cat_cols` is kept so the same columns can be dropped from other frames.
object_features = X.select_dtypes(include=["object"])
cat_cols = object_features.columns
X = X.drop(columns=cat_cols)


In [19]:
# Fill missing values with per-column medians and rebuild a DataFrame with the
# original column labels.
# NOTE(review): the imputer is fitted on every row before the CV split, so the
# medians also reflect rows that later serve as validation folds.
imputer = SimpleImputer(strategy="median")
imputed_values = imputer.fit_transform(X)
X = pd.DataFrame(imputed_values, columns=X.columns)


In [20]:
# Class balance: ratio of negative (0) to positive (1) labels.
# NOTE(review): this value is only printed; no later cell visible here passes
# it to the model, which sets auto_class_weights="Balanced" instead.
pos = y.eq(1).sum()
neg = y.eq(0).sum()

scale_pos_weight = neg / pos
print("scale_pos_weight:", scale_pos_weight)


scale_pos_weight: 26.436992221261885


In [23]:
# Per-fold metric accumulators.
auc_scores = []
gini_scores = []

# One out-of-fold prediction slot per training row, filled during CV.
oof_preds = np.zeros(X.shape[0])

# Seeded, shuffled 5-fold stratified splitter.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [24]:
# 5-fold stratified cross-validation of a CatBoost classifier.
# The splitter and every accumulator are (re)created here so the cell gives the
# same result when re-run on its own, without relying on an earlier setup cell.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))  # out-of-fold probability for every training row
auc_scores = []
gini_scores = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold+1}")

    # Positional indexing: the split indices are row positions, not labels.
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    model = CatBoostClassifier(
        iterations=15000,
        learning_rate=0.03,
        depth=5,
        l2_leaf_reg=20,
        auto_class_weights="Balanced",
        subsample=0.9,
        colsample_bylevel=0.7,
        random_strength=1.0,
        bagging_temperature=1.0,
        eval_metric="AUC",
        loss_function="Logloss",
        random_seed=42,
        verbose=False,
        use_best_model=True
    )

    # NOTE(review): the validation fold drives early stopping / best-iteration
    # selection and is also the fold that gets scored below, so the reported
    # fold AUCs are likely somewhat optimistic.
    model.fit(
        X_tr, y_tr,
        eval_set=(X_va, y_va),
        early_stopping_rounds=500
    )

    # Positive-class probabilities for the held-out rows.
    va_preds = model.predict_proba(X_va)[:, 1]

    # Write this fold's predictions into the validation rows' positions so that
    # oof_preds ends up with exactly one out-of-fold prediction per row.
    oof_preds[va_idx] = va_preds

    fold_auc = roc_auc_score(y_va, va_preds)
    auc_scores.append(fold_auc)
    gini_scores.append(2 * fold_auc - 1)

    print(f"Fold AUC: {fold_auc:.4f}")

# Pooled metric over all out-of-fold predictions.
oof_auc = roc_auc_score(y, oof_preds)
oof_gini = 2 * oof_auc - 1

print("\n==== FOLD RESULTS ====")
print("Mean Fold AUC:", np.mean(auc_scores))
print("Mean Fold Gini:", 2 * np.mean(auc_scores) - 1)

print("\n==== OOF CV RESULTS ====")
print("OOF AUC:", oof_auc)
print("OOF Gini:", oof_gini)



Fold 1
Fold AUC: 0.6366

Fold 2
Fold AUC: 0.6377

Fold 3
Fold AUC: 0.6362

Fold 4
Fold AUC: 0.6431

Fold 5
Fold AUC: 0.6459

==== FOLD RESULTS ====
Mean Fold AUC: 0.6398894827801916
Mean Fold Gini: 0.2797789655603833

==== OOF CV RESULTS ====
OOF AUC: 0.6397996263328194
OOF Gini: 0.2795992526656388


In [None]:
# Overfitting check: compare AUC on the training rows with AUC on the
# validation rows.
# NOTE(review): `model`, `X_tr`, `y_tr`, `X_va` and `y_va` are whatever the CV
# loop left bound after its final iteration, so this covers the last fold only
# and each list ends up with a single entry.
train_auc_scores = []
val_auc_scores = []

# Positive-class probabilities on both partitions.
y_tr_pred = model.predict_proba(X_tr)[:, 1]
y_va_pred = model.predict_proba(X_va)[:, 1]

train_auc = roc_auc_score(y_tr, y_tr_pred)
val_auc = roc_auc_score(y_va, y_va_pred)

train_auc_scores.append(train_auc)
val_auc_scores.append(val_auc)

print(f"Train AUC: {train_auc:.4f} | Val AUC: {val_auc:.4f}")


In [27]:
# # 1. Load the unseen data
# test_df = pd.read_csv("test_data.csv")

# # 2. Match the preprocessing (Drop cats and use the ALREADY FITTED imputer)
# X_test = test_df.drop(columns=cat_cols, errors='ignore')
# X_test_preprocessed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# # 3. Predict probabilities using your trained model
# test_probs = model.predict_proba(X_test_preprocessed)[:, 1]

# # 4. Create the final file
# submission = pd.DataFrame({
#     'index': test_df.index,
#     'target_probability': test_probs
# })

# submission.to_csv("test_predictions.csv", index=False)

# print("Process Complete. Predictions saved to test_predictions.csv")

Process Complete. Predictions saved to test_predictions.csv


In [30]:
# 1. SAVE THE CATBOOST OOF FILE
# One row per training example: its position, its out-of-fold probability from
# the CV loop, and its true label.
cat_oof_df = pd.DataFrame({
    "row_idx": range(len(y)),
    "cat_oof": oof_preds,
    "target": y
})
cat_oof_df.to_csv("cat_oof.csv", index=False)

# 2. GENERATE AND SAVE THE CATBOOST TEST FILE
# Load unseen data
test_df = pd.read_csv("test_data.csv")

# Select exactly the features the imputer and model were fitted on, in the same
# order. Dropping only `cat_cols` would leave any extra column in the test file
# (or a different column order) to trip the fitted imputer's feature check.
feature_cols = list(X.columns)
missing_cols = [col for col in feature_cols if col not in test_df.columns]
if missing_cols:
    raise ValueError(
        f"test_data.csv is missing training feature columns: {missing_cols}"
    )
X_test = test_df[feature_cols]

# Reuse the ALREADY FITTED imputer; never refit on test data.
X_test_preprocessed = pd.DataFrame(
    imputer.transform(X_test),
    columns=feature_cols,
    index=X_test.index,
)

# Predict positive-class probabilities.
# NOTE(review): `model` is the classifier left over from the final CV fold, so
# these test predictions come from that single fold's model rather than from an
# average over all folds.
cat_test_preds = model.predict_proba(X_test_preprocessed)[:, 1]

# Save Test file
cat_test_df = pd.DataFrame({
    "row_idx": range(len(cat_test_preds)),
    "cat_test": cat_test_preds
})
cat_test_df.to_csv("cat_test.csv", index=False)

print("Both CAT OOF and TEST files saved successfully.")

Both CAT OOF and TEST files saved successfully.
