In [1]:
import pandas as pd

df = pd.read_csv('/kaggle/input/processed-churn-data/processed_churn_data.csv')


In [2]:
import sklearn
print("scikit-learn version:", sklearn.__version__)


scikit-learn version: 1.2.2


In [3]:
import imblearn
print("imbalanced-learn version:", imblearn.__version__)


imbalanced-learn version: 0.10.1


In [4]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Assuming 'df' is your fully processed DataFrame with features & target 'Churn'
X = df.drop(columns=['Churn'])
y = df['Churn']

# Split again just to ensure reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to balance classes on training set only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"After SMOTE, training data shape: {X_train_smote.shape}, Churn distribution: {y_train_smote.value_counts(normalize=True)}")


After SMOTE, training data shape: (8260, 26), Churn distribution: Churn
0    0.5
1    0.5
Name: proportion, dtype: float64


In [5]:
print(f"After SMOTE, training data shape: {X_train_smote.shape}, Churn distribution: {y_train_smote.value_counts(normalize=True)}")


After SMOTE, training data shape: (8260, 26), Churn distribution: Churn
0    0.5
1    0.5
Name: proportion, dtype: float64


In [6]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# Initialize CatBoostClassifier with some strong baseline parameters
model = CatBoostClassifier(
    iterations=1000,          # Number of boosting iterations
    learning_rate=0.05,       # Moderate learning rate
    depth=6,                  # Tree depth
    eval_metric='AUC',        # Use AUC as evaluation metric
    random_seed=42,
    early_stopping_rounds=50, # Stop if no improvement after 50 rounds
    verbose=100               # Print training progress every 100 iterations
)

# Train the model on SMOTE balanced training data
model.fit(
    X_train_smote, y_train_smote,
    eval_set=(X_test, y_test),  # Validate on original test data (unbalanced)
    use_best_model=True         # Use the best iteration based on validation score
)

# Predict probabilities and classes on the test set
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.4f}")

# Optional: Confusion Matrix to see true positives/negatives
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


0:	test: 0.8108981	best: 0.8108981 (0)	total: 57.2ms	remaining: 57.1s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8304805069
bestIteration = 33

Shrink model to first 34 iterations.
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.74      0.81      1033
           1       0.51      0.76      0.61       374

    accuracy                           0.75      1407
   macro avg       0.71      0.75      0.71      1407
weighted avg       0.79      0.75      0.76      1407

ROC AUC Score: 0.8305
Confusion Matrix:
[[764 269]
 [ 89 285]]


In [7]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Assuming df, X, y, X_train, X_test, y_train, y_test are already defined

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define a few hyperparameter combinations to try
param_grid = [
    {'depth': 4, 'learning_rate': 0.1, 'iterations': 100},
    {'depth': 6, 'learning_rate': 0.05, 'iterations': 200},
    {'depth': 8, 'learning_rate': 0.03, 'iterations': 300},
]

best_auc = 0
best_model = None

for params in param_grid:
    print(f"Training with params: {params}")
    model = CatBoostClassifier(
        **params,
        random_seed=42,
        early_stopping_rounds=50,
        verbose=50,
        loss_function='Logloss',
        eval_metric='AUC'
    )
    model.fit(X_train_smote, y_train_smote, eval_set=(X_test, y_test), use_best_model=True)
    
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1]
    
    auc = roc_auc_score(y_test, y_proba)
    print(f"AUC: {auc:.4f}")
    
    if auc > best_auc:
        best_auc = auc
        best_model = model
    
    print(classification_report(y_test, y_pred))

print(f"Best AUC after tuning: {best_auc:.4f}")


Training with params: {'depth': 4, 'learning_rate': 0.1, 'iterations': 100}
0:	test: 0.8069418	best: 0.8069418 (0)	total: 3.63ms	remaining: 360ms
50:	test: 0.8293792	best: 0.8307290 (35)	total: 160ms	remaining: 154ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8307289914
bestIteration = 35

Shrink model to first 36 iterations.
AUC: 0.8307
              precision    recall  f1-score   support

           0       0.90      0.73      0.81      1033
           1       0.52      0.78      0.62       374

    accuracy                           0.75      1407
   macro avg       0.71      0.76      0.72      1407
weighted avg       0.80      0.75      0.76      1407

Training with params: {'depth': 6, 'learning_rate': 0.05, 'iterations': 200}
0:	test: 0.8108981	best: 0.8108981 (0)	total: 5.13ms	remaining: 1.02s
50:	test: 0.8299641	best: 0.8304805 (33)	total: 213ms	remaining: 622ms
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8304805069
bestIterati

In [8]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Use your best_model from tuning step
y_proba = best_model.predict_proba(X_test)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# Find threshold with best precision-recall balance (you can pick your own preference)
for p, r, t in zip(precisions, recalls, np.append(thresholds, 1)):
    print(f"Threshold: {t:.2f}, Precision: {p:.2f}, Recall: {r:.2f}")

# Pick a threshold for higher precision (e.g., 0.6 or 0.7) and evaluate

chosen_threshold = 0.6
y_pred_thresh = (y_proba >= chosen_threshold).astype(int)

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
print("Classification Report at threshold", chosen_threshold)
print(classification_report(y_test, y_pred_thresh))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_thresh))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


Threshold: 0.02, Precision: 0.27, Recall: 1.00
Threshold: 0.02, Precision: 0.27, Recall: 1.00
Threshold: 0.02, Precision: 0.27, Recall: 1.00
Threshold: 0.03, Precision: 0.27, Recall: 1.00
Threshold: 0.03, Precision: 0.27, Recall: 1.00
Threshold: 0.03, Precision: 0.27, Recall: 1.00
Threshold: 0.03, Precision: 0.27, Recall: 1.00
Threshold: 0.03, Precision: 0.27, Recall: 1.00
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.03, Precision: 0.27, Recall: 0.99
Threshold: 0.

In [9]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Calculate class weights manually or use sklearn utility
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))

print("Class weights:", class_weights_dict)

# Train CatBoost with class weights
model = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.05,
    random_seed=42,
    early_stopping_rounds=50,
    class_weights=class_weights_dict,
    verbose=100
)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# Predict and evaluate
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


Class weights: {0: 0.6809927360774818, 1: 1.8812709030100334}
0:	learn: 0.6701641	test: 0.6714983	best: 0.6714983 (0)	total: 5.78ms	remaining: 5.78s
100:	learn: 0.4316393	test: 0.4952457	best: 0.4927690 (61)	total: 333ms	remaining: 2.97s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4927689809
bestIteration = 61

Shrink model to first 62 iterations.
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      1033
           1       0.49      0.81      0.61       374

    accuracy                           0.73      1407
   macro avg       0.70      0.75      0.70      1407
weighted avg       0.80      0.73      0.74      1407

ROC AUC Score: 0.8393327673408535


In [10]:
import numpy as np

best_acc = 0
best_thresh = 0.5
for thresh in np.arange(0.3, 0.8, 0.05):
    preds = (y_proba >= thresh).astype(int)
    acc = (preds == y_test).mean()
    print(f"Threshold: {thresh:.2f}, Accuracy: {acc:.4f}")
    if acc > best_acc:
        best_acc = acc
        best_thresh = thresh

print(f"Best threshold: {best_thresh}, Best accuracy: {best_acc}")


Threshold: 0.30, Accuracy: 0.6482
Threshold: 0.35, Accuracy: 0.6759
Threshold: 0.40, Accuracy: 0.6937
Threshold: 0.45, Accuracy: 0.7107
Threshold: 0.50, Accuracy: 0.7278
Threshold: 0.55, Accuracy: 0.7569
Threshold: 0.60, Accuracy: 0.7740
Threshold: 0.65, Accuracy: 0.7875
Threshold: 0.70, Accuracy: 0.7946
Threshold: 0.75, Accuracy: 0.8017
Best threshold: 0.7499999999999999, Best accuracy: 0.8017057569296375


In [11]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
import numpy as np

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []
accuracy_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    smote = SMOTE(random_state=42)
    X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

    model = CatBoostClassifier(
        iterations=2000,
        depth=6,
        learning_rate=0.03,
        early_stopping_rounds=100,
        random_seed=42,
        verbose=100,
        class_weights=class_weights_dict
    )

    model.fit(X_train_sm, y_train_sm, eval_set=(X_val, y_val), use_best_model=True)

    y_val_proba = model.predict_proba(X_val)[:, 1]
    
    # Threshold tuning on validation fold (optional: optimize for accuracy)
    best_acc = 0
    best_thresh = 0.5
    for thresh in np.arange(0.3, 0.8, 0.01):
        y_pred_thresh = (y_val_proba >= thresh).astype(int)
        acc = (y_pred_thresh == y_val).mean()
        if acc > best_acc:
            best_acc = acc
            best_thresh = thresh
    
    print(f"Fold {fold+1} best accuracy: {best_acc:.4f} at threshold {best_thresh:.2f}")

    auc_scores.append(roc_auc_score(y_val, y_val_proba))
    accuracy_scores.append(best_acc)

print(f"Mean ROC AUC: {np.mean(auc_scores):.4f}")
print(f"Mean Accuracy: {np.mean(accuracy_scores):.4f}")


0:	learn: 0.6740550	test: 0.6806303	best: 0.6806303 (0)	total: 5.62ms	remaining: 11.2s
100:	learn: 0.3523330	test: 0.5307620	best: 0.5307620 (100)	total: 425ms	remaining: 7.99s
200:	learn: 0.3232469	test: 0.5280801	best: 0.5277590 (197)	total: 849ms	remaining: 7.6s
300:	learn: 0.3030292	test: 0.5253331	best: 0.5252942 (299)	total: 1.27s	remaining: 7.2s
400:	learn: 0.2866385	test: 0.5226434	best: 0.5221248 (382)	total: 1.7s	remaining: 6.76s
500:	learn: 0.2722479	test: 0.5212268	best: 0.5210792 (496)	total: 2.12s	remaining: 6.33s
600:	learn: 0.2599712	test: 0.5204777	best: 0.5203882 (594)	total: 2.54s	remaining: 5.9s
700:	learn: 0.2496122	test: 0.5205616	best: 0.5196581 (641)	total: 2.96s	remaining: 5.49s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.5196581209
bestIteration = 641

Shrink model to first 642 iterations.
Fold 1 best accuracy: 0.7960 at threshold 0.79
0:	learn: 0.6728463	test: 0.6798077	best: 0.6798077 (0)	total: 4.67ms	remaining: 9.33s
100:	learn: 0.

In [14]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

# Assuming X_train and X_test are pandas DataFrames
categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
print("Categorical columns detected:", categorical_columns)
cat_features = [X_train.columns.get_loc(col) for col in categorical_columns]


# Calculate class weights to handle imbalance
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))
print("Class weights:", class_weights_dict)

# Initialize CatBoost model with class weights and categorical features
model = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.05,
    random_seed=42,
    early_stopping_rounds=50,
    class_weights=class_weights_dict,
    cat_features=cat_features,
    verbose=100
)

# Train model
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# Predict and evaluate
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))

# Find best threshold for accuracy
best_acc = 0
best_thresh = 0.5
for thresh in np.arange(0.3, 0.8, 0.05):
    preds = (y_proba >= thresh).astype(int)
    acc = (preds == y_test).mean()
    print(f"Threshold: {thresh:.2f}, Accuracy: {acc:.4f}")
    if acc > best_acc:
        best_acc = acc
        best_thresh = thresh

print(f"Best threshold: {best_thresh}, Best accuracy: {best_acc}")


Categorical columns detected: []
Class weights: {0: 0.6809489227789881, 1: 1.8816053511705686}
0:	learn: 0.6718266	test: 0.6726001	best: 0.6726001 (0)	total: 4.52ms	remaining: 4.51s
100:	learn: 0.4308952	test: 0.4516201	best: 0.4516201 (100)	total: 341ms	remaining: 3.04s
200:	learn: 0.3922528	test: 0.4194151	best: 0.4194151 (200)	total: 673ms	remaining: 2.68s
300:	learn: 0.3594461	test: 0.3941970	best: 0.3941970 (300)	total: 1s	remaining: 2.33s
400:	learn: 0.3323666	test: 0.3718854	best: 0.3718854 (400)	total: 1.34s	remaining: 2s
500:	learn: 0.3077323	test: 0.3519274	best: 0.3519274 (500)	total: 1.67s	remaining: 1.66s
600:	learn: 0.2870702	test: 0.3358558	best: 0.3358558 (600)	total: 2s	remaining: 1.33s
700:	learn: 0.2690534	test: 0.3208904	best: 0.3208904 (700)	total: 2.33s	remaining: 996ms
800:	learn: 0.2524343	test: 0.3077170	best: 0.3077170 (800)	total: 2.67s	remaining: 663ms
900:	learn: 0.2368042	test: 0.2956269	best: 0.2956269 (900)	total: 3.05s	remaining: 335ms
999:	learn: 0.223

In [15]:
import optuna
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming X_train, y_train are your training data as pandas DataFrame/Series

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 1500),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5),
        'verbose': 0,
        'task_type': 'CPU'  # change to 'GPU' if you have GPU support
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        train_pool = Pool(X_tr, y_tr, cat_features=cat_features)
        val_pool = Pool(X_val, y_val, cat_features=cat_features)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, use_best_model=True)

        preds = model.predict(X_val)
        acc = accuracy_score(y_val, preds)
        accuracies.append(acc)

    return np.mean(accuracies)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print(f'Best trial accuracy: {study.best_value}')
print('Best hyperparameters:', study.best_params)


[I 2025-06-03 11:05:48,850] A new study created in memory with name: no-name-86cb16b6-c994-4dd8-9f7b-4db802e5fb25
[I 2025-06-03 11:05:53,687] Trial 0 finished with value: 0.7817289915137162 and parameters: {'iterations': 772, 'depth': 9, 'learning_rate': 0.04383430566677442, 'l2_leaf_reg': 2.035100397551941, 'border_count': 46, 'random_strength': 2.5628985627947896e-05, 'bagging_temperature': 0.11996958739620422, 'scale_pos_weight': 1.9639792891171082}. Best is trial 0 with value: 0.7817289915137162.
[I 2025-06-03 11:06:02,985] Trial 1 finished with value: 0.7827962897177818 and parameters: {'iterations': 1416, 'depth': 4, 'learning_rate': 0.012397072146621687, 'l2_leaf_reg': 9.521374031520919, 'border_count': 87, 'random_strength': 2.478978990240549, 'bagging_temperature': 0.7926572146488085, 'scale_pos_weight': 1.8505728643244965}. Best is trial 1 with value: 0.7827962897177818.
[I 2025-06-03 11:06:05,916] Trial 2 finished with value: 0.7531138740872312 and parameters: {'iterations':

Best trial accuracy: 0.805902348529702
Best hyperparameters: {'iterations': 677, 'depth': 7, 'learning_rate': 0.045060072209470614, 'l2_leaf_reg': 7.389161452084198, 'border_count': 237, 'random_strength': 1.1847226917286827e-05, 'bagging_temperature': 0.2913515511468162, 'scale_pos_weight': 1.0209249479944078}


In [16]:
from sklearn.metrics import f1_score

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 3000),
        'depth': trial.suggest_int('depth', 6, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float('random_strength', 1e-7, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5),
        'verbose': 0,
        'task_type': 'CPU'
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        train_pool = Pool(X_tr, y_tr, cat_features=cat_features)
        val_pool = Pool(X_val, y_val, cat_features=cat_features)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100, use_best_model=True)

        preds = model.predict(X_val)
        f1 = f1_score(y_val, preds)
        f1_scores.append(f1)

    return np.mean(f1_scores)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=40)

print(f'Best trial F1: {study.best_value}')
print('Best hyperparameters:', study.best_params)


[I 2025-06-03 11:09:20,659] A new study created in memory with name: no-name-e3e94a3a-203c-489b-bc9c-3c4e26263e98
[I 2025-06-03 11:09:25,180] Trial 0 finished with value: 0.6296143526671731 and parameters: {'iterations': 1033, 'depth': 8, 'learning_rate': 0.05806867622357314, 'l2_leaf_reg': 3.5326129079349218, 'border_count': 205, 'random_strength': 0.00045528874039951046, 'bagging_temperature': 0.5655872245188747, 'scale_pos_weight': 3.2740686355148902}. Best is trial 0 with value: 0.6296143526671731.
[I 2025-06-03 11:09:28,627] Trial 1 finished with value: 0.634482933882843 and parameters: {'iterations': 1285, 'depth': 7, 'learning_rate': 0.048666493456544495, 'l2_leaf_reg': 1.0446548841854737, 'border_count': 239, 'random_strength': 8.833426276597396e-07, 'bagging_temperature': 0.9530472206136087, 'scale_pos_weight': 2.5066635573783227}. Best is trial 1 with value: 0.634482933882843.
[I 2025-06-03 11:09:32,085] Trial 2 finished with value: 0.6244570487161744 and parameters: {'iterat

Best trial F1: 0.6393589033436633
Best hyperparameters: {'iterations': 2550, 'depth': 9, 'learning_rate': 0.08681837153079544, 'l2_leaf_reg': 4.463582363041937, 'border_count': 41, 'random_strength': 6.06827524456507, 'bagging_temperature': 0.7579951420146809, 'scale_pos_weight': 2.1745072022925083}


In [17]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 2000),
        'depth': trial.suggest_int('depth', 6, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 5),
        'border_count': trial.suggest_int('border_count', 32, 128),
        'random_strength': trial.suggest_float('random_strength', 1e-7, 1, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 3),
        'verbose': 0,
        'task_type': 'CPU'
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        train_pool = Pool(X_tr, y_tr, cat_features=cat_features)
        val_pool = Pool(X_val, y_val, cat_features=cat_features)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=100, use_best_model=True)

        preds = model.predict(X_val)
        acc = (preds == y_val).mean()
        accuracies.append(acc)

    return np.mean(accuracies)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print(f'Best trial accuracy: {study.best_value}')
print('Best hyperparameters:', study.best_params)


[I 2025-06-03 11:17:04,666] A new study created in memory with name: no-name-a903f0c8-5656-4d98-b495-6b0f473f6dc5
[I 2025-06-03 11:17:08,634] Trial 0 finished with value: 0.7683992895204262 and parameters: {'iterations': 1710, 'depth': 7, 'learning_rate': 0.03400171723917107, 'l2_leaf_reg': 3.036869644938348, 'border_count': 119, 'random_strength': 0.32042898931348296, 'bagging_temperature': 0.16101012746477572, 'scale_pos_weight': 2.3410194327448197}. Best is trial 0 with value: 0.7683992895204262.
[I 2025-06-03 11:17:18,332] Trial 1 finished with value: 0.7947040852575489 and parameters: {'iterations': 1695, 'depth': 10, 'learning_rate': 0.06940317379494983, 'l2_leaf_reg': 1.9116193735026434, 'border_count': 71, 'random_strength': 0.6174485786021636, 'bagging_temperature': 0.2181694238616092, 'scale_pos_weight': 1.4770389793924223}. Best is trial 1 with value: 0.7947040852575489.
[I 2025-06-03 11:17:28,490] Trial 2 finished with value: 0.7859958160647326 and parameters: {'iterations'

Best trial accuracy: 0.8009259917110716
Best hyperparameters: {'iterations': 1428, 'depth': 9, 'learning_rate': 0.037143342225491295, 'l2_leaf_reg': 1.6957887579018691, 'border_count': 99, 'random_strength': 0.028197032022741972, 'bagging_temperature': 0.0364640867256832, 'scale_pos_weight': 1.0619461367712402}


In [18]:
# Assuming cat_features is correct and defined properly

model = CatBoostClassifier(
    iterations=3000,
    depth=9,
    learning_rate=0.037,
    l2_leaf_reg=1.7,
    border_count=99,
    random_strength=0.028,
    bagging_temperature=0.036,
    scale_pos_weight=1.06,
    early_stopping_rounds=200,
    verbose=100,
)

model.fit(
    Pool(X_train, y_train, cat_features=cat_features),
    eval_set=Pool(X_test, y_test, cat_features=cat_features),
    use_best_model=True,
)

# Predict probabilities
probs = model.predict_proba(X_test)[:, 1]

# Threshold tuning
best_acc = 0
best_thresh = 0.5
for thresh in np.arange(0.3, 0.8, 0.01):
    preds = (probs >= thresh).astype(int)
    acc = (preds == y_test).mean()
    if acc > best_acc:
        best_acc = acc
        best_thresh = thresh

print(f"Best threshold: {best_thresh}, Best accuracy: {best_acc:.4f}")


0:	learn: 0.6656348	test: 0.6659893	best: 0.6659893 (0)	total: 10.4ms	remaining: 31.3s
100:	learn: 0.3165868	test: 0.3445657	best: 0.3445657 (100)	total: 875ms	remaining: 25.1s
200:	learn: 0.2600154	test: 0.2984984	best: 0.2984984 (200)	total: 1.74s	remaining: 24.2s
300:	learn: 0.2189641	test: 0.2659815	best: 0.2659815 (300)	total: 2.6s	remaining: 23.4s
400:	learn: 0.1880207	test: 0.2413115	best: 0.2413115 (400)	total: 3.47s	remaining: 22.5s
500:	learn: 0.1643782	test: 0.2240570	best: 0.2240570 (500)	total: 4.34s	remaining: 21.6s
600:	learn: 0.1440364	test: 0.2082914	best: 0.2082914 (600)	total: 5.2s	remaining: 20.8s
700:	learn: 0.1277506	test: 0.1955338	best: 0.1955338 (700)	total: 6.08s	remaining: 19.9s
800:	learn: 0.1133948	test: 0.1848721	best: 0.1848721 (800)	total: 6.98s	remaining: 19.2s
900:	learn: 0.1024955	test: 0.1768697	best: 0.1768697 (900)	total: 7.85s	remaining: 18.3s
1000:	learn: 0.0922786	test: 0.1691632	best: 0.1691632 (1000)	total: 8.72s	remaining: 17.4s
1100:	learn: 

In [22]:
import joblib

# Save CatBoost model
model.save_model("catboost_churn_model.cbm")

# Save best threshold
joblib.dump(best_thresh, "best_threshold.pkl")


['best_threshold.pkl']