#### Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
import xgboost as xgb
import pickle
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [3]:
X_train = train.drop("Exited", axis=1)
y_train = train["Exited"]

X_test = test.drop("Exited", axis=1)
y_test = test["Exited"]

In [4]:
def engg_features(X):
    X["BalanceSalaryRatio"] = X["Balance"] / X["EstimatedSalary"]
    X["TenureByAge"] = X["Tenure"] / X["Age"]
    X["CreditScoreGivenAge"] = X["CreditScore"] / X["Age"]
    X["HasBalance"] = np.where(X["Balance"] > 0, 1, 0)
    X["ActiveByAge"] = X["IsActiveMember"] * X["Age"]
    X['AgeCategory'] = pd.cut(X['Age'], bins=[0, 35, 55, np.inf], labels=['Young', 'MiddleAge', 'Senior'])
    return X


In [5]:
preprocessing_pipeline = pickle.load(open("../models/pipeline.pkl", "rb"))

In [6]:
models = {
    "logistic regression": LogisticRegression(),
    "random forest": RandomForestClassifier(),
    "gradient boosting": HistGradientBoostingClassifier(),
    "svc": SVC(),
    "xgboost": xgb.XGBClassifier()
}

results = {}

for name, model in models.items():
    model_pipeline = Pipeline([
        ("preprocessing_pipeline", preprocessing_pipeline),
        ("model", model)
    ])

    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)

    f1 = f1_score(y_test, y_pred)
    results[name] = f1

results
    

{'logistic regression': 0.4201388888888889,
 'random forest': 0.5749235474006116,
 'gradient boosting': 0.5973254086181278,
 'svc': 0.5418060200668896,
 'xgboost': 0.5611510791366906}

#### Hyperparameter Tuning Using Optuna

In [7]:
def objective(trial):
    classifier_name = trial.suggest_categorical("classifier", ["LogisticRegression", "RandomForest", "GradientBoosting", "SVC", "XGBoost"])

    # xgb_weight = np.sum(y_train == 0) / np.sum(y_train == 1)


    if classifier_name == "LogisticRegression":
        lr_params = {
            "C": trial.suggest_float("C", 0.0001, 10, log=True),
            "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
            "class_weight": "balanced",
            "max_iter": 1000,
            "random_state": 42
        }

        model = LogisticRegression(**lr_params)
    
    elif classifier_name == "RandomForest":
        bootstrap = trial.suggest_categorical("bootstrap", [True, False])

        rf_params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 3, 20),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "bootstrap": bootstrap,
            "class_weight": "balanced_subsample" if bootstrap else "balanced", 
            "random_state": 42
        }

        model = RandomForestClassifier(**rf_params)

    elif classifier_name == "GradientBoosting":
        # gb_params = {
        #     "n_estimators": trial.suggest_int("n_estimators", 50, 100),
        #     "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        #     "max_depth": trial.suggest_int("max_depth", 3, 20),
        #     "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        #     "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        #     "random_state": 42
        # }

        # model = GradientBoostingClassifier(**gb_params)

        hgb_params = {
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "max_iter": trial.suggest_int("max_iter", 50, 300),
            "max_depth": trial.suggest_int("max_depth", 3, 20),
            "l2_regularization": trial.suggest_float("l2_regularization", 0, 10),
            "class_weight": "balanced",
            "random_state": 42
        }

        model = HistGradientBoostingClassifier(**hgb_params)
    
    elif classifier_name == "SVC":        
        svc_params = {
            "C": trial.suggest_float("C", 0.1, 100, log=True),
            "kernel": trial.suggest_categorical("kernel", ["linear", "rbf", "poly", "sigmoid"]), 
            "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
            "class_weight": "balanced",
            "random_state": 42
        }
        
        model = SVC(**svc_params)

    elif classifier_name == "XGBoost":
        xgb_params = {
        'learning_rate': trial.suggest_float("xgb_learning_rate", 0.01, 0.3),
        'n_estimators': trial.suggest_int("xgb_n_estimators", 50, 300),
        'max_depth': trial.suggest_int("xgb_max_depth", 3, 10),
        'subsample': trial.suggest_float("xgb_subsample", 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float("xgb_scale_pos_weight", 2.0, 6.0),
        'random_state': 42,
        'verbosity': 0
        }
        
        model = xgb.XGBClassifier(**xgb_params)

    
    pipeline = Pipeline([
        ("preprocessing", preprocessing_pipeline), 
        ("model", model)
    ])

    scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring="f1", n_jobs=-1)
    return scores.mean()

In [8]:
print("-" * 50)

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=100)

print(f"Best Accuracy: {study.best_value:.4f}")
print(f"Best Params: {study.best_params}")

[I 2025-12-12 11:21:33,015] A new study created in memory with name: no-name-b1c24853-f515-42b3-a871-f8cdd01fdd31


--------------------------------------------------


[I 2025-12-12 11:21:42,524] Trial 0 finished with value: 0.5689144030536722 and parameters: {'classifier': 'SVC', 'C': 19.68925278990451, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 0 with value: 0.5689144030536722.
[I 2025-12-12 11:21:47,173] Trial 1 finished with value: 0.5929383069553614 and parameters: {'classifier': 'GradientBoosting', 'learning_rate': 0.18860696108341524, 'max_iter': 288, 'max_depth': 12, 'l2_regularization': 6.977724982013285}. Best is trial 1 with value: 0.5929383069553614.
[I 2025-12-12 11:21:56,553] Trial 2 finished with value: 0.5118743840449053 and parameters: {'classifier': 'SVC', 'C': 11.506960615460955, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 1 with value: 0.5929383069553614.
[I 2025-12-12 11:21:58,405] Trial 3 finished with value: 0.5721075087572624 and parameters: {'classifier': 'SVC', 'C': 4.158681264188398, 'kernel': 'poly', 'gamma': 'auto'}. Best is trial 1 with value: 0.5929383069553614.
[I 2025-12-12 11:21:58,910] Trial 4 finishe

Best Accuracy: 0.6298
Best Params: {'classifier': 'XGBoost', 'xgb_learning_rate': 0.028076908785974193, 'xgb_n_estimators': 145, 'xgb_max_depth': 5, 'xgb_subsample': 0.7531012252732929, 'xgb_scale_pos_weight': 2.143367559901703}


In [9]:
study.trials_dataframe()["params_classifier"].value_counts()

params_classifier
XGBoost               69
SVC                    9
GradientBoosting       8
RandomForest           7
LogisticRegression     7
Name: count, dtype: int64

In [10]:
study.trials_dataframe().groupby("params_classifier")["value"].mean().sort_values(ascending=False)

params_classifier
XGBoost               0.614396
GradientBoosting      0.604188
RandomForest          0.589903
SVC                   0.512140
LogisticRegression    0.490684
Name: value, dtype: float64

In [11]:
if study.best_params["classifier"] == "LogisticRegression":
    model = LogisticRegression(**dict(list(study.best_params.items())[1:]))
elif study.best_params["classifier"] == "RandomForest":
    model = RandomForestClassifier(**dict(list(study.best_params.items())[1:]))
elif study.best_params["classifier"] == "GradientBoosting":
    model = HistGradientBoostingClassifier(**dict(list(study.best_params.items())[1:]))
elif study.best_params["classifier"] == "SVC":
    model = SVC(**dict(list(study.best_params.items())[1:]))
elif study.best_params["classifier"] == "XGBoost": 
    best = study.best_params

    xgb_params = {
        'learning_rate':    best["xgb_learning_rate"],
        'n_estimators':     best["xgb_n_estimators"],
        'max_depth':        best["xgb_max_depth"],
        'subsample':        best["xgb_subsample"],
        'scale_pos_weight': best["xgb_scale_pos_weight"],
        'random_state':     42,
        'verbosity':        0
    }

    model = xgb.XGBClassifier(**xgb_params)


pipeline = Pipeline([
        ("preprocessing", preprocessing_pipeline), 
        ("model", model)
    ])

pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('feature_engg', ...), ('preprocessor', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function eng...001A564DED6C0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('ohe', ...), ('ord', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<ufunc 'log1p'>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
# y_prob = pipeline.predict_proba(X_test)[:,1]

# threshold = 0.35
# y_pred = (y_prob >= threshold).astype(int)

In [29]:
y_pred = pipeline.predict(X_test)

In [30]:
print(f1_score(y_test, y_pred))

0.6325


In [31]:
print(confusion_matrix(y_test, y_pred))

[[1453  140]
 [ 154  253]]


In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.91      0.91      1593
           1       0.64      0.62      0.63       407

    accuracy                           0.85      2000
   macro avg       0.77      0.77      0.77      2000
weighted avg       0.85      0.85      0.85      2000



In [33]:
pickle.dump(pipeline, open("../models/final_model_pipeline.pkl","wb"))