## Model_Training

#### Objective

* To train a baseline churn model
* To use logistic regression for interpretability
* To evaluate with business-appropriate metrics
* To produce artifacts that can be compared later

In [1]:
import pandas as pd
from pathlib import Path
from src.data_ingestion import load_engineered_data
from src.preprocessing import split_features_target, split_train_test, drop_unused_columns
from src.features import build_preprocessor
from src.modeling import (build_gradient_boosting, build_logisitc_regression, build_random_forest)
from src.evaluation import evaluate_classifier, compare_models
from src.utils import save_model_inputs
import joblib

In [2]:
# Configuration / feature groups
# Different feature types need different preprocessing techniques, so the
# model-input columns are grouped by type here.
NUMERICAL_FEATURES = [
    'age', 'tenure', 'balance', 'credit_score', 'products_number',
    'products_per_tenure', 'balance_per_product', 'churn_risk_score',
]
BINARY_FEATURES = [
    'credit_card', 'active_member', 'inactive_single_product',
    'zero_balance', 'high_balance', 'early_customer',
]
CATEGORICAL_FEATURES = ['age_group', 'credit_score_band']

# Raw string: a Windows path in a normal string literal contains sequences such
# as "\c", "\d" and "\e" that are not valid escapes. They only work by accident
# and raise a SyntaxWarning on Python 3.12+.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable project/data directory.
ENGINEERED_DATA_PATH = Path(r"C:\customerchurnprediction\data\engineered\engineeredbank_churn.csv")


In [10]:
# Load the engineered dataset from ENGINEERED_DATA_PATH and drop the
# 'country' and 'gender' columns in a single step.
df = drop_unused_columns(
    load_engineered_data(ENGINEERED_DATA_PATH),
    columns_to_drop=['country', 'gender'],
)


In [11]:
# Separate the "churn" target column from the remaining columns, then split
# both into train and test partitions.
# NOTE(review): test size, random seed and stratification are whatever
# split_train_test defaults to; they are not visible in this notebook.
X, y = split_features_target(df, "churn")
(X_train, X_test,
 y_train, y_test) = split_train_test(X, y)

In [12]:
# Build the preprocessing step from the numerical and categorical feature groups.
# NOTE(review): BINARY_FEATURES is defined in the configuration cell but is not
# passed here. Confirm how build_preprocessor treats columns it is not told
# about (passed through vs. dropped) before relying on those features.
preprocessor = build_preprocessor(numerical_features=NUMERICAL_FEATURES,
                                  categorical_features=CATEGORICAL_FEATURES)

In [13]:
# Initialise the candidate models.
# Each builder receives the same `preprocessor` object; the resulting dict maps
# a display label to the built model, in the order listed below.
model_builders = {
    "Logistic Regression": build_logisitc_regression,
    "Random Forest": build_random_forest,
    "Gradient Boosting": build_gradient_boosting,
}
models = {label: builder(preprocessor) for label, builder in model_builders.items()}

In [14]:
# Train every candidate on the training split and evaluate it on the held-out
# test split; `results` maps the model label to its evaluation output.
# NOTE(review): in the recorded run below, Logistic Regression scores a ROC AUC
# of about 0.51 and predicts almost every row as churn, which is worth
# investigating before treating it as a baseline.
results = {}

for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    evaluation = evaluate_classifier(estimator, X_test, y_test)
    results[model_name] = evaluation

# Combine the per-model results into one comparison table and render it.
comparson = compare_models(results)
display(comparson)

Unnamed: 0,roc_auc,confusion_matrix,classification_report
Gradient Boosting,0.851512,"[[1541, 52], [234, 173]]","{'0': {'precision': 0.8681690140845071, 'recal..."
Random Forest,0.838908,"[[1247, 346], [110, 297]]","{'0': {'precision': 0.9189388356669123, 'recal..."
Logistic Regression,0.506269,"[[31, 1562], [1, 406]]","{'0': {'precision': 0.96875, 'recall': 0.01946..."


### Save the best model artifact

In [15]:
# Persist the Gradient Boosting model, which has the highest ROC AUC in the
# comparison table above.
# NOTE(review): the choice is hard-coded by label; re-check it whenever the
# comparison results change.
best_model = models["Gradient Boosting"]

# Raw string: "\c", "\d", "\m" and "\g" are not valid escape sequences, so the
# non-raw literal only worked by accident and raises a SyntaxWarning on
# Python 3.12+.
MODEL_PATH = r'C:\customerchurnprediction\data\models\gb_churn_model.joblib'

# Create the target directory if it does not exist yet, so the dump does not
# fail on a fresh checkout.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Last expression: shows the list of written file names as the cell output.
joblib.dump(best_model, MODEL_PATH)



['C:\\customerchurnprediction\\data\\models\\gb_churn_model.joblib']