# 02 — Modeling, Imbalance Handling, and Evaluation

In [11]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline

# Make the project's `src` package importable. ".." is resolved against the
# current working directory, so this goes up one folder from notebooks/ to the
# project root only when the kernel was started inside notebooks/.
sys.path.append(os.path.abspath(".."))

from src.preprocess import load_raw, clean_basic, split_Xy, build_preprocessor, make_train_test
from src.imbalance_handler import smote_balance, get_class_weights, describe_distribution
from src.models import get_random_forest, get_keras_classifier
from src.evaluate import evaluate_binary, plot_confusion_matrix, plot_roc

# Raw Telco churn export; the path is relative to the current working directory.
CSV_PATH = 'data/TelcoCustomerChurn.csv'
assert os.path.exists(CSV_PATH), "Please place the Telco CSV in data/ first."

# Load -> clean -> separate the features from the target.
raw = load_raw(CSV_PATH)
df = clean_basic(raw)
X, y = split_Xy(df)

# Train/test split with test_size=0.2 and a fixed random_state so the split
# is repeatable (presumably forwarded to the underlying splitter).
X_train, X_test, y_train, y_test = make_train_test(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the class balance of each split.
for split_label, split_target in (('Train dist:', y_train), ('Test  dist:', y_test)):
    print(split_label, describe_distribution(split_target))

# Preprocessor built from the training features only.
preprocessor = build_preprocessor(X_train)

Train dist: {0: '4139 (73.46%)', 1: '1495 (26.54%)'}
Test  dist: {0: '1035 (73.46%)', 1: '374 (26.54%)'}


## Random Forest (Baseline)

In [12]:
# Baseline model: preprocessing and the random forest are chained in a single
# pipeline, so the test split only ever goes through predict/predict_proba and
# is never used for fitting.
# `Pipeline` is imported together with the other dependencies in the first cell.
rf = get_random_forest()
rf_pipe = Pipeline([("prep", preprocessor), ("model", rf)])
rf_pipe.fit(X_train, y_train)

# Hard labels plus the score of the positive class; with labels {0, 1}
# column 1 of predict_proba corresponds to class 1.
y_pred_rf = rf_pipe.predict(X_test)
y_proba_rf = rf_pipe.predict_proba(X_test)[:, 1]

metrics_rf = evaluate_binary(y_test, y_pred_rf, y_proba_rf)
metrics_rf

{'accuracy': 0.7842441447835344,
 'precision': 0.6223776223776224,
 'recall': 0.47593582887700536,
 'f1': 0.5393939393939394,
 'roc_auc': 0.8233589087809037}

In [13]:
# Make sure the output folders exist before anything is written into them.
for out_dir in ('results/confusion_matrices', 'results/metrics'):
    os.makedirs(out_dir, exist_ok=True)

# Random-forest diagnostics; the .png path is handed to each plotting helper
# (presumably the location the figure is saved to).
plot_confusion_matrix(
    y_test,
    y_pred_rf,
    'results/confusion_matrices/rf_confusion_nb.png',
    'RF Confusion (NB)',
)
plot_roc(
    y_test,
    y_proba_rf,
    'results/confusion_matrices/rf_roc_nb.png',
    'RF ROC (NB)',
)

# Show the RF metrics next to the figures.
metrics_rf

{'accuracy': 0.7842441447835344,
 'precision': 0.6223776223776224,
 'recall': 0.47593582887700536,
 'f1': 0.5393939393939394,
 'roc_auc': 0.8233589087809037}

## Neural Network (with SMOTE)

In [14]:
# Use a dedicated preprocessor for the neural-network branch. The shared
# `preprocessor` object is also a step inside `rf_pipe` (Pipeline does not copy
# its steps), so calling fit_transform on it here would silently refit part of
# the already-trained random-forest pipeline.
nn_preprocessor = build_preprocessor(X_train)
X_train_trans = nn_preprocessor.fit_transform(X_train)
X_test_trans = nn_preprocessor.transform(X_test)

# Rebalance the training data only; the test split keeps its original class
# distribution so the metrics below stay representative.
# NOTE(review): no seed is passed to smote_balance or the Keras model in this
# cell — confirm whether the helpers fix one, otherwise the numbers will vary
# between runs.
X_train_bal, y_train_bal = smote_balance(X_train_trans, y_train)

# The input width of the network is the number of transformed feature columns.
nn = get_keras_classifier(input_dim=X_train_bal.shape[1])
nn.fit(X_train_bal, y_train_bal, verbose=0)

# Hard labels plus the score in column 1 (taken as the positive class).
y_pred_nn = nn.predict(X_test_trans)
y_proba_nn = nn.predict_proba(X_test_trans)[:, 1]

metrics_nn = evaluate_binary(y_test, y_pred_nn, y_proba_nn)
metrics_nn

{'accuracy': 0.7473385379701917,
 'precision': 0.516728624535316,
 'recall': 0.7433155080213903,
 'f1': 0.6096491228070176,
 'roc_auc': 0.8287865870986075}

In [15]:
# Neural-network diagnostics, written next to the random-forest figures.
plot_confusion_matrix(
    y_test,
    y_pred_nn,
    'results/confusion_matrices/nn_confusion_nb.png',
    'NN Confusion (NB)',
)
plot_roc(
    y_test,
    y_proba_nn,
    'results/confusion_matrices/nn_roc_nb.png',
    'NN ROC (NB)',
)

# Show the NN metrics next to the figures.
metrics_nn

{'accuracy': 0.7473385379701917,
 'precision': 0.516728624535316,
 'recall': 0.7433155080213903,
 'f1': 0.6096491228070176,
 'roc_auc': 0.8287865870986075}