In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, auc, roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split
# Star import kept last so its position relative to the explicit imports is unchanged.
from pycaret.classification import *

In [None]:
# Load the Telco customer-churn CSV; the first CSV column is used as the index.
# NOTE(review): absolute, machine-specific path — consider a configurable data
# directory so the notebook runs elsewhere.
# NOTE(review): if the first column is customerID it becomes the index here,
# while a later setup() cell lists "customerID" in ignore_features — verify.
csv_path = "F:/Data/datas/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path, index_col=0)

**Fix Data Error**
- Rows with a tenure of 0 have a blank value in the `TotalCharges` column, which causes pandas to infer that column as the object data type.
- We replace those blank strings with the float `0.0`.

In [None]:
# Repair the TotalCharges column so it can be used as a numeric feature.
# Rows with zero tenure get an explicit 0.0 before the conversion.
zero_tenure_mask = df['tenure'].eq(0)
df.loc[zero_tenure_mask, 'TotalCharges'] = 0.0

# Anything that still cannot be parsed as a number becomes NaN (errors='coerce').
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Report how many values are still missing after the conversion.
remaining_missing = df['TotalCharges'].isna().sum()
print("Missing Values:", remaining_missing)

### Model Setup

- We hold out part of the data as unseen test data so that the model can be evaluated without data leakage (PyCaret also performs its own train-test split automatically, which guards against leakage during training).
- We pass the original data frame to PyCaret because it handles all the data-preprocessing steps itself.

In [None]:
# Keep 10% of the rows aside as a hold-out set (`data_unseen`); the remaining
# 90% (`data`) is what the PyCaret experiment is set up on. The seed is fixed
# so the split is reproducible.
data, data_unseen = train_test_split(
    df,
    test_size=0.1,
    random_state=123,
)

In [None]:
# Seed for the PyCaret session. It was referenced below without being defined
# anywhere in the notebook; 123 matches the seed used by the hold-out split
# and by the other setup() cell.
RANDOM_SEED = 123

# First experiment configuration: heavy preprocessing (normalisation, feature
# selection, outlier removal, multicollinearity removal, imbalance fix,
# transformation, PCA, binning of the two charge columns) with logging enabled.
# NOTE(review): `silent` and `ignore_low_variance` look like PyCaret 2.x
# arguments, while later cells use 3.x-style config keys such as
# 'y_train_transformed' — confirm the installed version accepts them.
# NOTE(review): the next cell calls setup() again with different options, which
# presumably replaces this experiment — confirm which of the two should be kept.
exp01 = setup(
    data=data,
    target="Churn",
    session_id=RANDOM_SEED,
    ignore_features=["customerID"],
    numeric_features=["SeniorCitizen"],
    normalize=True,
    feature_selection=True,
    remove_outliers=True,
    remove_multicollinearity=True,
    fix_imbalance=True,
    transformation=True,
    ignore_low_variance=True,
    pca=True,
    bin_numeric_features=["MonthlyCharges", "TotalCharges"],
    silent=True,
    experiment_name="customer-churn-prediction",
    log_experiment=True,
)

In [None]:
# Experiment configuration used by the rest of the notebook
# (`classification_setup` is referenced when transforming the hold-out data).
# Options are collected in one mapping and unpacked into setup().
setup_options = dict(
    session_id=123,
    train_size=0.7,
    data_split_stratify=True,
    # Outlier handling
    remove_outliers=True,
    outliers_method='iforest',
    outliers_threshold=0.05,
    # Scaling / transformation
    # NOTE(review): normalize is False, so normalize_method is presumably unused — confirm.
    normalize=False,
    normalize_method='zscore',
    transformation=True,
    transformation_method='yeo-johnson',
    # Class imbalance
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # Cross-validation
    fold_strategy='stratifiedkfold',
    fold=5,
)

classification_setup = setup(data=data, target='Churn', **setup_options)

In [None]:
# Push the hold-out frame through the preprocessing pipeline of the experiment.
# NOTE(review): `data_unseen` is rebound to its own transformed version, so
# re-running this cell feeds already-transformed data into the pipeline again.
fitted_pipeline = classification_setup.pipeline
data_unseen = fitted_pipeline.transform(data_unseen)

### Get the best model

In [None]:
# Class counts of the training target after the preprocessing pipeline.
y_train_after_pipeline = get_config('y_train_transformed')
y_train_after_pipeline.value_counts()

In [None]:
# Class counts of the test target after the preprocessing pipeline.
y_test_after_pipeline = get_config('y_test_transformed')
y_test_after_pipeline.value_counts()

In [None]:
# Compare the candidate classifiers and keep the top-ranked one.
# budget_time=1 caps how long the comparison may run (PyCaret documents this
# value in minutes).
best_model = compare_models(
    budget_time=1,
)

- We chose the AdaBoost classifier as our general model because it achieved the highest AUC and F1 scores.
- We selected Naive Bayes for optimizing recall because it combines a high recall with a good AUC score.

### Create a Model

In [None]:
# Train a Linear Discriminant Analysis model ('lda' is its PyCaret model ID).
lda_model = create_model(estimator='lda')

In [None]:
# Hyperparameter search for the LDA model, optimising AUC.
lda_model_tuned = tune_model(
    estimator=lda_model,
    optimize='AUC',
)

In [None]:
# Train the AdaBoost classifier ('ada' is its PyCaret model ID).
ada_model = create_model(estimator='ada')

In [None]:
# Train the Naive Bayes classifier ('nb' is its PyCaret model ID).
nb_model = create_model(estimator='nb')

In [None]:
# Split the hold-out frame into features and label for the manual evaluation
# cells below. `data_unseen` is the only hold-out frame this notebook defines
# (created by train_test_split, then passed through the setup pipeline);
# no cell defines a frame called `test_data`.
# NOTE(review): assumes the pipeline-transformed frame still carries a 'Churn'
# column — confirm against the output of the transform cell.
test_data_X = data_unseen.drop('Churn', axis=1)
test_data_y = data_unseen['Churn']

In [None]:
# Accuracy and AUC of both models on PyCaret's internal test split.
# `ada_model` / `nb_model` are the estimators returned by create_model, so they
# are scored on the pipeline-transformed split: the raw 'X_test' / 'y_test'
# frames have not been through preprocessing.
# NOTE(review): relies on the same '*_transformed' config keys already used by
# the value_counts cells above.
X_test_transformed = get_config('X_test_transformed')
y_test_transformed = get_config('y_test_transformed')

y_pred_ada_proba = ada_model.predict_proba(X_test_transformed)
y_pred_nb_proba = nb_model.predict_proba(X_test_transformed)

print("ada model test accuracy:", ada_model.score(X_test_transformed, y_test_transformed))
print("nb model test accuracy:", nb_model.score(X_test_transformed, y_test_transformed))
# Column 1 of predict_proba is the probability of the positive class.
print("ada model auc score:", roc_auc_score(y_test_transformed, y_pred_ada_proba[:, 1]))
print("nb model auc score:", roc_auc_score(y_test_transformed, y_pred_nb_proba[:, 1]))

In [None]:
# Accuracy and AUC of both models on the hold-out features/labels
# (`test_data_X`, `test_data_y`).
y_pred_ada_proba = ada_model.predict_proba(test_data_X)
y_pred_nb_proba = nb_model.predict_proba(test_data_X)

# Accuracy first, for both models.
for accuracy_label, estimator in (
    ("ada model test accuracy:", ada_model),
    ("nb model test accuracy:", nb_model),
):
    print(accuracy_label, estimator.score(test_data_X, test_data_y))

# Then AUC, using column 1 of each predict_proba output.
for auc_label, class_proba in (
    ("ada model auc score:", y_pred_ada_proba),
    ("nb model auc score:", y_pred_nb_proba),
):
    print(auc_label, roc_auc_score(test_data_y, class_proba[:, 1]))

### Model Evaluation

In [None]:
# Open PyCaret's evaluation view for the AdaBoost model.
evaluate_model(estimator=ada_model)

In [None]:
# Open PyCaret's evaluation view for the Naive Bayes model.
evaluate_model(estimator=nb_model)

### Tuning Model

In [None]:
# Search space passed to tune_model for the AdaBoost classifier.
ada_n_estimators = [50, 100, 150, 200]
ada_learning_rates = [0.05, 0.1, 0.2, 0.5, 1]
custom_grid = {
    'n_estimators': ada_n_estimators,
    'learning_rate': ada_learning_rates,
}

In [None]:
# Tune AdaBoost over `custom_grid`, optimising AUC.
# NOTE(review): `ada_model` is rebound to the tuned estimator, so re-running
# this cell tunes the already-tuned model.
ada_model = tune_model(
    estimator=ada_model,
    optimize='auc',
    custom_grid=custom_grid,
)

In [None]:
# Search space passed to tune_model for the Naive Bayes classifier.
gnb_var_smoothing_values = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
param_grid_gnb = {
    'var_smoothing': gnb_var_smoothing_values,
}

In [None]:
# Tune Naive Bayes over `param_grid_gnb`, optimising recall.
# NOTE(review): `nb_model` is rebound to the tuned estimator, so re-running
# this cell tunes the already-tuned model.
nb_model = tune_model(
    estimator=nb_model,
    optimize='recall',
    custom_grid=param_grid_gnb,
)

- We choose **Recall** as the metric to optimize.
- We assume that:
    - The cost of acquiring new customers is higher than the cost of retaining existing ones.
    - Retention efforts are not exorbitantly expensive.