# Telco Customer Churn Prediction Results

## Introduction and Context

## Libraries and Configurations

In [1]:
import sys
from pathlib import Path

# Resolve the project root as the parent of the current working directory and
# make it importable so the `src` package can be found.
project_root = Path().resolve().parent

# Only add the entry once, so re-running this cell does not pile up
# duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [None]:
import src.utils as ut
import src.preprocess as pp
import src.config as cf
import src.train as tn
import src.evaluate as et

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve

# Lift the column display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)

## Data Loading

In [None]:
# Load the raw dataset through the project helper, using the path stored
# under the 'data_raw' key of the project config.
raw_data = ut.load_data(cf.paths['data_raw'])
# Preview the first rows.
raw_data.head()

## Data Splitting and Preprocessing

### Target - Feature and Train - Test Split

In [4]:
# Separate the label column from the predictors.
y = raw_data['Churn']
X = raw_data.drop(columns='Churn')

# Hold out a test partition; its size and the random seed come from the
# project config.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=cf.test_size,
    random_state=cf.random_state,
)

# Keep independent copies of every split for error analysis, because the
# working variables are overwritten by the preprocessing cells below.
X_train_plain, X_test_plain = X_train.copy(), X_test.copy()
y_train_plain, y_test_plain = y_train.copy(), y_test.copy()

### Pipeline Definition

In [5]:
# Transformer used for the target variable.
target_preprocessing_pipeline = pp.TargetPreprocess()

# Feature pipeline: each named step is instantiated with its defaults and
# the steps run in the order listed here.
feature_preprocessing_pipeline = Pipeline(
    steps=[
        (step_name, transformer_cls())
        for step_name, transformer_cls in (
            ('feature_selection', pp.FeatureSelector),
            ('data_cleaning', pp.DataCleaning),
            ('feature_engineering', pp.FeatureEngineering),
            ('outlier_detection', pp.OutlierDetector),
            ('missing_values', pp.MissingValuesHandler),
            ('encoding', pp.CategoricalEncoder),
            ('scaling', pp.Scaling),
        )
    ]
)

### Target Variable Preprocessing

In [6]:
# Fit the target transformer on the training labels only, then reuse that
# fitted state for the test labels. Re-fitting on the test split could learn
# a different mapping and would leak test information into preprocessing.
# NOTE(review): assumes TargetPreprocess exposes the usual `transform`
# method alongside `fit_transform` — confirm in src.preprocess.
y_train = target_preprocessing_pipeline.fit_transform(y_train)
y_test = target_preprocessing_pipeline.transform(y_test)


### Feature Variable Preprocessing

In [None]:
# Fit every step of the feature pipeline on the training features and
# replace X_train with the transformed result (the untouched copy lives in
# X_train_plain).
X_train = feature_preprocessing_pipeline.fit_transform(X_train)

In [None]:
# Preview the first rows of the preprocessed training features.
X_train.head()

In [None]:
# Apply the pipeline that was already fitted on the training features.
# Calling fit_transform here would re-fit every step on the test split, so
# the test features would be prepared with different learned parameters than
# the ones the models were trained on, and test-set statistics would leak
# into preprocessing.
# NOTE(review): requires every custom step in src.preprocess to implement
# `transform` — confirm.
X_test = feature_preprocessing_pipeline.transform(X_test)

In [None]:
# Preview the first rows of the preprocessed test features.
X_test.head()

## Modeling

### Logistic Regression Classifier

#### Training

##### Model Fitting

In [None]:
# Train the logistic regression through the project training helper; the
# returned mapping is unpacked below.
lr_results = tn.train_model(
    model_name='LogisticRegression',
    X_train=X_train,
    y_train=y_train,
    mode='manual',
)

# score summaries compared in the overfit check
lr_train_score = lr_results['cv_train_score']
lr_val_score = lr_results['cv_val_score']

# selected estimator and the hyperparameters it was built with
lr_best_model = lr_results['best_model']
lr_best_params = lr_results['best_params']

##### Overfit Evaluation

In [None]:
# Put the train and validation score summaries side by side to check for
# overfitting, rendering the 'mean' and 'std' columns as percentages.
# After this cell the name refers to a pandas Styler, not a DataFrame.
lr_scores_df = pd.DataFrame(
    data=[lr_train_score, lr_val_score],
    index=['Train Score', 'Validation Score'],
).style.format(dict.fromkeys(('mean', 'std'), '{:.2%}'))

lr_scores_df

##### Best Hyperparameters

In [None]:
# One row per hyperparameter of the selected logistic regression model.
pd.DataFrame(
    data=[*lr_best_params.items()],
    columns=['Hyperparameter', 'Value'],
)

#### Testing

##### Predictions

In [14]:
# Hard class predictions of the logistic regression on the test features.
# y_pred / y_pred_proba are shared names: every model section rebinds them.
y_pred = lr_best_model.predict(X_test)

# Predicted probability taken from column index 1 of predict_proba
# (presumably the churn / positive class — confirm against classes_).
y_pred_proba = lr_best_model.predict_proba(X_test)[:, 1]

##### Classification Report

In [None]:
# Classification report for the logistic regression predictions made above
# (helper from src.evaluate).
et.generate_classification_report(y_test, y_pred)

##### Confusion Matrix

In [None]:
# Confusion matrix of true test labels vs. logistic regression predictions.
et.plot_confusion_matrix(y_test, y_pred)

##### AUC-ROC Curve

In [None]:
# ROC curve built from the logistic regression probability scores.
et.plot_roc_curve(y_test, y_pred_proba)

##### Precision - Recall Curve

In [None]:
# Precision-recall curve built from the logistic regression probability scores.
et.plot_precision_recall(y_test, y_pred_proba)

##### Error Analysis

In [19]:
# Untransformed test rows whose logistic regression prediction differs from
# the true label.
# NOTE(review): assumes y_test / y_pred still line up row-for-row with
# X_test_plain, i.e. preprocessing neither dropped nor reordered rows — confirm.
missclassified = X_test_plain[y_test != y_pred]

### SVM Classifier

#### Training

##### Model Fitting

In [None]:
# Train the SVM through the project training helper; the returned mapping
# is unpacked below.
svm_results = tn.train_model(
    model_name='SVM',
    X_train=X_train,
    y_train=y_train,
    mode='manual',
)

# score summaries compared in the overfit check
svm_train_score = svm_results['cv_train_score']
svm_val_score = svm_results['cv_val_score']

# selected estimator and the hyperparameters it was built with
svm_best_model = svm_results['best_model']
svm_best_params = svm_results['best_params']

##### Overfit Evaluation

In [None]:
# Put the train and validation score summaries side by side to check for
# overfitting, rendering the 'mean' and 'std' columns as percentages.
# After this cell the name refers to a pandas Styler, not a DataFrame.
svm_scores_df = pd.DataFrame(
    data=[svm_train_score, svm_val_score],
    index=['Train Score', 'Validation Score'],
).style.format(dict.fromkeys(('mean', 'std'), '{:.2%}'))

svm_scores_df

##### Best Hyperparameters

In [None]:
# One row per hyperparameter of the selected SVM model.
pd.DataFrame(
    data=[*svm_best_params.items()],
    columns=['Hyperparameter', 'Value'],
)

#### Testing

##### Predictions

In [None]:
# Hard class predictions of the SVM on the test features.
# y_pred / y_pred_proba are shared names: every model section rebinds them.
y_pred = svm_best_model.predict(X_test)

# Predicted probability taken from column index 1 of predict_proba
# (presumably the churn / positive class — confirm against classes_).
# NOTE(review): if this is a scikit-learn SVC, predict_proba is only
# available when the estimator was created with probability=True — confirm
# in src.train.
y_pred_proba = svm_best_model.predict_proba(X_test)[:, 1]

##### Classification Report

In [None]:
# Classification report for the SVM predictions made above
# (helper from src.evaluate).
et.generate_classification_report(y_test, y_pred)

##### Confusion Matrix

In [None]:
# Confusion matrix of true test labels vs. SVM predictions.
et.plot_confusion_matrix(y_test, y_pred)

##### AUC-ROC Curve

In [None]:
# ROC curve built from the SVM probability scores.
et.plot_roc_curve(y_test, y_pred_proba)

##### Precision - Recall Curve

In [None]:
# Precision-recall curve built from the SVM probability scores.
et.plot_precision_recall(y_test, y_pred_proba)

##### Error Analysis

In [None]:
# Untransformed test rows whose SVM prediction differs from the true label.
# NOTE(review): assumes y_test / y_pred still line up row-for-row with
# X_test_plain, i.e. preprocessing neither dropped nor reordered rows — confirm.
missclassified = X_test_plain[y_test != y_pred]

### Random Forest Classifier

#### Training

##### Model Fitting

In [None]:
# Train the random forest through the project training helper; the returned
# mapping is unpacked below.
rf_results = tn.train_model(
    model_name='RandomForest',
    X_train=X_train,
    y_train=y_train,
    mode='manual',
)

# score summaries compared in the overfit check
rf_train_score = rf_results['cv_train_score']
rf_val_score = rf_results['cv_val_score']

# selected estimator and the hyperparameters it was built with
rf_best_model = rf_results['best_model']
rf_best_params = rf_results['best_params']

##### Overfit Evaluation

In [None]:
# Put the train and validation score summaries side by side to check for
# overfitting, rendering the 'mean' and 'std' columns as percentages.
# After this cell the name refers to a pandas Styler, not a DataFrame.
rf_scores_df = pd.DataFrame(
    data=[rf_train_score, rf_val_score],
    index=['Train Score', 'Validation Score'],
).style.format(dict.fromkeys(('mean', 'std'), '{:.2%}'))

rf_scores_df

##### Best Hyperparameters

In [None]:
# One row per hyperparameter of the selected random forest model.
pd.DataFrame(
    data=[*rf_best_params.items()],
    columns=['Hyperparameter', 'Value'],
)

##### Feature Importance

In [None]:
# Global seaborn look for the chart.
sns.set_style('dark')
sns.set_palette('dark')

# Pair each preprocessed training column with the random forest's importance
# score and order the table from most to least important.
importances = rf_best_model.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart, one bar per feature; hue mirrors the feature so the
# viridis palette is spread across the bars.
plt.figure(figsize=(10, 8))
sns.barplot(
    data=importance_df,
    x='Importance',
    y='Feature',
    hue='Feature',
    palette='viridis',
    dodge=False,
)
plt.show()

#### Testing

##### Predictions

In [15]:
# Hard class predictions of the random forest on the test features.
# y_pred / y_pred_proba are shared names: every model section rebinds them.
y_pred = rf_best_model.predict(X_test)

# Predicted probability taken from column index 1 of predict_proba
# (presumably the churn / positive class — confirm against classes_).
y_pred_proba = rf_best_model.predict_proba(X_test)[:, 1]

##### Classification Report

In [None]:
# Classification report for the random forest predictions made above
# (helper from src.evaluate).
et.generate_classification_report(y_test, y_pred)

##### Confusion Matrix

In [None]:
# Confusion matrix of true test labels vs. random forest predictions.
et.plot_confusion_matrix(y_test, y_pred)

##### AUC-ROC Curve

In [None]:
# ROC curve built from the random forest probability scores.
et.plot_roc_curve(y_test, y_pred_proba)

##### Precision - Recall Curve

In [None]:
# Precision-recall curve built from the random forest probability scores.
et.plot_precision_recall(y_test, y_pred_proba)

##### Error Analysis

In [20]:
# Untransformed test rows whose random forest prediction differs from the
# true label.
# NOTE(review): assumes y_test / y_pred still line up row-for-row with
# X_test_plain, i.e. preprocessing neither dropped nor reordered rows — confirm.
missclassified = X_test_plain[y_test != y_pred]

In [None]:
# Inspect the misclassified rows with the project helper from src.utils
# (presumably summarises the distinct values of each categorical column —
# confirm in src.utils).
ut.uniqueness_categorical_columns(missclassified)

### Gradient Boosting

#### Training

##### Model Fitting

In [None]:
# Train the gradient boosting model through the project training helper; the
# returned mapping is unpacked below.
gb_results = tn.train_model(
    model_name='GradientBoosting',
    X_train=X_train,
    y_train=y_train,
    mode='manual',
)

# score summaries compared in the overfit check
gb_train_score = gb_results['cv_train_score']
gb_val_score = gb_results['cv_val_score']

# selected estimator and the hyperparameters it was built with
gb_best_model = gb_results['best_model']
gb_best_params = gb_results['best_params']

##### Overfit Evaluation

In [None]:
# Put the train and validation score summaries side by side to check for
# overfitting, rendering the 'mean' and 'std' columns as percentages.
# After this cell the name refers to a pandas Styler, not a DataFrame.
gb_scores_df = pd.DataFrame(
    data=[gb_train_score, gb_val_score],
    index=['Train Score', 'Validation Score'],
).style.format(dict.fromkeys(('mean', 'std'), '{:.2%}'))

gb_scores_df

##### Best Hyperparameters

In [None]:
# One row per hyperparameter of the selected gradient boosting model.
pd.DataFrame(
    data=[*gb_best_params.items()],
    columns=['Hyperparameter', 'Value'],
)

##### Feature Importance

In [None]:
# Global seaborn look for the chart.
sns.set_style('dark')
sns.set_palette('dark')

# Pair each preprocessed training column with the gradient boosting model's
# importance score and order the table from most to least important.
importances = gb_best_model.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart, one bar per feature; hue mirrors the feature so the
# viridis palette is spread across the bars.
plt.figure(figsize=(10, 8))
sns.barplot(
    data=importance_df,
    x='Importance',
    y='Feature',
    hue='Feature',
    palette='viridis',
    dodge=False,
)
plt.show()

#### Testing

##### Predictions

In [26]:
# Hard class predictions of the gradient boosting model on the test features.
# y_pred / y_pred_proba are shared names: every model section rebinds them.
y_pred = gb_best_model.predict(X_test)

# Predicted probability taken from column index 1 of predict_proba
# (presumably the churn / positive class — confirm against classes_).
y_pred_proba = gb_best_model.predict_proba(X_test)[:, 1]

##### Classification Report

In [None]:
# Classification report for the gradient boosting predictions made above
# (helper from src.evaluate).
et.generate_classification_report(y_test, y_pred)

##### Confusion Matrix

In [None]:
# Confusion matrix of true test labels vs. gradient boosting predictions.
et.plot_confusion_matrix(y_test, y_pred)

##### AUC-ROC Curve

In [None]:
# ROC curve built from the gradient boosting probability scores.
et.plot_roc_curve(y_test, y_pred_proba)

##### Precision - Recall Curve

In [None]:
# Precision-recall curve built from the gradient boosting probability scores.
et.plot_precision_recall(y_test, y_pred_proba)

## Model Benchmarking