## Model Development and Evaluation - Part 02

+ On top of the Previous Models
+ Performed Stratified Sampling
+ Cross Validation
+ Regularization
+ Hyperparameter Tuning
+ Evaluate the models' performance using the same metrics.

In [1]:
## import required libraries here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, \
                            roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

In [2]:
## Read the preprocessed customer dataset and separate the label from the predictors
data = pd.read_csv('processed_customer_data.csv')

## 'Churn' is the prediction target; it and 'customerID' are both excluded from the feature matrix
target = data['Churn']
features = data.drop(columns=['Churn', 'customerID'])

## Quick sanity check on the dimensions of both pieces
print("Shape of the feature dataset :", features.shape)
print("Shape of the target dataset :", target.shape)

Shape of the feature dataset : (7043, 43)
Shape of the target dataset : (7043,)


In [3]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
# NOTE(review): no `stratify=` argument is passed here, so this hold-out split itself is
# not stratified (only the cross-validation folds are) - confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four resulting pieces
for split_label, split_part in (
    ("Training Features : ", X_train),
    ("Testing Features : ", X_test),
    ("Training Target : ", y_train),
    ("Testing Target : ", y_test),
):
    print(split_label, split_part.shape)

Training Features :  (5634, 43)
Testing Features :  (1409, 43)
Training Target :  (5634,)
Testing Target :  (1409,)


### Phase 02

In [4]:
# Shuffled 5-fold stratified splitter (fixed seed), shared by every grid search in this cell
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate classifiers, each paired with the hyperparameter grid to search.
# Layout: display name -> (unfitted estimator, parameter grid)
models = {
    "Logistic Regression": (
        LogisticRegression(random_state=42, max_iter=1000),
        {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.1, 1, 10],
            'solver': ['liblinear'],
        },
    ),
    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
        },
    ),
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
        },
    ),
    "Gradient Boosting": (
        GradientBoostingClassifier(random_state=42),
        {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        },
    ),
    "XGBoost": (
        XGBClassifier(random_state=42, eval_metric='logloss'),
        {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        },
    ),
    "SVM": (
        SVC(random_state=42, probability=True),
        {
            'C': [0.01, 0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto'],
        },
    ),
    "KNN": (
        KNeighborsClassifier(),
        {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
        },
    ),
    # Empty grid: GaussianNB is only cross-validated with its default settings
    "Naive Bayes": (GaussianNB(), {}),
    "LightGBM": (
        LGBMClassifier(random_state=42),
        {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        },
    ),
}

# Tune every candidate on the training split, then score the winner on the held-out test split
for model_name, (model, param_grid) in models.items():
    print(f"Tuning hyperparameters for {model_name}...")

    # Exhaustive search over the grid; candidates are ranked by mean CV accuracy
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=skf,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Estimator refitted with the winning parameter combination
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

    # Hard predictions, plus second-column probabilities when the estimator exposes them
    y_pred = best_model.predict(X_test)
    if hasattr(best_model, "predict_proba"):
        y_pred_prob = best_model.predict_proba(X_test)[:, 1]
    else:
        y_pred_prob = None

    # Test-set accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")

    # Per-class precision / recall / F1
    print(f"Classification Report - {model_name}:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix as printed by scikit-learn
    print(f"Confusion Matrix - {model_name}:")
    print(confusion_matrix(y_test, y_pred))

    # Disabled: ROC / precision-recall reporting (the only consumer of y_pred_prob)
#     # Plot and compute AUC-ROC Curve if model provides probability scores
#     if y_pred_prob is not None:
#         # AUC-ROC Score
#         auc_score = roc_auc_score(y_test, y_pred_prob)
#         print(f"AUC-ROC Score - {model_name}: {auc_score:.2f}")
        
#         # ROC Curve
#         fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
#         plt.figure()
#         plt.plot(fpr, tpr, color='darkorange', label=f'ROC curve (area = {auc_score:.2f})')
#         plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
#         plt.xlabel('False Positive Rate')
#         plt.ylabel('True Positive Rate')
#         plt.title(f'Receiver Operating Characteristic (ROC) Curve - {model_name}')
#         plt.legend(loc="lower right")
#         plt.show()
        
#         # Precision-Recall Curve
#         precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
#         avg_precision = average_precision_score(y_test, y_pred_prob)
#         print(f"Average Precision Score - {model_name}: {avg_precision:.2f}")
        
#         plt.figure()
#         plt.plot(recall, precision, color='b', label=f'Precision-Recall curve (area = {avg_precision:.2f})')
#         plt.xlabel('Recall')
#         plt.ylabel('Precision')
#         plt.title(f'Precision-Recall Curve - {model_name}')
#         plt.legend(loc="lower left")
#         plt.show()

    # Visual separator between models in the cell output
    print("\n" + "="*50 + "\n")


Tuning hyperparameters for Logistic Regression...
Best parameters for Logistic Regression: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Logistic Regression Accuracy: 0.82
Classification Report - Logistic Regression:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.69      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.78      0.75      0.76      1409
weighted avg       0.82      0.82      0.82      1409

Confusion Matrix - Logistic Regression:
[[935 101]
 [148 225]]


Tuning hyperparameters for Decision Tree...
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_split': 10}
Decision Tree Accuracy: 0.76
Classification Report - Decision Tree:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      1036
           1       0.56      0.52      0.54       373

    accuracy                      

### Phase 03

In [None]:
# Phase 03: same tuning/evaluation procedure as Phase 02, but over wider hyperparameter grids.

# Initialize Stratified K-Fold with 5 splits (shuffled, fixed seed for repeatable folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate classifiers, each paired with the hyperparameter grid to search.
# Layout: display name -> (unfitted estimator, parameter grid)
models = {
    "Logistic Regression": (LogisticRegression(random_state=42, max_iter=1000), 
                            {'penalty': ['l1', 'l2'], 
                             'C': [0.001, 0.01, 0.1, 1, 10, 100], 
                             'solver': ['liblinear', 'saga']}),

    "Decision Tree": (DecisionTreeClassifier(random_state=42), 
                      {'max_depth': [None, 5, 10, 15, 20, 30], 
                       'min_samples_split': [2, 5, 10, 20], 
                       'min_samples_leaf': [1, 2, 4]}),

    "Random Forest": (RandomForestClassifier(random_state=42), 
                      {'n_estimators': [100, 200, 300, 500], 
                       'max_depth': [None, 10, 20, 30, 50], 
                       'min_samples_split': [2, 5, 10],
                       'min_samples_leaf': [1, 2, 4]}),

    "Gradient Boosting": (GradientBoostingClassifier(random_state=42), 
                          {'n_estimators': [100, 200, 300], 
                           'learning_rate': [0.01, 0.05, 0.1, 0.2], 
                           'max_depth': [3, 5, 7, 9]}),

    "XGBoost": (XGBClassifier(random_state=42, eval_metric='logloss'), 
                {'n_estimators': [100, 200, 300], 
                 'learning_rate': [0.01, 0.05, 0.1, 0.2], 
                 'max_depth': [3, 5, 7, 9], 
                 'subsample': [0.7, 0.8, 0.9, 1], 
                 'colsample_bytree': [0.7, 0.8, 0.9, 1]}),

    "SVM": (SVC(random_state=42, probability=True), 
            {'C': [0.01, 0.1, 1, 10, 100], 
             'kernel': ['linear', 'rbf', 'poly'], 
             'gamma': ['scale', 'auto']}),

    "KNN": (KNeighborsClassifier(), 
            {'n_neighbors': [3, 5, 7, 9, 11], 
             'weights': ['uniform', 'distance'], 
             'p': [1, 2]}),

    "Naive Bayes": (GaussianNB(), {}),  # No hyperparameters to tune

    # LightGBM only applies row subsampling (bagging) when `subsample_freq` is non-zero;
    # with the default of 0 every `subsample` value in the grid would fit the same model
    # and the search would do 4x the work for nothing. Bagging is therefore enabled on
    # every iteration so that the `subsample` grid actually takes effect.
    "LightGBM": (LGBMClassifier(random_state=42, subsample_freq=1), 
                 {'n_estimators': [100, 200, 300], 
                  'learning_rate': [0.01, 0.05, 0.1, 0.2], 
                  'max_depth': [3, 5, 7, 9], 
                  'num_leaves': [31, 40, 50], 
                  'subsample': [0.7, 0.8, 0.9, 1]})
}

# Loop through each model: tune on the training split, then score on the held-out test split
for model_name, (model, param_grid) in models.items():
    print(f"Tuning hyperparameters for {model_name}...")
    
    # Use GridSearchCV to find the best hyperparameters (ranked by mean CV accuracy)
    grid_search = GridSearchCV(model, param_grid, cv=skf, scoring='accuracy', n_jobs=-1)  # Use Stratified K-Fold
    grid_search.fit(X_train, y_train)
    
    # Best estimator after hyperparameter tuning
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    
    # Make predictions with the best model; probabilities (second column) are kept
    # for the ROC / precision-recall section below, which is currently disabled
    y_pred = best_model.predict(X_test)
    y_pred_prob = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None
    
    # Calculate accuracy on the test split
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")
    
    # Display classification report
    print(f"Classification Report - {model_name}:")
    print(classification_report(y_test, y_pred))
    
    # Display confusion matrix
    print(f"Confusion Matrix - {model_name}:")
    print(confusion_matrix(y_test, y_pred))
    
#     # Plot and compute AUC-ROC Curve if model provides probability scores
#     if y_pred_prob is not None:
#         # AUC-ROC Score
#         auc_score = roc_auc_score(y_test, y_pred_prob)
#         print(f"AUC-ROC Score - {model_name}: {auc_score:.2f}")
        
#         # ROC Curve
#         fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
#         plt.figure()
#         plt.plot(fpr, tpr, color='darkorange', label=f'ROC curve (area = {auc_score:.2f})')
#         plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
#         plt.xlabel('False Positive Rate')
#         plt.ylabel('True Positive Rate')
#         plt.title(f'Receiver Operating Characteristic (ROC) Curve - {model_name}')
#         plt.legend(loc="lower right")
#         plt.show()
        
#         # Precision-Recall Curve
#         precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
#         avg_precision = average_precision_score(y_test, y_pred_prob)
#         print(f"Average Precision Score - {model_name}: {avg_precision:.2f}")
        
#         plt.figure()
#         plt.plot(recall, precision, color='b', label=f'Precision-Recall curve (area = {avg_precision:.2f})')
#         plt.xlabel('Recall')
#         plt.ylabel('Precision')
#         plt.title(f'Precision-Recall Curve - {model_name}')
#         plt.legend(loc="lower left")
#         plt.show()
    
    # Visual separator between models in the cell output
    print("\n" + "="*50 + "\n")

Tuning hyperparameters for Logistic Regression...
Best parameters for Logistic Regression: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Logistic Regression Accuracy: 0.82
Classification Report - Logistic Regression:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.69      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.78      0.75      0.76      1409
weighted avg       0.82      0.82      0.82      1409

Confusion Matrix - Logistic Regression:
[[936 100]
 [149 224]]


Tuning hyperparameters for Decision Tree...
Best parameters for Decision Tree: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree Accuracy: 0.80
Classification Report - Decision Tree:
              precision    recall  f1-score   support

           0       0.87      0.86      0.86      1036
           1       0.62      0.64      0.63       373

    accuracy      