## Model Development and Evaluation - Part 03

+ Applying one of the feature selection methods.
+ Feature Selection by Training the Random Forest Classifier.

#### Feature Selection by Training Random Forest Classifier

In [25]:
## import required libraries here
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, \
                            roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

In [2]:
## Load the preprocessed customer table.
data = pd.read_csv('processed_customer_data.csv')

## Fit a random forest on every predictor so its importances can rank them.
# 'Churn' is the target; 'customerID' is dropped as an irrelevant feature.
y = data['Churn']
X = data.drop(columns=['Churn', 'customerID'])

# NOTE(review): the forest is fitted on all rows, before the train/test split that
# happens in a later cell, so future test rows also influence this ranking.
rf_model = RandomForestClassifier(random_state=42).fit(X, y)

# One row per feature (indexed by column name), ordered from most to least important.
feature_importances = pd.DataFrame(
    {'importance': rf_model.feature_importances_},
    index=X.columns,
).sort_values(by='importance', ascending=False)

# Show the ranked importances.
print(feature_importances)

                                         importance
TotalCharges                               0.169007
tenure                                     0.145959
MonthlyCharges                             0.142845
Contract                                   0.064140
OnlineSecurity_No                          0.032541
PaymentMethod_Electronic check             0.031039
InternetService_Fiber optic                0.024319
TechSupport_No                             0.022592
SeniorCitizen                              0.019939
OnlineBackup_No                            0.019909
gender_Female                              0.017638
gender_Male                                0.017401
DeviceProtection_No                        0.016860
PaperlessBilling_Yes                       0.015118
PaperlessBilling_No                        0.014611
Partner_No                                 0.014557
Partner_Yes                                0.014058
OnlineSecurity_Yes                         0.012972
MultipleLine

**Decide on the Number of Top Features** 
+ A common approach is to select the features that cumulatively represent a high percentage (e.g., 80-90%) of the total importance or the top N features based on the importance ranking.

Features that together account for 80% of the cumulative importance

In [10]:
# Share of the total importance that the selected features may account for.
IMPORTANCE_THRESHOLD = 0.80

# Rank every feature from most to least important. The frame is re-sorted here so
# this cell does not rely on the order produced by an earlier cell; the stable sort
# leaves an already-sorted frame untouched.
ranked_importances = feature_importances.sort_values(
    'importance', ascending=False, kind='stable'
)

# Running total down the ranking: each row holds the importance of that feature
# plus all features ranked above it. The total is computed over the full ranking
# rather than a hand-copied list of names, so it stays correct if the ranking changes.
selected_feature_importances = ranked_importances.assign(
    cumulative_importance=ranked_importances['importance'].cumsum()
)

# Keep the leading features whose running total stays within the threshold.
within_threshold = (
    selected_feature_importances['cumulative_importance'] <= IMPORTANCE_THRESHOLD
)
top_features_80 = selected_feature_importances[within_threshold]

# Names of the retained features, taken from the ranking itself.
selected_features = top_features_80.index.tolist()

# Display the retained features together with their cumulative importance.
print(f"\nTop Features Representing {IMPORTANCE_THRESHOLD:.0%} Cumulative Importance:")
print(top_features_80)


Top Features Representing 80% Cumulative Importance:
                                importance  cumulative_importance
TotalCharges                      0.169007               0.169007
tenure                            0.145959               0.314966
MonthlyCharges                    0.142845               0.457811
Contract                          0.064140               0.521951
OnlineSecurity_No                 0.032541               0.554492
PaymentMethod_Electronic check    0.031039               0.585531
InternetService_Fiber optic       0.024319               0.609850
TechSupport_No                    0.022592               0.632443
SeniorCitizen                     0.019939               0.652382
OnlineBackup_No                   0.019909               0.672291
gender_Female                     0.017638               0.689929
gender_Male                       0.017401               0.707330
DeviceProtection_No               0.016860               0.724190
PaperlessBilling_Yes  

In [22]:
## Prepare the modelling data from the importance-based selection.
# Take the column names from the cumulative-importance result (`top_features_80`)
# instead of a hand-copied list, so this cell cannot drift from the ranking.
# 'Churn' is appended because it is the target column.
selected_features = top_features_80.index.tolist() + ['Churn']

# Select only the desired columns from the original DataFrame
selected_features_df = data[selected_features]

# Separate predictors from the target.
features = selected_features_df.drop('Churn', axis=1)
target = selected_features_df['Churn']
print("Shape of the feature dataset :", features.shape)
print("Shape of the target dataset :", target.shape)

## Split the data into training and testing sets (80% / 20%, fixed seed).
# NOTE(review): the split is not stratified, while cross-validation later uses
# StratifiedKFold; consider `stratify=target` (apply it to every split in the
# notebook at once so the feature-selection methods stay comparable).
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
print("Training Features : ", X_train.shape)
print("Testing Features : ", X_test.shape)
print("Training Target : ", y_train.shape)
print("Testing Target : ", y_test.shape)

Shape of the feature dataset : (7043, 18)
Shape of the target dataset : (7043,)
Training Features :  (5634, 18)
Testing Features :  (1409, 18)
Training Target :  (5634,)
Testing Target :  (1409,)


In [23]:
# Cross-validation scheme reused by every grid search: 5 shuffled, stratified folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate classifiers keyed by display name.
# Each value is an (estimator, hyperparameter grid) pair handed to GridSearchCV.
models = {
    "Logistic Regression": (
        LogisticRegression(random_state=42, max_iter=1000),
        {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear']},
    ),
    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]},
    ),
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]},
    ),
    "Gradient Boosting": (
        GradientBoostingClassifier(random_state=42),
        {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    ),
    "XGBoost": (
        XGBClassifier(random_state=42, eval_metric='logloss'),
        {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    ),
    "SVM": (
        SVC(random_state=42, probability=True),
        {'C': [0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']},
    ),
    "KNN": (
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    ),
    # Empty grid: there are no hyperparameters to tune for this model.
    "Naive Bayes": (GaussianNB(), {}),
    "LightGBM": (
        LGBMClassifier(random_state=42),
        {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    ),
}

# Tune each candidate on the training split, then report hold-out metrics.
for model_name in models:
    model, param_grid = models[model_name]
    print(f"Tuning hyperparameters for {model_name}...")

    # Exhaustive search over the grid, scored by accuracy on the stratified folds.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=skf,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Estimator exposed by the fitted search as its best configuration.
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

    # Hold-out predictions. Column 1 of predict_proba is kept only for the
    # disabled ROC / precision-recall section further down.
    y_pred = best_model.predict(X_test)
    if hasattr(best_model, "predict_proba"):
        y_pred_prob = best_model.predict_proba(X_test)[:, 1]
    else:
        y_pred_prob = None

    # Overall accuracy on the test split.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")

    # Per-class precision / recall / F1.
    print(f"Classification Report - {model_name}:")
    print(classification_report(y_test, y_pred))

    # Raw counts of true vs. predicted labels.
    print(f"Confusion Matrix - {model_name}:")
    print(confusion_matrix(y_test, y_pred))

    # --- Disabled: ROC and precision-recall curves (uses y_pred_prob) ---
#     # Plot and compute AUC-ROC Curve if model provides probability scores
#     if y_pred_prob is not None:
#         # AUC-ROC Score
#         auc_score = roc_auc_score(y_test, y_pred_prob)
#         print(f"AUC-ROC Score - {model_name}: {auc_score:.2f}")

#         # ROC Curve
#         fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
#         plt.figure()
#         plt.plot(fpr, tpr, color='darkorange', label=f'ROC curve (area = {auc_score:.2f})')
#         plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
#         plt.xlabel('False Positive Rate')
#         plt.ylabel('True Positive Rate')
#         plt.title(f'Receiver Operating Characteristic (ROC) Curve - {model_name}')
#         plt.legend(loc="lower right")
#         plt.show()

#         # Precision-Recall Curve
#         precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
#         avg_precision = average_precision_score(y_test, y_pred_prob)
#         print(f"Average Precision Score - {model_name}: {avg_precision:.2f}")

#         plt.figure()
#         plt.plot(recall, precision, color='b', label=f'Precision-Recall curve (area = {avg_precision:.2f})')
#         plt.xlabel('Recall')
#         plt.ylabel('Precision')
#         plt.title(f'Precision-Recall Curve - {model_name}')
#         plt.legend(loc="lower left")
#         plt.show()

    # Visual separator between models in the printed log.
    print("\n" + "="*50 + "\n")


Tuning hyperparameters for Logistic Regression...
Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Logistic Regression Accuracy: 0.82
Classification Report - Logistic Regression:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.68      0.58      0.63       373

    accuracy                           0.82      1409
   macro avg       0.77      0.74      0.75      1409
weighted avg       0.81      0.82      0.81      1409

Confusion Matrix - Logistic Regression:
[[933 103]
 [155 218]]


Tuning hyperparameters for Decision Tree...
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_split': 10}
Decision Tree Accuracy: 0.77
Classification Report - Decision Tree:
              precision    recall  f1-score   support

           0       0.84      0.85      0.84      1036
           1       0.57      0.55      0.56       373

    accuracy                     

#### Feature Selection by Recursive Feature Elimination (RFE)

In [38]:
## Recursive Feature Elimination (RFE) method
## load the dataset
data = pd.read_csv('processed_customer_data.csv')

## Predictors and target: 'Churn' is the target, 'customerID' is dropped as irrelevant.
X = data.drop(['Churn', 'customerID'], axis=1)
y = data['Churn']

# Base estimator that RFE refits while it eliminates features.
rf_model = RandomForestClassifier(random_state=42)

# Run RFE down to 20 features (the number of features to keep is a tunable choice).
# NOTE(review): RFE is fitted on all rows, before the train/test split done in the
# next cell, so future test rows also influence which features are kept.
rfe = RFE(estimator=rf_model, n_features_to_select=20)
rfe.fit(X, y)

# One row per feature with its RFE rank.
feature_ranking = pd.DataFrame({'Feature': X.columns, 'Ranking': rfe.ranking_})
# print(feature_ranking.sort_values('Ranking'))

# Features kept by RFE are the ones ranked 1.
selected_features_rfe = feature_ranking.loc[feature_ranking['Ranking'] == 1, 'Feature'].tolist()

# print(selected_features_rfe)

# Build the column list from the RFE result itself (plus the target column) rather
# than a hand-copied list of names, so the modelling data always matches what RFE
# actually selected.
selected_features = selected_features_rfe + ['Churn']

# Create a new DataFrame with only the selected features
data_selected_rfe = data[selected_features]

In [39]:
# Separate the RFE-selected predictors from the 'Churn' target.
target = data_selected_rfe['Churn']
features = data_selected_rfe.drop(columns='Churn')
print("Shape of the feature dataset :", features.shape)
print("Shape of the target dataset :", target.shape)

## Hold out 20% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Report the size of each split.
for _label, _split in (
    ("Training Features : ", X_train),
    ("Testing Features : ", X_test),
    ("Training Target : ", y_train),
    ("Testing Target : ", y_test),
):
    print(_label, _split.shape)



Shape of the feature dataset : (7043, 20)
Shape of the target dataset : (7043,)
Training Features :  (5634, 20)
Testing Features :  (1409, 20)
Training Target :  (5634,)
Testing Target :  (1409,)


In [40]:
# Cross-validation scheme reused by every grid search: 5 shuffled, stratified folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate classifiers keyed by display name.
# Each value is an (estimator, hyperparameter grid) pair handed to GridSearchCV.
models = {
    "Logistic Regression": (
        LogisticRegression(random_state=42, max_iter=1000),
        {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear']},
    ),
    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]},
    ),
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]},
    ),
    "Gradient Boosting": (
        GradientBoostingClassifier(random_state=42),
        {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    ),
    "XGBoost": (
        XGBClassifier(random_state=42, eval_metric='logloss'),
        {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    ),
    "SVM": (
        SVC(random_state=42, probability=True),
        {'C': [0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']},
    ),
    "KNN": (
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    ),
    # Empty grid: there are no hyperparameters to tune for this model.
    "Naive Bayes": (GaussianNB(), {}),
    "LightGBM": (
        LGBMClassifier(random_state=42),
        {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    ),
}

# Tune each candidate on the training split, then report hold-out metrics.
for model_name in models:
    model, param_grid = models[model_name]
    print(f"Tuning hyperparameters for {model_name}...")

    # Exhaustive search over the grid, scored by accuracy on the stratified folds.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=skf,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Estimator exposed by the fitted search as its best configuration.
    best_model = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

    # Hold-out predictions. Column 1 of predict_proba is kept only for the
    # disabled ROC / precision-recall section further down.
    y_pred = best_model.predict(X_test)
    if hasattr(best_model, "predict_proba"):
        y_pred_prob = best_model.predict_proba(X_test)[:, 1]
    else:
        y_pred_prob = None

    # Overall accuracy on the test split.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")

    # Per-class precision / recall / F1.
    print(f"Classification Report - {model_name}:")
    print(classification_report(y_test, y_pred))

    # Raw counts of true vs. predicted labels.
    print(f"Confusion Matrix - {model_name}:")
    print(confusion_matrix(y_test, y_pred))

    # --- Disabled: ROC and precision-recall curves (uses y_pred_prob) ---
#     # Plot and compute AUC-ROC Curve if model provides probability scores
#     if y_pred_prob is not None:
#         # AUC-ROC Score
#         auc_score = roc_auc_score(y_test, y_pred_prob)
#         print(f"AUC-ROC Score - {model_name}: {auc_score:.2f}")

#         # ROC Curve
#         fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
#         plt.figure()
#         plt.plot(fpr, tpr, color='darkorange', label=f'ROC curve (area = {auc_score:.2f})')
#         plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
#         plt.xlabel('False Positive Rate')
#         plt.ylabel('True Positive Rate')
#         plt.title(f'Receiver Operating Characteristic (ROC) Curve - {model_name}')
#         plt.legend(loc="lower right")
#         plt.show()

#         # Precision-Recall Curve
#         precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
#         avg_precision = average_precision_score(y_test, y_pred_prob)
#         print(f"Average Precision Score - {model_name}: {avg_precision:.2f}")

#         plt.figure()
#         plt.plot(recall, precision, color='b', label=f'Precision-Recall curve (area = {avg_precision:.2f})')
#         plt.xlabel('Recall')
#         plt.ylabel('Precision')
#         plt.title(f'Precision-Recall Curve - {model_name}')
#         plt.legend(loc="lower left")
#         plt.show()

    # Visual separator between models in the printed log.
    print("\n" + "="*50 + "\n")

Tuning hyperparameters for Logistic Regression...
Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Logistic Regression Accuracy: 0.82
Classification Report - Logistic Regression:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.68      0.58      0.63       373

    accuracy                           0.82      1409
   macro avg       0.77      0.74      0.75      1409
weighted avg       0.81      0.82      0.81      1409

Confusion Matrix - Logistic Regression:
[[936 100]
 [156 217]]


Tuning hyperparameters for Decision Tree...
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_split': 10}
Decision Tree Accuracy: 0.77
Classification Report - Decision Tree:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84      1036
           1       0.57      0.52      0.54       373

    accuracy                     