In [1]:
!pip install catboost
!pip install imbalanced-learn
!pip install lightgbm





In [2]:
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

# Feature Transformation Related Methods

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, ClassifierMixin
from imblearn.combine import SMOTEENN, SMOTETomek


# MachineLearning Models

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

## Load Data


In [3]:
# Location of the raw Telco churn CSV. The default is the original local path;
# set the TELCO_CHURN_CSV environment variable to run the notebook on another
# machine without editing this cell.
DATA_PATH = os.environ.get(
    "TELCO_CHURN_CSV",
    "C:/Users/ANANDHU/OneDrive/Desktop/Documents/mini_project_1/Telco_Customer_Churn.csv",
)
data = pd.read_csv(DATA_PATH)
data.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Convert TotalCharges to Float

In [4]:
# Convert TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
total_charges_numeric = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = total_charges_numeric

# Per-column count of missing values after the conversion.
data.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

## Checking Duplicate Values

In [5]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

## Remove customerID from the DataFrame

In [6]:
# Drop the customerID column before modelling. Rebinding `data` (instead of
# using inplace=True) together with errors='ignore' makes this cell safe to
# re-run: a second execution no longer raises KeyError on the missing column.
data = data.drop(columns='customerID', errors='ignore')

## Checking that customerID was removed

In [7]:
# Show the first two rows to confirm the customerID column is gone.
data.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [8]:
# Target: the 'Churn' column on its own.
y = data['Churn']
# Features: every column of `data` except 'Churn'.
X = data.drop(columns=['Churn'])

In [9]:
X.shape, y.shape      # Shapes of the feature frame and the target series

((7043, 19), (7043,))

## Model Training With all Features

In [10]:
# Preprocessing: numeric and categorical columns go through separate pipelines
# that a ColumnTransformer then recombines.

# Column names grouped by dtype.
numeric = list(X.select_dtypes(include='number').columns)
categorical = list(X.select_dtypes(include='object').columns)

# Numeric columns: fill missing values with the median, then standardise.
num_pipline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: ordinal-encode each category.
cat_pipline = Pipeline(steps=[('encoder', OrdinalEncoder())])

# Apply each pipeline to its own column group; numeric columns are listed first.
preprosser = ColumnTransformer(transformers=[
    ('numeric', num_pipline, numeric),
    ('categorical', cat_pipline, categorical),
])
preprosser

In [11]:
# NOTE(review): the preprocessor is fitted on all of X, i.e. before the
# train/test split that evaluate_models performs later, so the fitted
# imputation/scaling statistics also see the rows that end up in the test set.
X_pre_transformed = preprosser.fit_transform(X)    # Fit the preprocessor on X and transform it
X_pre_transformed.shape                            # Shape of the transformed feature array


(7043, 19)

In [12]:

X_pre_transformed         # Display the transformed feature array

array([[-0.43991649, -1.27744458, -1.16032292, ...,  0.        ,
         1.        ,  2.        ],
       [-0.43991649,  0.06632742, -0.25962894, ...,  1.        ,
         0.        ,  3.        ],
       [-0.43991649, -1.23672422, -0.36266036, ...,  0.        ,
         1.        ,  3.        ],
       ...,
       [-0.43991649, -0.87024095, -1.1686319 , ...,  0.        ,
         1.        ,  2.        ],
       [ 2.27315869, -1.15528349,  0.32033821, ...,  0.        ,
         1.        ,  3.        ],
       [-0.43991649,  1.36937906,  1.35896134, ...,  2.        ,
         1.        ,  0.        ]])

In [13]:
# Encode the target labels as integers.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Shape of the encoded target.
y_encoded.shape

(7043,)

In [14]:
# SMOTEENN combines SMOTE over-sampling with Edited Nearest Neighbours cleaning.
# A fixed random_state makes the resampled set (and every score derived from it)
# reproducible across kernel restarts; 42 matches the seed of the train/test split.
# NOTE(review): resampling is applied to the full dataset, before evaluate_models
# splits off its test set, so the test rows are drawn from resampled data and the
# reported test metrics are likely optimistic. Consider resampling only the
# training portion.
smt = SMOTEENN(random_state=42)
X_resampled, y_resampled = smt.fit_resample(X_pre_transformed, y_encoded)
X_resampled.shape, y_resampled.shape

((6384, 19), (6384,))

In [15]:
def evaluate_clf(true, predicted):
    """Score predicted labels against the true labels.

    Parameters:
    - true: ground-truth labels.
    - predicted: labels produced by a classifier.

    Returns:
    - Tuple (accuracy, f1, precision, recall, roc_auc), in that order.
      Every metric, including ROC AUC, is computed from `predicted` as given.
    """
    metric_functions = (accuracy_score, f1_score, precision_score, recall_score, roc_auc_score)
    return tuple(metric(true, predicted) for metric in metric_functions)

## Training models

In [30]:
from sklearn.model_selection import train_test_split, GridSearchCV
from tqdm.notebook import tqdm
import pandas as pd

def _print_metrics(title, metrics):
    """Print the five scores returned by evaluate_clf under the given title line."""
    labels = ("Accuracy", "F1 Score", "Precision", "Recall", "ROC AUC Score")
    print(f"\n{title}")
    for label, value in zip(labels, metrics):
        print(f"- {label}: {value:.4f}")


def evaluate_models(X, y, models, params):
    """
    Evaluates multiple models using GridSearchCV and returns a report of their performance.

    Parameters:
    - X: Features (DataFrame or ndarray).
    - y: Target labels (Series or ndarray).
    - models: Dictionary of model names and their corresponding instances.
    - params: Dictionary of hyperparameter grids for each model. A model with
      no entry is searched over an empty grid (its current parameters only).

    Returns:
    - report: DataFrame with one row per model ("Model Name", "Accuracy",
      "ROC AUC Score") on the held-out test split, sorted by accuracy
      (highest first).
    - report_of_models: dict mapping each model name to its test accuracy,
      in the iteration order of `models`.
    """

    # Hold out 20% of the data for testing; the fixed seed keeps the split stable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Containers for the results of every model
    report_of_models = {}
    model_names = []
    accuracy_scores = []
    auc_scores = []

    for model_name, model_instance in tqdm(models.items(), desc="Evaluating models"):
        print(f"Evaluating {model_name}...")
        param_grid = params.get(model_name, {})

        # 3-fold grid search over the model's hyperparameters, selected by accuracy
        gs = GridSearchCV(model_instance, param_grid, cv=3, scoring='accuracy', verbose=1)
        gs.fit(X_train, y_train)

        # With the default refit=True, GridSearchCV has already refitted the best
        # configuration on the whole training split, so no extra fit is needed.
        best_model = gs.best_estimator_

        # Predictions for train and test sets
        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)

        # Evaluate performance on training and test sets
        train_metrics = evaluate_clf(y_train, y_train_pred)
        test_metrics = evaluate_clf(y_test, y_test_pred)

        # Display model performance
        print(f"Best Parameters for {model_name}: {gs.best_params_}")
        _print_metrics("Model performance on Training Set:", train_metrics)
        _print_metrics("Model performance on Test Set:", test_metrics)
        print("=" * 40)

        # Collect the test-set results for the report
        model_names.append(model_name)
        accuracy_scores.append(test_metrics[0])
        auc_scores.append(test_metrics[4])
        report_of_models[model_name] = test_metrics[0]

    # Create a performance report DataFrame
    report = pd.DataFrame({
        "Model Name": model_names,
        "Accuracy": accuracy_scores,
        "ROC AUC Score": auc_scores
    }).sort_values(by="Accuracy", ascending=False)

    # Callers in this notebook unpack two values: the report and the score dict.
    return report, report_of_models



## Define models with parameters

In [17]:
# Candidate classifiers, keyed by the name used in the reports.
# Every estimator with a stochastic component gets a fixed random_state so
# that re-running the notebook reproduces the same scores; 42 matches the
# seed used for the train/test split.

models = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42),
    'LGBMClassifier': LGBMClassifier(random_state=42),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'GaussianNB': GaussianNB()
}

# Hyperparameter grids searched by GridSearchCV, keyed by the same model names.
# An empty grid means the model is evaluated with its current parameters only.

param_grids = {
    'LogisticRegression': {
        "class_weight":["balanced"],
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'min_samples_split': [2]
    },
    'AdaBoostClassifier': {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1]
    },
    'DecisionTreeClassifier': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2]
    },
    'GaussianNB': {},
    'XGBClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 6]
    },
    'LGBMClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 63]
    }
}

In [18]:
# Evaluate every model in `models` on the resampled all-features data.
# NOTE(review): unpacking two values requires evaluate_models to return a
# (report, score-dict) pair — confirm the definition in use does so.
report,report_dict = evaluate_models(X_resampled, y_resampled, models, param_grids)

Evaluating models:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating LogisticRegression...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters for LogisticRegression: {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'}

Model performance on Training Set:
- Accuracy: 0.9091
- F1 Score: 0.9182
- Precision: 0.9270
- Recall: 0.9095
- ROC AUC Score: 0.9091

Model performance on Test Set:
- Accuracy: 0.9107
- F1 Score: 0.9201
- Precision: 0.9162
- Recall: 0.9239
- ROC AUC Score: 0.9091
Evaluating KNeighborsClassifier...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Parameters for KNeighborsClassifier: {'n_neighbors': 3}

Model performance on Training Set:
- Accuracy: 0.9875
- F1 Score: 0.9889
- Precision: 0.9814
- Recall: 0.9965
- ROC AUC Score: 0.9862

Model performance on Test Set:
- Accuracy: 0.9624
- F1 Score: 0.9668
- Precision: 0.9497
- Recall: 0.9845
- ROC AUC Score: 0.9596
Evaluating SVC...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Parameters for SV

In [19]:
# Display the model comparison for the all-features run.
report

Unnamed: 0,Model Name,Accuracy,ROC AUC Score
1,KNeighborsClassifier,0.962412,0.959626
7,LGBMClassifier,0.960063,0.958579
3,RandomForestClassifier,0.95928,0.958052
4,GradientBoostingClassifier,0.95928,0.958052
6,XGBClassifier,0.95928,0.957519
2,SVC,0.955364,0.953643
8,DecisionTreeClassifier,0.927956,0.927396
5,AdaBoostClassifier,0.912294,0.908339
0,LogisticRegression,0.910728,0.909062
9,GaussianNB,0.900548,0.90115


## Model Training With 6 Features

In [20]:
# Six features that, per the author's EDA (not shown here), relate most to churn.
selected_features = ['gender', 'InternetService', 'Contract', 'tenure', 'MonthlyCharges', 'TotalCharges']
X_2 = X[selected_features]
y_2 = y

In [21]:
X_2.head(3)   # Display the first 3 rows of X_2

Unnamed: 0,gender,InternetService,Contract,tenure,MonthlyCharges,TotalCharges
0,Female,DSL,Month-to-month,1,29.85,29.85
1,Male,DSL,One year,34,56.95,1889.5
2,Male,DSL,Month-to-month,2,53.85,108.15


In [22]:
y_2.head(3)  # Display the first 3 values of y_2

0     No
1     No
2    Yes
Name: Churn, dtype: object

## Data Transformation: combine the Pipelines in a ColumnTransformer


In [23]:
# Preprocessing for the six-feature subset: separate pipelines for numeric and
# categorical columns, recombined by a ColumnTransformer.

# Column groups, listed explicitly
numeric_1 = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_1 = ['gender', 'InternetService', 'Contract']

# Numeric pipeline: median imputation followed by standardisation
num_pipline_1 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: ordinal encoding
cat_pipline_1 = Pipeline(steps=[('encoder', OrdinalEncoder())])

# ColumnTransformer applying each pipeline to its own column group
preprosser_1 = ColumnTransformer(transformers=[
    ('numeric', num_pipline_1, numeric_1),
    ('categorical', cat_pipline_1, categorical_1),
])
preprosser_1

In [24]:
X_pre_transformed_1 = preprosser_1.fit_transform(X_2)                # Fit the preprocessor on X_2 and transform it
X_pre_transformed_1.shape                                            # Shape of the transformed feature array

(7043, 6)

In [25]:
X_pre_transformed_1                  # Display the transformed feature array

array([[-1.27744458, -1.16032292, -0.99424194,  0.        ,  0.        ,
         0.        ],
       [ 0.06632742, -0.25962894, -0.17324413,  1.        ,  0.        ,
         1.        ],
       [-1.23672422, -0.36266036, -0.95967407,  1.        ,  0.        ,
         0.        ],
       ...,
       [-0.87024095, -1.1686319 , -0.85446945,  0.        ,  0.        ,
         0.        ],
       [-1.15528349,  0.32033821, -0.87206242,  1.        ,  1.        ,
         0.        ],
       [ 1.36937906,  1.35896134,  2.01428802,  1.        ,  1.        ,
         2.        ]])

In [26]:
# Encode the target labels as integers (rebinds `le` to a fresh encoder)

le = LabelEncoder()
y_encoded_1 = le.fit_transform(y_2)
y_encoded_1

# The cell displays the encoded array itself, not its shape

array([0, 0, 1, ..., 0, 1, 0])

In [27]:
# Apply SMOTEENN to the pre-transformed six-feature matrix and encoded target.
# A fixed random_state makes the resampled set reproducible across kernel
# restarts; 42 matches the seed of the train/test split.
# NOTE(review): resampling happens before evaluate_models splits off its test
# set, so the reported test metrics are likely optimistic.
smt_1 = SMOTEENN(random_state=42)
X_resampled_1, y_resampled_1 = smt_1.fit_resample(X_pre_transformed_1, y_encoded_1)
X_resampled_1.shape, y_resampled_1.shape

((6133, 6), (6133,))

## Evaluates multiple models on the resampled data, identifies the best-performing model

In [28]:
# Evaluate every model on the resampled six-feature data.
report_2 = evaluate_models(X_resampled_1, y_resampled_1, models, param_grids)

# Pick the best model from THIS run. The previous code read `report_dict`,
# which holds the scores of the earlier all-features run.
# evaluate_models may hand back the report alone or a (report, scores) pair,
# so take the DataFrame from whichever shape was returned.
report_2_df = report_2[0] if isinstance(report_2, tuple) else report_2

# sort_index() restores the order in which the models were evaluated, so a tie
# on accuracy resolves to the first model in `models`, as before.
report_2_ordered = report_2_df.sort_index()
report_dict_2 = dict(zip(report_2_ordered["Model Name"], report_2_ordered["Accuracy"]))

best_model_name = max(report_dict_2, key=report_dict_2.get)
best_model_score = report_dict_2[best_model_name]

# NOTE(review): models[...] is the prototype instance that was passed to
# GridSearchCV, which presumably fits clones rather than this object — confirm
# whether a tuned, fitted estimator is needed downstream.
best_model = models[best_model_name]


Evaluating models:   0%|          | 0/10 [00:00<?, ?it/s]

Evaluating LogisticRegression...
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters for LogisticRegression: {'C': 10, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}

Model performance on Training Set:
- Accuracy: 0.8977
- F1 Score: 0.9027
- Precision: 0.8792
- Recall: 0.9275
- ROC AUC Score: 0.8970

Model performance on Test Set:
- Accuracy: 0.8998
- F1 Score: 0.9014
- Precision: 0.8727
- Recall: 0.9320
- ROC AUC Score: 0.9003
Evaluating KNeighborsClassifier...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Parameters for KNeighborsClassifier: {'n_neighbors': 3}

Model performance on Training Set:
- Accuracy: 0.9914
- F1 Score: 0.9916
- Precision: 0.9905
- Recall: 0.9928
- ROC AUC Score: 0.9914

Model performance on Test Set:
- Accuracy: 0.9821
- F1 Score: 0.9819
- Precision: 0.9755
- Recall: 0.9884
- ROC AUC Score: 0.9822
Evaluating SVC...
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Parameters for S

In [29]:
# Display the result of the six-feature evaluation run.
report_2

(                   Model Name  Accuracy  ROC AUC Score
 3      RandomForestClassifier  0.983700       0.983779
 7              LGBMClassifier  0.983700       0.983807
 1        KNeighborsClassifier  0.982070       0.982176
 6               XGBClassifier  0.975550       0.975738
 8      DecisionTreeClassifier  0.975550       0.975627
 4  GradientBoostingClassifier  0.972290       0.972449
 2                         SVC  0.925020       0.924971
 9                  GaussianNB  0.903830       0.904193
 5          AdaBoostClassifier  0.901385       0.901789
 0          LogisticRegression  0.899756       0.900298,
 {'LogisticRegression': 0.8997555012224939,
  'KNeighborsClassifier': 0.9820700896495518,
  'SVC': 0.9250203748981255,
  'RandomForestClassifier': 0.9837000814995925,
  'GradientBoostingClassifier': 0.9722901385493072,
  'AdaBoostClassifier': 0.9013854930725347,
  'XGBClassifier': 0.9755501222493888,
  'LGBMClassifier': 0.9837000814995925,
  'DecisionTreeClassifier': 0.97555012224