In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [3]:
data = pd.read_csv('C:\\Users\\ashis\\Downloads\\AspireNex\\Task-4 CUSTOMER CHURN PREDICTION\\WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [4]:
# Display the first few rows of the dataset
print(data.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [5]:
# Summary of the dataset
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
# Convert 'TotalCharges' to numeric, coerce errors to handle potential non-numeric values
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

In [14]:
# Check for missing values
print(data.isnull().sum())

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


In [15]:
# Fill missing 'TotalCharges' with the median value
data['TotalCharges'].fillna(data['TotalCharges'].median(), inplace=True)

In [16]:
# Encode categorical variables
le = LabelEncoder()
categorical_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']
for column in categorical_columns:
    data[column] = le.fit_transform(data[column])

In [17]:
# Display the first few rows after preprocessing
print(data.head())

   customerID  gender  SeniorCitizen  Partner  Dependents  tenure  \
0  7590-VHVEG       0              0        1           0       1   
1  5575-GNVDE       1              0        0           0      34   
2  3668-QPYBK       1              0        0           0       2   
3  7795-CFOCW       1              0        0           0      45   
4  9237-HQITU       0              0        0           0       2   

   PhoneService  MultipleLines  InternetService  OnlineSecurity  ...  \
0             0              1                0               0  ...   
1             1              0                0               2  ...   
2             1              0                0               2  ...   
3             0              1                0               2  ...   
4             1              0                1               0  ...   

   DeviceProtection  TechSupport  StreamingTV  StreamingMovies  Contract  \
0                 0            0            0                0         0   


In [18]:
# Split the data into training and testing sets
X = data.drop(columns=['customerID', 'Churn'])
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return accuracy, precision, recall, f1, roc_auc, cm

In [21]:
# Train and evaluate Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_metrics = evaluate_model(lr_model, X_test, y_test)

In [22]:
# Train and evaluate Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_metrics = evaluate_model(rf_model, X_test, y_test)

In [23]:
# Train and evaluate Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
gb_metrics = evaluate_model(gb_model, X_test, y_test)

In [24]:
# Print the evaluation metrics
print(f"Logistic Regression - Accuracy: {lr_metrics[0]}, Precision: {lr_metrics[1]}, Recall: {lr_metrics[2]}, F1 Score: {lr_metrics[3]}, ROC AUC: {lr_metrics[4]}")
print(f"Random Forest - Accuracy: {rf_metrics[0]}, Precision: {rf_metrics[1]}, Recall: {rf_metrics[2]}, F1 Score: {rf_metrics[3]}, ROC AUC: {rf_metrics[4]}")
print(f"Gradient Boosting - Accuracy: {gb_metrics[0]}, Precision: {gb_metrics[1]}, Recall: {gb_metrics[2]}, F1 Score: {gb_metrics[3]}, ROC AUC: {gb_metrics[4]}")

Logistic Regression - Accuracy: 0.815471965933286, Precision: 0.677115987460815, Recall: 0.579088471849866, F1 Score: 0.624277456647399, ROC AUC: 0.7398338112145082
Random Forest - Accuracy: 0.7963094393186657, Precision: 0.6616541353383458, Recall: 0.4718498659517426, F1 Score: 0.5508607198748043, ROC AUC: 0.6924886395395778
Gradient Boosting - Accuracy: 0.8055358410220014, Precision: 0.671280276816609, Recall: 0.5201072386058981, F1 Score: 0.5861027190332326, ROC AUC: 0.7142041984535281


In [25]:
# Define the parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0]
}


In [28]:
# Initialize the GridSearchCV with GradientBoostingClassifier
grid_search = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42), param_grid=param_grid, 
                           cv=3, scoring='roc_auc', n_jobs=-1, verbose=2)

In [29]:
# Fit the GridSearchCV to the training data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [30]:
# Display the best parameters
print(f"Best parameters found: {grid_search.best_params_}")

Best parameters found: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.8}


In [31]:
# Evaluate the best estimator on the test set
best_model = grid_search.best_estimator_
best_metrics = evaluate_model(best_model, X_test, y_test)

In [32]:
# Print the evaluation metrics for the best model
print(f"Best Gradient Boosting - Accuracy: {best_metrics[0]}, Precision: {best_metrics[1]}, Recall: {best_metrics[2]}, F1 Score: {best_metrics[3]}, ROC AUC: {best_metrics[4]}")

Best Gradient Boosting - Accuracy: 0.8069552874378992, Precision: 0.684981684981685, Recall: 0.5013404825737265, F1 Score: 0.5789473684210527, ROC AUC: 0.7091644497810718
