In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Load the data
data = pd.read_csv('/home/aditya/Desktop/codsoft/task-3 chrun/Churn_Modelling.csv')

# Display the first few rows of the data
data.head()


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
# Check for missing values
data.isnull().sum()

# Encode categorical variables
label_encoder = LabelEncoder()
data['Geography'] = label_encoder.fit_transform(data['Geography'])
data['Gender'] = label_encoder.fit_transform(data['Gender'])

# Feature scaling
scaler = StandardScaler()
data[['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']] = scaler.fit_transform(
    data[['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']])

# Drop unnecessary columns
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Define feature matrix X and target vector y
X = data.drop(columns=['Exited'])
y = data['Exited']


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Gradient Boosting
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)


In [8]:
# Function to evaluate models
def evaluate_model(y_test, y_pred):
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    roc_auc = roc_auc_score(y_test, y_pred)
    print("ROC AUC Score:", roc_auc)
    return roc_auc

# Evaluate Logistic Regression
print("Logistic Regression:")
roc_auc_log_reg = evaluate_model(y_test, y_pred_log_reg)

# Evaluate Random Forest
print("\nRandom Forest:")
roc_auc_rf = evaluate_model(y_test, y_pred_rf)

# Evaluate Gradient Boosting
print("\nGradient Boosting:")
roc_auc_gb = evaluate_model(y_test, y_pred_gb)


Logistic Regression:
Accuracy: 0.815
Confusion Matrix:
 [[1559   48]
 [ 322   71]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.97      0.89      1607
           1       0.60      0.18      0.28       393

    accuracy                           0.81      2000
   macro avg       0.71      0.58      0.59      2000
weighted avg       0.78      0.81      0.77      2000

ROC AUC Score: 0.5753961279453282

Random Forest:
Accuracy: 0.8665
Confusion Matrix:
 [[1553   54]
 [ 213  180]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.97      0.92      1607
           1       0.77      0.46      0.57       393

    accuracy                           0.87      2000
   macro avg       0.82      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000

ROC AUC Score: 0.7122061401217002

Gradient Boosting:
Accuracy: 0.8655
Confusion Matrix:
 [[1547   60]

In [9]:
# Let's assume Gradient Boosting has the highest ROC AUC score
# Hyperparameter tuning for Gradient Boosting
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Instantiate the grid search
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=3, scoring='roc_auc', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

# Train the optimized model
best_gb = grid_search.best_estimator_
best_gb.fit(X_train, y_train)
y_pred_best_gb = best_gb.predict(X_test)

# Evaluate the optimized model
print("\nOptimized Gradient Boosting:")
roc_auc_best_gb = evaluate_model(y_test, y_pred_best_gb)


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.5s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   1.5s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   2.6s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   3.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   3.3s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=100; total time=   2.1s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   4.5s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=100; total time=   2.0s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=   5.0s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=100; total time=   1.9s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=   4.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_es

In [10]:
# Predicting the Test set results
y_pred_final = best_gb.predict(X_test)

# Evaluate the final model
print("\nFinal Model Evaluation:")
evaluate_model(y_test, y_pred_final)



Final Model Evaluation:
Accuracy: 0.866
Confusion Matrix:
 [[1547   60]
 [ 208  185]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.47      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.72      0.75      2000
weighted avg       0.86      0.87      0.85      2000

ROC AUC Score: 0.7167006306695738


0.7167006306695738