In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score

In [4]:
#load the dataset
df = pd.read_csv('Churn_Modelling.csv')
print(df.head())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [6]:
print(df.dtypes)

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object


In [8]:
# check if any column contains null values
print(df.isnull().sum())

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [11]:
# As there is no need to drop any columns we direct go to convert categorical columns to numerical
labelEncoder = LabelEncoder()

categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

for column in categorical_columns:
  df[column] = labelEncoder.fit_transform(df[column])

print(df.head())

   RowNumber  CustomerId  Surname  CreditScore  Geography  Gender  Age  \
0          1    15634602     1115          619          0       0   42   
1          2    15647311     1177          608          2       0   41   
2          3    15619304     2040          502          0       0   42   
3          4    15701354      289          699          0       0   39   
4          5    15737888     1822          850          2       0   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  surname  
0        101348.88       1     1115  
1        112542.58       0     1177  
2        113931.57       1     2040  
3     

In [12]:
# printing datatypes of columns after preprocessing
print(df.dtypes)

RowNumber            int64
CustomerId           int64
Surname              int64
CreditScore          int64
Geography            int64
Gender               int64
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
surname              int64
dtype: object


In [14]:
# We'll use logistice regression over random forest because it provides proper baseline and interpretabilty

# Assigning feature variables to x and target variable to y
x = df.drop(['Exited'], axis=1)
y = df['Exited']

# spillting dataset in training and test sets, training set : 80% and test set : 20%
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)



(8000, 14)
(2000, 14)
(8000,)
(2000,)


In [25]:
# Initializing logistic regression model
model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# training the model
model.fit(x_train, y_train)

# predicting the test set results
y_pred = model.predict(x_test)

# calculating y_prob for computing ROC-AUC score
y_prob = model.predict_proba(x_test)[:, 1]

# Evaluate the model using a confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

#Generating a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Calculating ROC-AUC score
print("\nROC-AUC Score:")
print(roc_auc_score(y_test, y_prob))
print()

# Calculating accuracy
model_accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {model_accuracy:.4f}")
print()

Confusion Matrix:
[[1607    0]
 [ 393    0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89      1607
           1       0.00      0.00      0.00       393

    accuracy                           0.80      2000
   macro avg       0.40      0.50      0.45      2000
weighted avg       0.65      0.80      0.72      2000


ROC-AUC Score:
0.5829394617378486

Model Accuracy: 0.8035



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
