dataset link: https://www.kaggle.com/datasets/aakash50897/churn-modellingcsv?resource=download

# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Reading the dataset

In [None]:
# Load the churn data from the CSV file sitting next to this notebook.
modelingDataset = pd.read_csv(filepath_or_buffer='Churn-Modelling.csv')

In [None]:
# Preview the first rows (head() shows five rows by default).
modelingDataset.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [None]:
# Summarize the frame: column names, non-null counts and dtypes.
# Used here to spot missing values and the text (object) columns that need encoding.
modelingDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


# Encoding String Values To Integers

In [None]:
# Replace the two text columns with integer codes, overwriting them in place.
# A single LabelEncoder is refit for each column, so after this cell `encoder`
# only remembers the mapping of the last column processed ('Gender').
encoder = LabelEncoder()
for text_column in ('Geography', 'Gender'):
    modelingDataset[text_column] = encoder.fit_transform(modelingDataset[text_column])

In [None]:
# Print the column summary again to confirm that, after the encoding step,
# no object-typed (text) columns remain.
modelingDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  int64  
 2   Gender           10000 non-null  int64  
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9)
memory usage: 859.5 KB


# Splitting the dataset

In [None]:
# Separate features from the target by column name instead of by position, so
# the selection keeps working if columns are ever added or reordered.
# 'Exited' is the churn label; every other column is used as a feature.
X = modelingDataset.drop(columns='Exited')
Y = modelingDataset['Exited']

# Hold out 25% of the rows for evaluation.
# - random_state pins the shuffle so the split (and every metric computed from
#   it) is identical after Restart & Run All.
# - stratify=Y keeps the churn rate the same in the training and test splits,
#   which matters because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, stratify=Y
)

# Standardization of data

In [None]:
# Standardize the features.
# The scaler learns its mean / standard deviation from the training split only;
# the test split is then transformed with those same training statistics.
# Re-fitting on the test split (fit_transform) would put the two splits on
# different scales and leak test-set information into preprocessing.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# K-Nearest-Neighbors

Experiment 1 (n_neighbors=3)

In [None]:
# Experiment 1: k-nearest-neighbours with k=3.
# Fit on the training split, then predict the held-out split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Accuracy, confusion matrix and per-class report, each on its own line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.822
[[1843  136]
 [ 309  212]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      1979
           1       0.61      0.41      0.49       521

    accuracy                           0.82      2500
   macro avg       0.73      0.67      0.69      2500
weighted avg       0.80      0.82      0.81      2500



Experiment 2 (n_neighbors=7)

In [None]:
# Experiment 2: k-nearest-neighbours with k=7.
# Fit on the training split, then predict the held-out split.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Accuracy, confusion matrix and per-class report, each on its own line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.828
[[1884   95]
 [ 335  186]]
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      1979
           1       0.66      0.36      0.46       521

    accuracy                           0.83      2500
   macro avg       0.76      0.65      0.68      2500
weighted avg       0.81      0.83      0.81      2500



Experiment 3 (n_neighbors=11)

In [None]:
# Experiment 3: k-nearest-neighbours with k=11.
# Fit on the training split, then predict the held-out split.
knn = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Accuracy, confusion matrix and per-class report, each on its own line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.8316
[[1915   64]
 [ 357  164]]
              precision    recall  f1-score   support

           0       0.84      0.97      0.90      1979
           1       0.72      0.31      0.44       521

    accuracy                           0.83      2500
   macro avg       0.78      0.64      0.67      2500
weighted avg       0.82      0.83      0.80      2500



# Decision-Tree-Classifier

Experiment 1 (max_depth=3)

In [None]:
# Experiment 1: decision tree limited to depth 3.
# random_state is fixed because the tree permutes candidate features at every
# split; without a seed, ties can be broken differently from run to run.
dtc = DecisionTreeClassifier(max_depth=3, random_state=42)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8432
[[1881   98]
 [ 294  227]]
              precision    recall  f1-score   support

           0       0.86      0.95      0.91      1979
           1       0.70      0.44      0.54       521

    accuracy                           0.84      2500
   macro avg       0.78      0.69      0.72      2500
weighted avg       0.83      0.84      0.83      2500



Experiment 2 (max_depth=7)

In [None]:
# Experiment 2: decision tree limited to depth 7.
# random_state is fixed because the tree permutes candidate features at every
# split; without a seed, ties can be broken differently from run to run.
dtc = DecisionTreeClassifier(max_depth=7, random_state=42)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8472
[[1908   71]
 [ 311  210]]
              precision    recall  f1-score   support

           0       0.86      0.96      0.91      1979
           1       0.75      0.40      0.52       521

    accuracy                           0.85      2500
   macro avg       0.80      0.68      0.72      2500
weighted avg       0.84      0.85      0.83      2500



Experiment 3 (max_depth=11)

In [None]:
# Experiment 3: decision tree limited to depth 11.
# random_state is fixed because the tree permutes candidate features at every
# split; without a seed, ties can be broken differently from run to run.
dtc = DecisionTreeClassifier(max_depth=11, random_state=42)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8272
[[1834  145]
 [ 287  234]]
              precision    recall  f1-score   support

           0       0.86      0.93      0.89      1979
           1       0.62      0.45      0.52       521

    accuracy                           0.83      2500
   macro avg       0.74      0.69      0.71      2500
weighted avg       0.81      0.83      0.82      2500



# Random-Forest-Classifier

Experiment 1 (max_depth=3)

In [None]:
# Experiment 1: random forest with trees limited to depth 3.
# The forest bootstraps rows and samples features at random, so random_state
# is fixed to make the reported metrics repeatable.
rfc = RandomForestClassifier(max_depth=3, random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8176
[[1978    1]
 [ 455   66]]
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      1979
           1       0.99      0.13      0.22       521

    accuracy                           0.82      2500
   macro avg       0.90      0.56      0.56      2500
weighted avg       0.85      0.82      0.76      2500



Experiment 2 (max_depth=7)

In [None]:
# Experiment 2: random forest with trees limited to depth 7.
# The forest bootstraps rows and samples features at random, so random_state
# is fixed to make the reported metrics repeatable.
rfc = RandomForestClassifier(max_depth=7, random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8552
[[1928   51]
 [ 311  210]]
              precision    recall  f1-score   support

           0       0.86      0.97      0.91      1979
           1       0.80      0.40      0.54       521

    accuracy                           0.86      2500
   macro avg       0.83      0.69      0.73      2500
weighted avg       0.85      0.86      0.84      2500



Experiment 3 (max_depth=11)

In [None]:
# Experiment 3: random forest with trees limited to depth 11.
# The forest bootstraps rows and samples features at random, so random_state
# is fixed to make the reported metrics repeatable.
rfc = RandomForestClassifier(max_depth=11, random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.8596
[[1920   59]
 [ 292  229]]
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      1979
           1       0.80      0.44      0.57       521

    accuracy                           0.86      2500
   macro avg       0.83      0.70      0.74      2500
weighted avg       0.85      0.86      0.84      2500



# Support-Vector-Machines

Experiment 1 (kernel='linear')

In [None]:
# Experiment 1: support-vector classifier with a linear kernel.
svc = svm.SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# This model can end up predicting no churners at all, which leaves the
# precision of class 1 undefined (0/0). zero_division=0 reports it as 0
# explicitly instead of emitting an UndefinedMetricWarning for every average.
print(classification_report(y_test, y_pred, zero_division=0))

0.7916
[[1979    0]
 [ 521    0]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1979
           1       0.00      0.00      0.00       521

    accuracy                           0.79      2500
   macro avg       0.40      0.50      0.44      2500
weighted avg       0.63      0.79      0.70      2500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Experiment 2 (kernel='rbf')

In [None]:
# Experiment 2: support-vector classifier with an RBF kernel.
# Fit on the training split, then predict the held-out split.
svc = svm.SVC(kernel='rbf').fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Accuracy, confusion matrix and per-class report, each on its own line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.852
[[1938   41]
 [ 329  192]]
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      1979
           1       0.82      0.37      0.51       521

    accuracy                           0.85      2500
   macro avg       0.84      0.67      0.71      2500
weighted avg       0.85      0.85      0.83      2500



Experiment 3 (kernel='poly')

In [None]:
# Experiment 3: support-vector classifier with a polynomial kernel.
# Fit on the training split, then predict the held-out split.
svc = svm.SVC(kernel='poly').fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Accuracy, confusion matrix and per-class report, each on its own line.
print(
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.8488
[[1954   25]
 [ 353  168]]
              precision    recall  f1-score   support

           0       0.85      0.99      0.91      1979
           1       0.87      0.32      0.47       521

    accuracy                           0.85      2500
   macro avg       0.86      0.65      0.69      2500
weighted avg       0.85      0.85      0.82      2500



# Stochastic Gradient Descent

Experiment 1 (penalty="l2")

In [None]:
# Experiment 1: linear classifier trained with stochastic gradient descent, L2 penalty.
# SGD shuffles the training data each epoch, so random_state is fixed to make
# the fitted model and its metrics repeatable.
sgd = SGDClassifier(penalty="l2", random_state=42)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# If the model predicts no churners, class-1 precision is undefined (0/0);
# zero_division=0 reports it as 0 explicitly rather than raising a warning.
print(classification_report(y_test, y_pred, zero_division=0))

0.7924
[[1976    3]
 [ 516    5]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1979
           1       0.62      0.01      0.02       521

    accuracy                           0.79      2500
   macro avg       0.71      0.50      0.45      2500
weighted avg       0.76      0.79      0.70      2500



Experiment 2 (penalty="l1")

In [None]:
# Experiment 2: linear classifier trained with stochastic gradient descent, L1 penalty.
# SGD shuffles the training data each epoch, so random_state is fixed to make
# the fitted model and its metrics repeatable.
sgd = SGDClassifier(penalty="l1", random_state=42)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# If the model predicts no churners, class-1 precision is undefined (0/0);
# zero_division=0 reports it as 0 explicitly rather than raising a warning.
print(classification_report(y_test, y_pred, zero_division=0))

0.7916
[[1979    0]
 [ 521    0]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1979
           1       0.00      0.00      0.00       521

    accuracy                           0.79      2500
   macro avg       0.40      0.50      0.44      2500
weighted avg       0.63      0.79      0.70      2500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Experiment 3 (penalty="elasticnet")

In [None]:
# Experiment 3: linear classifier trained with stochastic gradient descent, elastic-net penalty.
# SGD shuffles the training data each epoch, so random_state is fixed to make
# the fitted model and its metrics repeatable.
sgd = SGDClassifier(penalty="elasticnet", random_state=42)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

# Evaluate on the held-out split.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# If the model predicts no churners, class-1 precision is undefined (0/0);
# zero_division=0 reports it as 0 explicitly rather than raising a warning.
print(classification_report(y_test, y_pred, zero_division=0))

0.7916
[[1979    0]
 [ 521    0]]
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1979
           1       0.00      0.00      0.00       521

    accuracy                           0.79      2500
   macro avg       0.40      0.50      0.44      2500
weighted avg       0.63      0.79      0.70      2500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
