In [3]:
import numpy as np 
import pandas as pd

In [29]:
churn_data = pd.read_csv("Data\Churn_Modelling.csv")
churn_data.head()

Unnamed: 0,CustomerId,CredRate,Geography,Gender,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary,Exited
0,15634602,619,France,Female,42.0,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42.0,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39.0,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0


In [30]:
churn_data.columns

Index(['CustomerId', 'CredRate', 'Geography', 'Gender', 'Age', 'Tenure',
       'Balance', 'Prod Number', 'HasCrCard', 'ActMem', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [31]:
churn_df = churn_data.drop(['CustomerId'],axis=1)
churn_df.head()

Unnamed: 0,CredRate,Geography,Gender,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary,Exited
0,619,France,Female,42.0,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42.0,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39.0,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0


## Divide Data to Features and Labels

In [32]:
X = churn_df.drop(['Exited'],axis=1)
y = churn_df['Exited']

In [33]:
X.head()

Unnamed: 0,CredRate,Geography,Gender,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary
0,619,France,Female,42.0,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41.0,1,83807.86,1,0,1,112542.58
2,502,France,Female,42.0,8,159660.8,3,1,0,113931.57
3,699,France,Female,39.0,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43.0,2,125510.82,1,1,1,79084.1


In [34]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

## Convert Categorical Data to Numerical

In [35]:
numerical = churn_df.drop(['Geography','Gender'],axis=1)
numerical.head()

Unnamed: 0,CredRate,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary,Exited
0,619,42.0,2,0.0,1,1,1,101348.88,1
1,608,41.0,1,83807.86,1,0,1,112542.58,0
2,502,42.0,8,159660.8,3,1,0,113931.57,1
3,699,39.0,1,0.0,2,0,0,93826.63,0
4,850,43.0,2,125510.82,1,1,1,79084.1,0


In [36]:
categorical = X.filter(['Geography','Gender'])
categorical.head()

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female


In [37]:
cat_numerical = pd.get_dummies(categorical,drop_first=True)

In [38]:
X = pd.concat([numerical,cat_numerical],axis=1)
X.head()

Unnamed: 0,CredRate,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42.0,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41.0,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42.0,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39.0,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43.0,2,125510.82,1,1,1,79084.1,0,0,1,0


In [40]:
X.Age = X.Age.fillna(X.Age.mean())
X.EstimatedSalary = X.EstimatedSalary.fillna(X.EstimatedSalary.mean())
X.isna().sum()

CredRate             0
Age                  0
Tenure               0
Balance              0
Prod Number          0
HasCrCard            0
ActMem               0
EstimatedSalary      0
Exited               0
Geography_Germany    0
Geography_Spain      0
Gender_Male          0
dtype: int64

In [41]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [42]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
classifier = log_clf.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

array([[-1.93106065, -0.55881318,  1.72166414, ..., -0.5857672 ,
        -0.57283404, -1.08872158],
       [ 1.46576299, -0.46285679,  0.68323815, ..., -0.5857672 ,
        -0.57283404,  0.91850847],
       [-0.89227666, -1.23050796,  0.68323815, ..., -0.5857672 ,
        -0.57283404,  0.91850847],
       ...,
       [-0.12357651,  0.88053277,  1.72166414, ..., -0.5857672 ,
        -0.57283404, -1.08872158],
       [-0.88188882,  0.11288159, -1.04747184, ..., -0.5857672 ,
        -0.57283404, -1.08872158],
       [-0.37288467, -1.32646436,  1.72166414, ..., -0.5857672 ,
         1.74570632, -1.08872158]])

### Which accuracy matrix to use ?

The choice of using a metric for classification problems depends totally upon you. However, as a rule of thumb, in the case of balanced datasets, i.e., where the number of labels for each class is balanced, accuracy can be used as an evaluation metric. For imbalanced datasets, you can use the F1 measure as the classification metric.

In [45]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2021
           1       1.00      1.00      1.00       479

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500

[[2021    0]
 [   0  479]]
1.0


## The KNN classifier

In [49]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5)
classifier = knn_clf.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2021
           1       1.00      1.00      1.00       479

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500

[[2021    0]
 [   0  479]]
1.0
