In [1]:
import numpy as np 
import pandas as pd

In [2]:
churn_data = pd.read_csv("Data\Churn_Modelling.csv")
churn_data.head()

Unnamed: 0,CustomerId,CredRate,Geography,Gender,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary,Exited
0,15634602,619,France,Female,42.0,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42.0,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39.0,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0


In [3]:
churn_data.columns

Index(['CustomerId', 'CredRate', 'Geography', 'Gender', 'Age', 'Tenure',
       'Balance', 'Prod Number', 'HasCrCard', 'ActMem', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [4]:
churn_df = churn_data.drop(['CustomerId'],axis=1)
churn_df.head()

Unnamed: 0,CredRate,Geography,Gender,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary,Exited
0,619,France,Female,42.0,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42.0,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39.0,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0


## Divide Data to Features and Labels

In [5]:
X = churn_df.drop(['Exited'],axis=1)
y = churn_df['Exited']

In [6]:
X.head()

Unnamed: 0,CredRate,Geography,Gender,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary
0,619,France,Female,42.0,2,0.0,1,1,1,101348.88
1,608,Spain,Female,41.0,1,83807.86,1,0,1,112542.58
2,502,France,Female,42.0,8,159660.8,3,1,0,113931.57
3,699,France,Female,39.0,1,0.0,2,0,0,93826.63
4,850,Spain,Female,43.0,2,125510.82,1,1,1,79084.1


In [7]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Exited, dtype: int64

## Convert Categorical Data to Numerical

In [8]:
numerical = churn_df.drop(['Geography','Gender'],axis=1)
numerical.head()

Unnamed: 0,CredRate,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary,Exited
0,619,42.0,2,0.0,1,1,1,101348.88,1
1,608,41.0,1,83807.86,1,0,1,112542.58,0
2,502,42.0,8,159660.8,3,1,0,113931.57,1
3,699,39.0,1,0.0,2,0,0,93826.63,0
4,850,43.0,2,125510.82,1,1,1,79084.1,0


In [9]:
categorical = X.filter(['Geography','Gender'])
categorical.head()

Unnamed: 0,Geography,Gender
0,France,Female
1,Spain,Female
2,France,Female
3,France,Female
4,Spain,Female


In [10]:
cat_numerical = pd.get_dummies(categorical,drop_first=True)

In [11]:
X = pd.concat([numerical,cat_numerical],axis=1)
X.head()

Unnamed: 0,CredRate,Age,Tenure,Balance,Prod Number,HasCrCard,ActMem,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42.0,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41.0,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42.0,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39.0,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43.0,2,125510.82,1,1,1,79084.1,0,0,1,0


In [12]:
X.Age = X.Age.fillna(X.Age.mean())
X.EstimatedSalary = X.EstimatedSalary.fillna(X.EstimatedSalary.mean())
X.isna().sum()

CredRate             0
Age                  0
Tenure               0
Balance              0
Prod Number          0
HasCrCard            0
ActMem               0
EstimatedSalary      0
Exited               0
Geography_Germany    0
Geography_Spain      0
Gender_Male          0
dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [14]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

## Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
classifier = log_clf.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

### Which accuracy matrix to use ?

The choice of using a metric for classification problems depends totally upon you. However, as a rule of thumb, in the case of balanced datasets, i.e., where the number of labels for each class is balanced, accuracy can be used as an evaluation metric. For imbalanced datasets, you can use the F1 measure as the classification metric.

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1997
           1       1.00      1.00      1.00       503

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500

[[1997    0]
 [   0  503]]
1.0


## The KNN classifier

In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5)
classifier = knn_clf.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1997
           1       1.00      1.00      1.00       503

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500

[[1997    0]
 [   1  502]]
0.9996


### Performing Cross Validation

In [20]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(classifier,X,y, cv=5,scoring="accuracy"))

[0.7675 0.7655 0.767  0.7645 0.766 ]


# Solving Multiclass Classificaitons

In [22]:
from sklearn.datasets import make_classification

# create a dummy dataset

X,y = make_classification(n_samples=2000,n_features=12,n_informative=8,n_redundant=4,n_classes=4,random_state=2)

print(X.shape,y.shape)

(2000, 12) (2000,)


In [23]:
np.unique(y)

array([0, 1, 2, 3])

In [24]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42,n_estimators=500)
classifier = rf_clf.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86       120
           1       0.75      0.83      0.79       113
           2       0.88      0.74      0.80       137
           3       0.82      0.82      0.82       130

    accuracy                           0.82       500
   macro avg       0.82      0.82      0.82       500
weighted avg       0.82      0.82      0.82       500

[[108   7   3   2]
 [  9  94   3   7]
 [ 11  11 101  14]
 [  3  13   8 106]]
0.818


## Using the OneVsRest Approach

In [28]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

lg_clf = LogisticRegression()

clf = OneVsRestClassifier(lg_clf)

classifier = clf.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.69      0.71      0.70       120
           1       0.54      0.65      0.59       113
           2       0.67      0.57      0.61       137
           3       0.68      0.63      0.65       130

    accuracy                           0.64       500
   macro avg       0.64      0.64      0.64       500
weighted avg       0.64      0.64      0.64       500

[[85 10 19  6]
 [13 74 10 16]
 [18 24 78 17]
 [ 8 30 10 82]]
0.638


## The one vs one Approach

In [30]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression

lg_clf = LogisticRegression()

clf = OneVsOneClassifier(lg_clf)

classifier = clf.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.71      0.71      0.71       120
           1       0.59      0.76      0.66       113
           2       0.73      0.60      0.66       137
           3       0.71      0.66      0.69       130

    accuracy                           0.68       500
   macro avg       0.68      0.68      0.68       500
weighted avg       0.69      0.68      0.68       500

[[85 13 16  6]
 [ 9 86  6 12]
 [18 20 82 17]
 [ 7 28  9 86]]
0.678


## Multilabel Classification Problems

In [31]:
from sklearn.datasets import make_multilabel_classification

X,y = make_multilabel_classification(n_samples=2000,n_features=10,n_classes=5,n_labels=3,random_state=42)

print(X.shape,y.shape)

(2000, 10) (2000, 5)


In [32]:
y[200],y[1000]

(array([0, 1, 0, 1, 1]), array([1, 1, 1, 0, 0]))

In [33]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [35]:
#Random Forest Classifier can be used for multiclass and multilabel both classifications

from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=42,n_estimators=500)
classifier = rf_clf.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.71      0.76       240
           1       0.86      0.93      0.90       367
           2       0.81      0.92      0.86       306
           3       0.74      0.90      0.81       301
           4       0.69      0.37      0.48       129

   micro avg       0.80      0.83      0.81      1343
   macro avg       0.78      0.77      0.76      1343
weighted avg       0.80      0.83      0.80      1343
 samples avg       0.82      0.82      0.80      1343

0.36


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
# You can also use one vs Rest for this purpose.

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

lg_clf = LogisticRegression()

clf = OneVsRestClassifier(lg_clf)

classifier = clf.fit(X_train,y_train)

y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.70      0.75       240
           1       0.87      0.89      0.88       367
           2       0.79      0.84      0.81       306
           3       0.72      0.79      0.76       301
           4       0.70      0.40      0.51       129

   micro avg       0.79      0.78      0.78      1343
   macro avg       0.77      0.73      0.74      1343
weighted avg       0.79      0.78      0.78      1343
 samples avg       0.80      0.79      0.76      1343

0.264


  _warn_prf(average, modifier, msg_start, len(result))
