In [10]:
import numpy as np
import pandas as pd
df = pd.read_csv('Churn_Modelling.csv',index_col='RowNumber')

In [11]:
from sklearn.preprocessing import MinMaxScaler
df.drop(['CustomerId','Surname'],axis=1,inplace=True)
df['Gender'].replace({'Female':1,'Male':0},inplace=True)
df = pd.get_dummies(data= df, columns=['Geography'])
cols_to_scale = ['CreditScore','Age','Balance','Tenure','EstimatedSalary']
df[cols_to_scale] = MinMaxScaler().fit_transform(df[cols_to_scale])
df


Unnamed: 0_level_0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.538,1,0.324324,0.2,0.000000,1,1,1,0.506735,1,1,0,0
2,0.516,1,0.310811,0.1,0.334031,1,0,1,0.562709,0,0,0,1
3,0.304,1,0.324324,0.8,0.636357,3,1,0,0.569654,1,1,0,0
4,0.698,1,0.283784,0.1,0.000000,2,0,0,0.469120,0,1,0,0
5,1.000,1,0.337838,0.2,0.500246,1,1,1,0.395400,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,0.842,0,0.283784,0.5,0.000000,2,1,0,0.481341,0,1,0,0
9997,0.332,0,0.229730,1.0,0.228657,1,1,1,0.508490,0,1,0,0
9998,0.718,1,0.243243,0.7,0.000000,1,0,1,0.210390,1,1,0,0
9999,0.844,0,0.324324,0.3,0.299226,2,1,0,0.464429,1,0,1,0


In [12]:
df.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

### Method 1: Undersampling

In [14]:
count_class_0 , count_class_1 = df.Exited.value_counts()
df_class_0 = df[df.Exited == 0] 
df_class_1 = df[df.Exited == 1]
df_class_0 , df_class_1

(           CreditScore  Gender       Age  Tenure   Balance  NumOfProducts  \
 RowNumber                                                                   
 2                0.516       1  0.310811     0.1  0.334031              1   
 4                0.698       1  0.283784     0.1  0.000000              2   
 5                1.000       1  0.337838     0.2  0.500246              1   
 7                0.944       0  0.432432     0.7  0.000000              2   
 9                0.302       0  0.351351     0.4  0.566170              2   
 ...                ...     ...       ...     ...       ...            ...   
 9994             0.588       0  0.135135     0.7  0.618021              1   
 9995             0.900       1  0.148649     0.2  0.000000              2   
 9996             0.842       0  0.283784     0.5  0.000000              2   
 9997             0.332       0  0.229730     1.0  0.228657              1   
 10000            0.884       1  0.135135     0.4  0.518708     

In [44]:
df_class_0_sample = df_class_0.sample(count_class_1)
df_test = pd.concat([df_class_0_sample,df_class_1],axis =0)
df_test.Exited.value_counts() 

0    2037
1    2037
Name: Exited, dtype: int64

In [42]:
X = df_test.drop(columns=['Exited'],axis=0)
y = df_test.Exited

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2,random_state=15,stratify=y)

In [21]:
y_train.value_counts()

1    1630
0    1629
Name: Exited, dtype: int64

In [23]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X_train,y_train)

In [27]:
y_pred = model.predict(X_test)

In [32]:
from sklearn.metrics import  classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.75      0.71      0.73       435
           1       0.69      0.74      0.71       380

    accuracy                           0.72       815
   macro avg       0.72      0.72      0.72       815
weighted avg       0.72      0.72      0.72       815



### Method2: Oversampling

In [36]:
df_class_1_sample = df_class_1.sample(count_class_0,replace=True)
df_test = pd.concat([df_class_0,df_class_1_sample],axis=0)
df_test.Exited.value_counts()

0    7963
1    7963
Name: Exited, dtype: int64

In [37]:
X = df_test.drop(columns=['Exited'],axis=0)
y = df_test.Exited

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2,random_state=15,stratify=y)

In [38]:
y_train.value_counts()

0    6370
1    6370
Name: Exited, dtype: int64

In [39]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X_train,y_train)

In [40]:
y_pred = model.predict(X_test)

In [41]:
from sklearn.metrics import  classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.72      0.70      0.71      1635
           1       0.69      0.71      0.70      1551

    accuracy                           0.70      3186
   macro avg       0.70      0.70      0.70      3186
weighted avg       0.70      0.70      0.70      3186



### Method3: SMOTE

In [89]:
from imblearn.over_sampling import SMOTE
X = df.drop(columns=['Exited'],axis=0)
y = df.Exited

smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X,y)

X_sm.shape, y_sm.shape

((15926, 12), (15926,))

In [91]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_sm, y_sm, test_size=0.2,random_state=15,stratify=y_sm)

In [92]:
y_train.value_counts()

0    6370
1    6370
Name: Exited, dtype: int64

In [93]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X_train,y_train)

In [94]:
y_pred = model.predict(X_test)

In [96]:
from sklearn.metrics import  classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.72      0.71      0.71      1617
           1       0.70      0.72      0.71      1569

    accuracy                           0.71      3186
   macro avg       0.71      0.71      0.71      3186
weighted avg       0.71      0.71      0.71      3186



### Method4: Use of Ensemble with undersampling

In [151]:
def Ensembeling(df,model_func = LogisticRegression()):
    X = df.drop(columns=['Exited'],axis=0)
    y = df.Exited
    X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2,random_state=15,stratify=y)

    n_cuts = np.ceil(y_train.value_counts()[0]/ y_train.value_counts()[1])
    slices = np.ceil(np.linspace(0,int(y_train.value_counts()[0]),num=int(n_cuts)))

    df_class0 = df[df.Exited==0]
    df_class1 = df[df.Exited==1]
    final_pred = np.empty([X_test.shape[0]])
    print(final_pred.shape[0])
    for i in range(0,len(slices)-1):
        df_train = pd.concat([df_class0[int(slices[i]):int(slices[i+1])], df_class1], axis=0)
        X_train = df_train.drop(columns=['Exited'],axis=0)
        y_train = df_train.Exited

        model = model_func.fit(X_train,y_train)
        y_pred = model.predict(X_test)
 
        final_pred += y_pred 

    for i in range(len(final_pred)):
        if final_pred[i]>1:
            final_pred[i] = 1
        else:
            final_pred[i] = 0
    return final_pred
    

In [153]:
y_pred = Ensembeling(df)
print(classification_report(y_pred,y_test))

2000
              precision    recall  f1-score   support

         0.0       0.73      0.89      0.80      1305
         1.0       0.64      0.37      0.47       695

    accuracy                           0.71      2000
   macro avg       0.68      0.63      0.63      2000
weighted avg       0.70      0.71      0.68      2000

