In [56]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/codebasics/deep-learning-keras-tf-tutorial/master/14_imbalanced/Handling%20Imbalanced%20Data%20In%20Customer%20Churn%20Using%20ANN/Churn_Modelling.csv')
df.head()

In [5]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

It's look there isn't any empty values. So lets see what data types available

In [6]:
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

Basic describe of numerical features

In [7]:
df.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


about categorical features

In [8]:
df.describe(include='O')

Unnamed: 0,Surname,Geography,Gender
count,10000,10000,10000
unique,2932,3,2
top,Smith,France,Male
freq,32,5014,5457


Lets see which categories are available in each column

In [9]:
for column in df:
    if df[column].dtype == 'object':
        print(f'{column}: {df[column].unique()}')

Surname: ['Hargrave' 'Hill' 'Onio' ... 'Kashiwagi' 'Aldridge' 'Burbidge']
Geography: ['France' 'Spain' 'Germany']
Gender: ['Female' 'Male']


There is lot more unique surname values. So lets see count

In [10]:
df['Surname'].value_counts()

Smith       32
Scott       29
Martin      29
Walker      28
Brown       26
            ..
Izmailov     1
Bold         1
Bonham       1
Poninski     1
Burbidge     1
Name: Surname, Length: 2932, dtype: int64

We want to group all the surname which has less than five count 

In [11]:
df1 = df.copy()

In [12]:
surname_stats = df1['Surname'].value_counts()
surname_stats_lessthan_5 = surname_stats[surname_stats < 5]
len(surname_stats_lessthan_5)

2311

In [13]:
df1['Surname'] = df1['Surname'].apply(lambda x: 'Rare' if x in surname_stats_lessthan_5 else x)

In [14]:
df1.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Rare,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


We want to convert categorical features to numeric. So lets do that

In [15]:
df1['Gender'].replace({'Male': 1, 'Female':0}, inplace=True)

Lets get dummies for Surname and Geography columns

In [16]:
df2 = df1.copy()

In [17]:
df2 = pd.get_dummies(data=df2, columns=['Geography', 'Surname']) #, 'Surname'

In [18]:
df2.head()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,Surname_Yin,Surname_Yobachi,Surname_Yobachukwu,Surname_Yobanna,Surname_Young,Surname_Yu,Surname_Yuan,Surname_Zetticci,Surname_Zikoranachidimma,Surname_Zito
0,1,15634602,619,0,42,2,0.0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,15647311,608,0,41,1,83807.86,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,15619304,502,0,42,8,159660.8,3,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,15701354,699,0,39,1,0.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,15737888,850,0,43,2,125510.82,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df2.shape

(10000, 637)

Lets drop unwanted columns

In [20]:
df3 = df2.drop(['RowNumber', 'CustomerId'], axis=1)

In [21]:
df3.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,...,Surname_Yin,Surname_Yobachi,Surname_Yobachukwu,Surname_Yobanna,Surname_Young,Surname_Yu,Surname_Yuan,Surname_Zetticci,Surname_Zikoranachidimma,Surname_Zito
0,619,0,42,2,0.0,1,1,1,101348.88,1,...,0,0,0,0,0,0,0,0,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,...,0,0,0,0,0,0,0,0,0,0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,...,0,0,0,0,0,0,0,0,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,...,0,0,0,0,0,0,0,0,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,...,0,0,0,0,0,0,0,0,0,0


Now we want to scale the data

In [26]:
cols_to_scale = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
scaler = MinMaxScaler()

In [27]:
df3[cols_to_scale] = scaler.fit_transform(df3[cols_to_scale])

In [28]:
df3.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,...,Surname_Yin,Surname_Yobachi,Surname_Yobachukwu,Surname_Yobanna,Surname_Young,Surname_Yu,Surname_Yuan,Surname_Zetticci,Surname_Zikoranachidimma,Surname_Zito
0,0.538,0,0.324324,0.2,0.0,0.0,1,1,0.506735,1,...,0,0,0,0,0,0,0,0,0,0
1,0.516,0,0.310811,0.1,0.334031,0.0,0,1,0.562709,0,...,0,0,0,0,0,0,0,0,0,0
2,0.304,0,0.324324,0.8,0.636357,0.666667,1,0,0.569654,1,...,0,0,0,0,0,0,0,0,0,0
3,0.698,0,0.283784,0.1,0.0,0.333333,0,0,0.46912,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0,0.337838,0.2,0.500246,0.0,1,1,0.3954,0,...,0,0,0,0,0,0,0,0,0,0


Now we want to round float numbers to about 5 decimal points

In [29]:
for column in df3:
    df3[column] = round(df3[column], 5)

Split the data

In [30]:
X = df3.drop('Exited', axis='columns')
y = df3['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 634), (2000, 634), (8000,), (2000,))

Lets build the model

In [44]:
def ann(X_train, X_test, y_train, y_test):
    model = keras.Sequential([
        keras.layers.Dense(634, input_shape=(634,), activation='relu'),
        keras.layers.Dense(300, activation='relu'),
        keras.layers.Dense(50, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer = 'adam',
        loss= 'binary_crossentropy',
        metrics = ['accuracy']
    )

    model.fit(X_train, y_train, epochs=15)
    print('Accuracy :',model.evaluate(X_test, y_test))
    yp = model.predict(X_test)
    y_pred = []
    for element in yp:
        if element > 0.5:
            y_pred.append(1)
        else:
            y_pred.append(0)
    return y_pred

In [33]:
y_pred = ann(X_train, X_test, y_train, y_test)
y_pred



array([[2.4916690e-03],
       [2.0797856e-01],
       [8.3727700e-01],
       ...,
       [1.4375421e-01],
       [8.1462211e-05],
       [9.8429179e-01]], dtype=float32)

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.91      0.88      1600
           1       0.53      0.41      0.46       400

    accuracy                           0.81      2000
   macro avg       0.69      0.66      0.67      2000
weighted avg       0.79      0.81      0.80      2000



## __Imbalance data__

In [46]:
df3.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

01. __Under sampling__

In [47]:
count_class_0, count_class_1 = df3.Exited.value_counts()

df_class_0 = df3[df3['Exited'] == 0]
df_class_1 = df3[df3['Exited'] == 1]

In [48]:
df_class_0_under = df_class_0.sample(count_class_1)
df_y_under = pd.concat([df_class_0_under, df_class_1], axis=0)
df_y_under.Exited.value_counts()

0    2037
1    2037
Name: Exited, dtype: int64

In [None]:
X = df_y_under.drop('Exited', axis=1)
y = df_y_under['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

y_pred = ann(X_train, X_test, y_train, y_test)

In [50]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.75      0.71       408
           1       0.72      0.64      0.67       407

    accuracy                           0.69       815
   macro avg       0.69      0.69      0.69       815
weighted avg       0.69      0.69      0.69       815



02. __Over Sampling__

In [53]:
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_over = pd.concat([df_class_1_over, df_class_0], axis=0)

df_over.Exited.value_counts()

1    7963
0    7963
Name: Exited, dtype: int64

In [None]:
X = df_over.drop('Exited', axis=1)
y = df_over['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

y_pred = ann(X_train, X_test, y_train, y_test)

In [55]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.85      0.89      1593
           1       0.86      0.93      0.90      1593

    accuracy                           0.89      3186
   macro avg       0.89      0.89      0.89      3186
weighted avg       0.89      0.89      0.89      3186



3. __Oversample with SMOTE__

In [57]:
smote = SMOTE(sampling_strategy='minority')
x_sm, y_sm = smote.fit_resample(df3.drop('Exited', axis=1), df3['Exited'])

y_sm.value_counts()

1    7963
0    7963
Name: Exited, dtype: int64

In [58]:
X = x_sm
y = y_sm

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

y_pred = ann(X_train, X_test, y_train, y_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy : [0.6627492308616638, 0.8571876883506775]


In [59]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      1593
           1       0.86      0.86      0.86      1593

    accuracy                           0.86      3186
   macro avg       0.86      0.86      0.86      3186
weighted avg       0.86      0.86      0.86      3186



4. __Ensemble method__

In [60]:
df3.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [65]:
X = df3.drop(['Exited'], axis=1)
y = df3['Exited']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

df_train = X_train.copy()
df_train['Exited'] = y_train

df_train_class_0 = df_train[df_train['Exited'] == 0]
df_train_class_1 = df_train[df_train['Exited'] == 1]

In [69]:
def get_train_batch(df_majority, df_minority, start, end):
    df_train = pd.concat([df_majority[start:end], df_minority], axis=0)

    X_train = df_train.drop('Exited', axis=1)
    y_train = df_train['Exited']

    return X_train, y_train

In [74]:
X_train, y_train = get_train_batch(df_train_class_0, df_train_class_1, 0, 2036)
y_pred1 = ann(X_train, X_test, y_train, y_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy : [1.1210533380508423, 0.7434999942779541]


In [75]:
X_train, y_train = get_train_batch(df_train_class_0, df_train_class_1, 2036, 4072)
y_pred2 = ann(X_train, X_test, y_train, y_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy : [1.158438801765442, 0.7404999732971191]


In [76]:
X_train, y_train = get_train_batch(df_train_class_0, df_train_class_1, 4072, 6108)
y_pred3 = ann(X_train, X_test, y_train, y_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy : [1.3600783348083496, 0.7200000286102295]


In [78]:
X_train, y_train = get_train_batch(df_train_class_0, df_train_class_1, 6108, 7963)
y_pred4 = ann(X_train, X_test, y_train, y_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Accuracy : [3.8636674880981445, 0.35499998927116394]


In [79]:
y_pred_final = y_pred1.copy()
for i in range(len(y_pred1)):
    n_ones = y_pred1[i] + y_pred2[i] + y_pred3[i]+ y_pred4[i]
    if n_ones > 1:
        y_pred_final[i] = 1
    else:
        y_pred_final[i] = 0

In [80]:
print(classification_report(y_test, y_pred_final))

              precision    recall  f1-score   support

           0       0.92      0.62      0.74      1593
           1       0.34      0.77      0.48       407

    accuracy                           0.65      2000
   macro avg       0.63      0.70      0.61      2000
weighted avg       0.80      0.65      0.69      2000

