##Reading the data

In [30]:
import pandas as pd
import numpy as np

df = pd.read_csv("Customer_Churn.csv")
df.head()

Unnamed: 0,CustomerId,Lastname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Churned
0,15729836,Robinson,646,Spain,Male,32,1,0.0,2,1,0,183289.22,0
1,15708610,Costa,690,Germany,Male,44,9,100368.63,2,0,0,35342.33,0
2,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
3,15594133,Erskine,697,Spain,Male,62,7,0.0,1,1,0,129188.18,1
4,15726747,Donaldson,714,France,Male,63,4,138082.16,1,0,1,166677.54,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       9000 non-null   int64  
 1   Lastname         9000 non-null   object 
 2   CreditScore      9000 non-null   int64  
 3   Geography        9000 non-null   object 
 4   Gender           9000 non-null   object 
 5   Age              9000 non-null   int64  
 6   Tenure           9000 non-null   int64  
 7   Balance          9000 non-null   float64
 8   NumOfProducts    9000 non-null   int64  
 9   HasCrCard        9000 non-null   int64  
 10  IsActiveMember   9000 non-null   int64  
 11  EstimatedSalary  9000 non-null   float64
 12  Churned          9000 non-null   int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 914.2+ KB


In [4]:
df.isna().sum()

CustomerId         0
Lastname           0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Churned            0
dtype: int64

In [5]:
df.nunique()

CustomerId         9000
Lastname           2798
CreditScore         456
Geography             3
Gender                2
Age                  70
Tenure               11
Balance            5738
NumOfProducts         4
HasCrCard             2
IsActiveMember        2
EstimatedSalary    8999
Churned               2
dtype: int64

## Dropping unnecessary variables


In [6]:
df.drop(columns=["CustomerId", "Lastname"], axis=1, inplace=True)


In [7]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Churned
0,646,Spain,Male,32,1,0.0,2,1,0,183289.22,0
1,690,Germany,Male,44,9,100368.63,2,0,0,35342.33,0
2,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
3,697,Spain,Male,62,7,0.0,1,1,0,129188.18,1
4,714,France,Male,63,4,138082.16,1,0,1,166677.54,0


In [8]:
df["Churned"].value_counts()

Churned
0    7178
1    1822
Name: count, dtype: int64

## Dummy encoding

* For Gender and Geography categorical variables

In [9]:
df = pd.get_dummies(df, drop_first = True)
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Churned,Geography_Germany,Geography_Spain,Gender_Male
0,646,32,1,0.0,2,1,0,183289.22,0,False,True,True
1,690,44,9,100368.63,2,0,0,35342.33,0,True,False,True
2,772,42,3,75075.31,2,1,0,92888.52,1,True,False,True
3,697,62,7,0.0,1,1,0,129188.18,1,False,True,True
4,714,63,4,138082.16,1,0,1,166677.54,0,False,False,True


## Standardizing the variables

* MinMaxScaler

In [10]:
X = df.drop(["Churned"], axis=1)
y = df.Churned

In [11]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

sc = MinMaxScaler()
X = sc.fit_transform(X)

In [12]:
X = pd.DataFrame(X)

In [13]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.592,0.189189,0.1,0.0,0.333333,1.0,0.0,0.916575,0.0,1.0,1.0
1,0.68,0.351351,0.9,0.400037,0.333333,0.0,0.0,0.17669,1.0,0.0,1.0
2,0.844,0.324324,0.3,0.299226,0.333333,1.0,0.0,0.46448,1.0,0.0,1.0
3,0.694,0.594595,0.7,0.0,0.0,1.0,0.0,0.646015,0.0,1.0,1.0
4,0.728,0.608108,0.4,0.550352,0.0,0.0,1.0,0.8335,0.0,0.0,1.0


In [14]:
X.shape

(9000, 11)

##Fitting an Autoencoder architecture

In [15]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
import tensorflow as tf


AE = Sequential()
AE.add(Input(shape = X.shape[1]))
AE.add(Dense(6, activation ="relu"))
AE.add(Dropout(0.1))

AE.add(Dense(3, activation ="relu"))
AE.add(Dropout(0.1))

AE.add(Dense(6, activation ="relu"))
AE.add(Dropout(0.1))
AE.add(Dense(X.shape[1], activation = "sigmoid"))


AE.compile(optimizer= 'adam', loss = "mse")


AE.fit(X,X, epochs = 80, batch_size= 200, validation_split = 0.1)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.src.callbacks.History at 0x7fb2b412da20>

In [16]:
predicted = AE.predict(X)



In [17]:
pd.DataFrame(predicted, columns = df.drop("Churned", axis =1).columns)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,0.587548,0.258377,0.492158,0.238370,0.171897,0.965648,5.971828e-04,0.494776,0.017380,0.618796,1.000000
1,0.592648,0.273345,0.516388,0.405251,0.172217,0.607084,1.230177e-10,0.518790,0.673069,0.052839,0.995908
2,0.587584,0.267088,0.524626,0.427167,0.152862,0.824235,2.539064e-14,0.523501,0.754172,0.023552,0.997842
3,0.588277,0.259408,0.492291,0.241228,0.173736,0.961760,7.388029e-04,0.495286,0.019525,0.604250,1.000000
4,0.605550,0.291023,0.491471,0.317614,0.218158,0.361277,8.715120e-01,0.503906,0.228557,0.349745,0.993602
...,...,...,...,...,...,...,...,...,...,...,...
8995,0.588233,0.259347,0.492291,0.241180,0.173679,0.961573,7.253810e-04,0.495260,0.019463,0.605335,1.000000
8996,0.607805,0.292013,0.491880,0.303173,0.215469,0.809209,8.893251e-01,0.506019,0.189045,0.250869,0.850981
8997,0.604256,0.285132,0.488313,0.233866,0.181083,0.998980,9.407815e-01,0.502150,0.026239,0.236545,0.999726
8998,0.592978,0.274018,0.516692,0.407979,0.172263,0.610963,1.146439e-10,0.519263,0.689811,0.049063,0.990531


In [18]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.592,0.189189,0.1,0.000000,0.333333,1.0,0.0,0.916575,0.0,1.0,1.0
1,0.680,0.351351,0.9,0.400037,0.333333,0.0,0.0,0.176690,1.0,0.0,1.0
2,0.844,0.324324,0.3,0.299226,0.333333,1.0,0.0,0.464480,1.0,0.0,1.0
3,0.694,0.594595,0.7,0.000000,0.000000,1.0,0.0,0.646015,0.0,1.0,1.0
4,0.728,0.608108,0.4,0.550352,0.000000,0.0,1.0,0.833500,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
8995,0.702,0.243243,0.8,0.000000,0.333333,1.0,0.0,0.845922,0.0,1.0,1.0
8996,0.530,0.283784,0.5,0.451153,0.333333,1.0,1.0,0.260827,1.0,0.0,1.0
8997,0.562,0.229730,0.8,0.514972,0.000000,1.0,1.0,0.395755,0.0,0.0,1.0
8998,0.822,0.594595,0.5,0.394002,0.000000,0.0,0.0,0.434636,1.0,0.0,1.0


## Reconstruction Loss

In [19]:
mse_reconstruction = np.mean((X-predicted)**2, axis=1)

In [20]:
pd.DataFrame(mse_reconstruction)

Unnamed: 0,0
0,0.051472
1,0.071103
2,0.023981
3,0.039657
4,0.059630
...,...
8995,0.042920
8996,0.081231
8997,0.025935
8998,0.060411


In [21]:
df_withloss = df.copy()
df_withloss['reconstruction loss'] = mse_reconstruction

In [22]:
df_withloss.sort_values('reconstruction loss', ascending = False, inplace = True)

In [23]:
df_withloss.head(50)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Churned,Geography_Germany,Geography_Spain,Gender_Male,reconstruction loss
8988,850,45,5,174088.3,4,1,0,5669.31,1,False,True,False,0.199353
4049,757,55,9,117294.12,4,1,0,94187.47,1,False,True,False,0.178578
824,773,47,2,118079.47,4,1,1,143007.49,1,True,False,True,0.173843
6370,698,50,1,0.0,4,1,0,88566.9,1,False,True,False,0.173675
5672,575,49,7,121205.15,4,1,1,168080.53,1,True,False,True,0.169234
6311,640,39,9,131607.28,4,0,1,6981.43,1,True,False,True,0.168954
4859,438,54,2,0.0,1,0,0,191763.07,1,False,True,False,0.167346
5446,749,66,6,182532.23,2,1,1,195429.92,0,True,False,True,0.161246
2117,538,31,0,0.0,2,0,0,179453.66,0,False,True,False,0.159281
6077,547,44,5,0.0,3,0,0,5459.07,1,False,True,False,0.158995


## Determing top 500 most likely to churn customers

In [24]:
top500 = df_withloss.head(500)

## Recall

In [28]:
top500.Churned.sum() / df.Churned.sum()

0.08726673984632272

8.7% Recall