# **Churn Modelling**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.linear_model import LogisticRegression

# **Loading the dataset**

In [2]:
df = pd.read_csv("Churn_Modelling.csv")

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# **Checking Missing values**

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


There is no missing data 

# **Dependent and independent variable**

In [5]:
X = df.iloc[:,3:-1]
y = df['Exited']

# **Encoding Categorical Variable**

In [6]:
# label encoder can be done on only single dimentional array
le = LabelEncoder()
X['Gender'] = le.fit_transform(X['Gender'])

In [7]:
# array should be 2-D in one hot encoding
ct = ColumnTransformer(transformers=[("encode",OneHotEncoder(drop="first",sparse=False),[1])],remainder="passthrough")
X = ct.fit_transform(X)
X
# drop = first to avoid dummy variable trap (region southwest)
# sparse false to stop sparse compression



array([[0.0000000e+00, 0.0000000e+00, 6.1900000e+02, ..., 1.0000000e+00,
        1.0000000e+00, 1.0134888e+05],
       [0.0000000e+00, 1.0000000e+00, 6.0800000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 1.1254258e+05],
       [0.0000000e+00, 0.0000000e+00, 5.0200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 1.1393157e+05],
       ...,
       [0.0000000e+00, 0.0000000e+00, 7.0900000e+02, ..., 0.0000000e+00,
        1.0000000e+00, 4.2085580e+04],
       [1.0000000e+00, 0.0000000e+00, 7.7200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 9.2888520e+04],
       [0.0000000e+00, 0.0000000e+00, 7.9200000e+02, ..., 1.0000000e+00,
        0.0000000e+00, 3.8190780e+04]])

# **Scaling**

In [8]:
sc = StandardScaler()
X = sc.fit_transform(X)
X

array([[-0.57873591, -0.57380915, -0.32622142, ...,  0.64609167,
         0.97024255,  0.02188649],
       [-0.57873591,  1.74273971, -0.44003595, ..., -1.54776799,
         0.97024255,  0.21653375],
       [-0.57873591, -0.57380915, -1.53679418, ...,  0.64609167,
        -1.03067011,  0.2406869 ],
       ...,
       [-0.57873591, -0.57380915,  0.60498839, ..., -1.54776799,
         0.97024255, -1.00864308],
       [ 1.72790383, -0.57380915,  1.25683526, ...,  0.64609167,
        -1.03067011, -0.12523071],
       [-0.57873591, -0.57380915,  1.46377078, ...,  0.64609167,
        -1.03067011, -1.07636976]])

# **Splitting**

In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
print(X_train.dtype)
print(y_train)

float64
7389    0
9275    0
2995    0
5316    0
356     0
       ..
9225    0
4859    0
3264    0
9845    0
2732    1
Name: Exited, Length: 8000, dtype: int64


# **Tensorflow**

In [10]:
import tensorflow as tf

In [11]:
# make structure # compile - how to train # fit 

In [12]:
ann = tf.keras.models.Sequential()

In [23]:
# Input layers
ann.add(tf.keras.layers.Input(shape=11)) # make 11 input neuron 
# Hidden Layer
ann.add(tf.keras.layers.Dense(20,activation="relu"))
# Dropout layer to avoid overfitting
#ann.add(tf.keras.layers.Dropout(0.2))

#ann.add(tf.keras.layers.Dense(10,activation="relu"))

# output layer
ann.add(tf.keras.layers.Dense(1,activation='sigmoid'))   

**Activation Function**

- linear
- relu  - rectified linear unit
- elu = exponential linear unit
- sigmoid
- softmax
- sogtplus
- tanh = tangent hyperbolic funtion

**Advanced**
- leaky relu                          
so on---

why last output have sigmoid func? = 

In [34]:
# compile 
ann.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),loss='binary_crossentropy',metrics=["accuracy"])
'''optimizers = a func to update the parameter of neural network'''#(adam,rms prop,sgd,adadelta)

'optimizers = a func to update the parameter of neural network'

In [35]:
# fit 
# callback 
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath="model.h5",save_best_only=True)
# early stoping
early_stoping = tf.keras.callbacks.EarlyStopping(patience=4)
# above code will stop the epoches when loss will not alter
history=ann.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=80,batch_size=32,callbacks=[model_checkpoint,early_stoping])

Epoch 1/80


Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80


# **Learning Curve**

In [36]:
# to know the loss of each epoche
error=history.history
print(error)

{'loss': [0.4638563394546509, 0.404997318983078, 0.38455671072006226, 0.3676183819770813, 0.3664712905883789, 0.36293089389801025, 0.36317482590675354, 0.35412076115608215, 0.3542690575122833, 0.35646378993988037, 0.3521491587162018, 0.35470718145370483, 0.35171911120414734, 0.3513685464859009, 0.3532319664955139, 0.34941452741622925, 0.3477019667625427], 'accuracy': [0.8007500171661377, 0.8323749899864197, 0.8445000052452087, 0.8493750095367432, 0.8503749966621399, 0.8460000157356262, 0.8522499799728394, 0.8556249737739563, 0.8553749918937683, 0.8579999804496765, 0.8557500243186951, 0.8557500243186951, 0.8569999933242798, 0.8608750104904175, 0.8583750128746033, 0.8553749918937683, 0.859250009059906], 'val_loss': [0.4285051226615906, 0.3740985691547394, 0.3594658374786377, 0.3472910225391388, 0.3478069603443146, 0.35487842559814453, 0.3481895327568054, 0.34494465589523315, 0.36367619037628174, 0.3440283238887787, 0.34119275212287903, 0.35240569710731506, 0.33765411376953125, 0.34495097

In [40]:
# plt.plot(range(1,80),error['loss'])
# plt.plot(range(1,80),error['val_loss'])

# plt.title("Value_loss vs train_loss")
# plt.xlabel("no. of epoches")
# plt.ylabel("loss")
# plt.legend(["Train loss","Val loss"])
# plt.show()

# it will run when we do not take eaarly stopping parameter