In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl
import pickle

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

## **preprocessing**

In [2]:
# Read the data
df = pd.read_excel('Telco_customer_churn.xlsx')
pd.set_option('display.max_columns', None)
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,No,No,Yes,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,No,No,Yes,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,No,Yes,Yes,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,No,No,Yes,49,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [3]:
# Convert Total Charges feature from object to numerical
df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
df['Total Charges'].apply(lambda x : type(x)).value_counts()

Total Charges
<class 'float'>    7043
Name: count, dtype: int64

In [4]:
# drop Null Records
df = df.dropna()
df.isnull().sum()

CustomerID           0
Count                0
Country              0
State                0
City                 0
Zip Code             0
Lat Long             0
Latitude             0
Longitude            0
Gender               0
Senior Citizen       0
Partner              0
Dependents           0
Tenure Months        0
Phone Service        0
Multiple Lines       0
Internet Service     0
Online Security      0
Online Backup        0
Device Protection    0
Tech Support         0
Streaming TV         0
Streaming Movies     0
Contract             0
Paperless Billing    0
Payment Method       0
Monthly Charges      0
Total Charges        0
Churn Label          0
Churn Value          0
Churn Score          0
CLTV                 0
Churn Reason         0
dtype: int64

In [5]:
# Dropping Unnecessary Columns
dropped_columns = ['CustomerID', 'Count', 'Country', 'State', 'Lat Long', 'Churn Label', 'Churn Score', 'CLTV', 'Churn Reason']
df.drop(columns = dropped_columns, axis = 1, inplace = True)

In [6]:
# Label Encoder
Label_Encoder = LabelEncoder()
label_encoder_features = ['City', 'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Paperless Billing']

for feature in label_encoder_features:
    df[feature] = Label_Encoder.fit_transform(df[feature])

In [7]:
df

Unnamed: 0,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Value
0,417,90003,33.964131,-118.272783,1,0,0,0,2,1,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
1,417,90005,34.059281,-118.307420,0,0,0,1,2,1,No,Fiber optic,No,No,No,No,No,No,Month-to-month,1,Electronic check,70.70,151.65,1
2,417,90006,34.048013,-118.293953,0,0,0,1,8,1,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,1,Electronic check,99.65,820.50,1
3,417,90010,34.062125,-118.315709,0,0,1,1,28,1,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,1,Electronic check,104.80,3046.05,1
4,417,90015,34.039224,-118.266293,1,0,0,1,49,1,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,1,Bank transfer (automatic),103.70,5036.30,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1864,535,92264,33.745746,-116.514215,1,0,0,0,12,1,No,DSL,No,No,No,Yes,Yes,No,One year,1,Electronic check,59.80,727.80,1
1865,674,92273,32.790282,-115.689559,0,0,0,0,9,1,No,DSL,No,No,No,No,No,No,Month-to-month,1,Bank transfer (automatic),44.20,403.35,1
1866,783,92280,34.264124,-114.717964,1,1,0,0,1,1,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,1,Electronic check,75.75,75.75,1
1867,807,92282,33.972293,-116.654195,0,0,0,0,67,1,Yes,Fiber optic,Yes,Yes,Yes,No,Yes,No,Month-to-month,1,Credit card (automatic),102.95,6886.25,1


In [8]:
# One Hot Encoder
One_Hot_Encoder = OneHotEncoder(drop = 'first', sparse_output = False)

OneHot_Encoder_features = ['Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Payment Method']

encoded_data = One_Hot_Encoder.fit_transform(df[OneHot_Encoder_features])
encoded_columns = One_Hot_Encoder.get_feature_names_out(OneHot_Encoder_features)

encoded_df = pd.DataFrame(encoded_data, columns = encoded_columns)
df = pd.concat([df, encoded_df], axis = 1)
df.drop(columns = OneHot_Encoder_features, axis = 1, inplace = True)

df.head()

Unnamed: 0,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Paperless Billing,Monthly Charges,Total Charges,Churn Value,Multiple Lines_No phone service,Multiple Lines_Yes,Internet Service_Fiber optic,Internet Service_No,Online Security_No internet service,Online Security_Yes,Online Backup_No internet service,Online Backup_Yes,Device Protection_No internet service,Device Protection_Yes,Tech Support_No internet service,Tech Support_Yes,Streaming TV_No internet service,Streaming TV_Yes,Streaming Movies_No internet service,Streaming Movies_Yes,Contract_One year,Contract_Two year,Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,417,90003,33.964131,-118.272783,1,0,0,0,2,1,1,53.85,108.15,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,417,90005,34.059281,-118.30742,0,0,0,1,2,1,1,70.7,151.65,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,417,90006,34.048013,-118.293953,0,0,0,1,8,1,1,99.65,820.5,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,417,90010,34.062125,-118.315709,0,0,1,1,28,1,1,104.8,3046.05,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,417,90015,34.039224,-118.266293,1,0,0,1,49,1,1,103.7,5036.3,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Save the Encoders

with open ('Pickled Objects/One_Hot_Encoder.pkl', 'wb') as file:
    pickle.dump(One_Hot_Encoder, file)

with open ('Pickled Objects/Label_Encoder.pkl', 'wb') as file:
    pickle.dump(Label_Encoder, file)

In [10]:
# Divide the data into independent and dependent features
X = df.drop(columns = ['Churn Value'])
y = df['Churn Value']

# Split the data in training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Scaling the independent features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Save the Standard Scaler

with open('Pickled Objects/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

## **ANN Implementation**

In [12]:
import tensorflow as tf
from datetime import datetime

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from tensorflow.keras.regularizers import l2

In [13]:
# ANN Implementation
model = Sequential([
    Dense(64, activation = 'relu', input_shape = (X_train.shape[1], )),
    Dense(32, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])

model.summary()

In [14]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
loss = tf.keras.losses.BinaryCrossentropy()
metric = tf.keras.metrics.BinaryAccuracy()

model.compile(optimizer = optimizer, loss = loss, metrics = [metric])

In [15]:
# Set up the tnesorboard
log_dir = "c:/Users/ahmed/Desktop/End To End DL Project ANN - 2/logs/fit" + datetime.now().strftime('%Y%m%d - %H%M%S')
Tensorboard_callback = TensorBoard(log_dir = log_dir, histogram_freq = 1)

In [16]:
# Set up early stopping
early_stopping_callback = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)

In [17]:
# Trainning the model
history = model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 100,
                    callbacks = [Tensorboard_callback, early_stopping_callback])

Epoch 1/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - binary_accuracy: 0.6603 - loss: 0.5569 - val_binary_accuracy: 1.0000 - val_loss: 0.0550
Epoch 2/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - binary_accuracy: 1.0000 - loss: 0.0325 - val_binary_accuracy: 1.0000 - val_loss: 0.0086
Epoch 3/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - binary_accuracy: 1.0000 - loss: 0.0063 - val_binary_accuracy: 1.0000 - val_loss: 0.0036
Epoch 4/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - binary_accuracy: 1.0000 - loss: 0.0030 - val_binary_accuracy: 1.0000 - val_loss: 0.0020
Epoch 5/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - binary_accuracy: 1.0000 - loss: 0.0017 - val_binary_accuracy: 1.0000 - val_loss: 0.0013
Epoch 6/100
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - binary_accuracy: 1.0000 - loss: 0.0011

In [18]:
# Save the model
model.save('model.h5', include_optimizer=True)



In [19]:
%load_ext tensorboard
%tensorboard --logdir="c:/Users/ahmed/Desktop/End To End DL Project ANN - 2/logs/fit"

Reusing TensorBoard on port 6010 (pid 6224), started 0:07:46 ago. (Use '!kill 6224' to kill it.)