In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
# %matplotlib notebook
plt.rcParams["figure.figsize"] = (12, 6)
# plt.rcParams['figure.dpi'] = 100
sns.set_style("whitegrid")
import warnings

warnings.filterwarnings("ignore")
warnings.warn("this will not show")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the churn dataset, using the file's RowNumber column as the index,
# and preview the first rows.
CHURN_CSV_PATH = "/kaggle/input/churn-predictions-personal/Churn_Predictions.csv"
df = pd.read_csv(CHURN_CSV_PATH, index_col='RowNumber')
df.head()

# Exploratory Data Analysis and Visualization

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

In [None]:
# True when at least one column has a non-zero count of missing values.
df.isnull().sum().any()

In [None]:
# Number of rows flagged as duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Descriptive statistics, transposed so that each feature becomes a row.
df.describe().T

In [None]:
# Remove the CustomerId and Surname columns before analysis and modelling.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['CustomerId', 'Surname'], errors='ignore')

In [None]:
# Get the value counts of the 'Exited' column
exited_counts = df['Exited'].value_counts()

# Get the bar names and heights
bar_names = exited_counts.index.astype(str).tolist()
bar_heights = exited_counts.values.tolist()

# Create a bar chart of the value counts.
# The bars are drawn once, directly with their colors; the earlier uncolored
# plt.bar call drew the same bars a second time underneath and was redundant.
colors = ['steelblue', 'orange']
plt.bar(bar_names, bar_heights, color=colors)
plt.xlabel('Exited')
plt.ylabel('Count')
plt.title('Customer Churn Count')
plt.show()

In [None]:
# Histogram of each numeric column (15 bins); the trailing ';' hides the
# returned axes array from the cell output.
df.hist(figsize=(15, 12), bins=15);

In [None]:
# Count plots of the categorical features, split by churn status, on a 2x2 grid.
cat_list = ["Gender", "HasCrCard", "IsActiveMember", "Geography"]
plt.figure(figsize=(16, 12))
for index, i in enumerate(cat_list, start=1):
    # subplot positions are 1-based, hence start=1
    plt.subplot(2, 2, index)
    sns.countplot(data=df, x=i, hue="Exited")

In [None]:
# Set the figure size
plt.figure(figsize=(20, 7))

# Create a count plot of age with hue on churn
sns.countplot(x='Age', hue='Exited', data=df)

# Rotate x labels to prevent overlapping.
# plt.xticks() without arguments only returns the current ticks; the rotation
# has to be passed explicitly for the labels to actually be rotated.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Correlation heatmap of the numeric features.
# The frame is only one-hot encoded in a later cell, so it may still contain
# non-numeric columns here; DataFrame.corr() raises on those in pandas >= 2.0.
# Selecting the numeric columns first works on every pandas version.
plt.figure(figsize=(16, 12))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='viridis')

In [None]:
# Correlation of every numeric feature with the target, as a sorted horizontal bar chart.
# - Only numeric columns are correlated (DataFrame.corr() raises on non-numeric
#   columns in pandas >= 2.0).
# - The target's self-correlation is removed by label rather than by position,
#   so the result no longer depends on 'Exited' being the last column.
df.select_dtypes(include='number').corr()['Exited'].drop('Exited').sort_values().plot.barh()

In [None]:
# One-hot encode the categorical columns; drop_first=True drops the first level
# of each encoded column so the dummies are not redundant.
df = pd.get_dummies(df, drop_first=True)
df.head()

# Preprocessing of Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Separate features from the target and hold out 10% of the rows as a test set.
# The split is stratified on the target and seeded so it is repeatable.
seed = 42
X = df.drop(columns='Exited')
y = df['Exited'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=seed,
)


In [None]:
# Feature scaler used for all models below (fitted on the training split in the next cell).
scaler = MinMaxScaler() # Minmax scaler mostly used for deep learning as a rule of thumb

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Modelling & Model Performance

In [None]:
# (samples, features) of the scaled training matrix.
X_train.shape

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
from tensorflow.keras.optimizers import Adam

In [None]:
# Baseline network: two hidden ReLU layers and a sigmoid output for binary churn prediction.
tf.random.set_seed(seed)

model = Sequential()

model.add(Dense(16, activation="relu"))    # first hidden layer (input shape is inferred at fit time)
model.add(Dense(8, activation="relu"))     # add the second hidden layer
model.add(Dense(1, activation="sigmoid"))   # add the output layer

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras optimizers.
opt = Adam(learning_rate=0.002)
model.compile(optimizer=opt,
              loss="binary_crossentropy",
              metrics=["Recall"])    # loss function and metrics

In [None]:
# Stop training once the validation loss has not improved for 25 epochs.
# restore_best_weights=True rolls the model back to the best epoch; without it
# the model keeps the weights from 25 epochs after the best validation loss.
early_stop = EarlyStopping(monitor="val_loss",
                           mode="auto",
                           verbose=1,
                           patience=25,
                           restore_best_weights=True)

In [None]:
# Train the baseline model; 10% of the training data is held out for
# validation and monitored by the early-stopping callback.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=200,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
# Print the layer-by-layer summary of the model trained above.
model.summary()

In [None]:
# Collect the values recorded in the model's training history into a DataFrame
# and preview the first rows.
loss_df = pd.DataFrame(model.history.history)
loss_df.head()

In [None]:
# Plot every recorded history column as a line.
loss_df.plot()

In [None]:
# Score the test set, threshold the predicted probabilities at 0.5 to obtain
# binary labels, then report the confusion matrix and per-class metrics.
test_scores = model.predict(X_test)
y_pred = (test_scores > 0.5).astype("int32")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Precision-Recall Curve and Average Precision for model

In [None]:
# Precision-recall curve of the model on the test set.
# The (n, 1) prediction matrix is flattened to a 1-D score vector, and the
# legend is drawn so that the 'ANN' label passed to plt.plot is actually shown.
y_pred_proba = model.predict(X_test).ravel()
precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)
plt.plot(recalls, precisions, label='ANN')
plt.xlabel('recalls')
plt.ylabel('precisions')
plt.title('Precision-Recall curve')
plt.legend()
plt.show()

In [None]:
# Average precision of the predicted scores on the test set (single-number summary of the PR curve).
average_precision_score(y_test, y_pred_proba)

In [None]:
# Fresh copy of the same architecture, to be trained with class weights.
# The seed is set before the model is created, matching the other model-building cells.
tf.random.set_seed(seed)

model = Sequential()

model.add(Dense(16, activation="relu"))
model.add(Dense(8, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras optimizers.
opt = Adam(learning_rate=0.002)
model.compile(optimizer=opt,
              loss="binary_crossentropy",
              metrics=["Recall"])

In [None]:
from sklearn.utils import class_weight

# 'balanced' weights computed from the training labels, one per class in
# np.unique(y_train) order.
balanced_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train,
)

# Keras expects a {class_label: weight} mapping.
class_weights = {label: balanced_weights[label] for label in (0, 1)}
class_weights

In [None]:
# Train with the class-weight mapping so the loss is weighted per class;
# 10% of the training data is held out for validation / early stopping.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=200,
    validation_split=0.1,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=1,
)

In [None]:
# Training history of the class-weighted model, plotted as one line per recorded column.
loss_df = pd.DataFrame(model.history.history)
loss_df.plot()

In [None]:
# Threshold the test-set probabilities at 0.5 and report the confusion matrix
# and per-class metrics for the class-weighted model.
test_scores = model.predict(X_test)
y_pred = (test_scores > 0.5).astype("int32")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Precision-Recall Curve and Average Precision for model

In [None]:
# Precision-recall curve of the class-weighted model on the test set.
# The (n, 1) prediction matrix is flattened to a 1-D score vector, and the
# legend is drawn so that the 'ANN' label passed to plt.plot is actually shown.
y_pred_proba = model.predict(X_test).ravel()
precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)
plt.plot(recalls, precisions, label='ANN')
plt.xlabel('recalls')
plt.ylabel('precisions')
plt.title('Precision-Recall curve')
plt.legend()
plt.show()

In [None]:
# Average precision of the class-weighted model's scores on the test set.
average_precision_score(y_test, y_pred_proba)

#  Final Model and Model Deployment

In [None]:
import pickle

# Persist the fitted scaler so the same scaling can be applied at prediction time.
# The context manager guarantees the file is flushed and closed; the previous
# bare open() left the handle open.
with open("scaler_churn", 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Final model: same architecture, trained with class weights and a batch size of 256.
tf.random.set_seed(seed)

model = Sequential()

model.add(Dense(16, activation="relu"))
model.add(Dense(8, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras optimizers.
opt = Adam(learning_rate=0.002)

model.compile(optimizer=opt,
              loss="binary_crossentropy",
              metrics=["Recall"])

# NOTE(review): the test split is used as validation data here, so early
# stopping is driven by the same rows the model is evaluated on afterwards.
# Metrics reported on X_test for this model are therefore optimistic.
# Consider validation_split=.1 if an unbiased test score is needed; confirm intent.
model.fit(x=X_train,
          y=y_train,
          validation_data=(X_test, y_test),
          callbacks=[early_stop],
          batch_size=256,
          epochs=200,
          verbose=1,
          class_weight=class_weights)

In [None]:
# Training history of the final model, plotted as one line per recorded column.
loss_df = pd.DataFrame(model.history.history)
loss_df.plot()

In [None]:
# Evaluate the final model on the test set; evaluate() yields the loss followed
# by the compiled metric (recall).
final_scores = model.evaluate(X_test, y_test, verbose=0)
loss, recall = final_scores
print("loss : ", loss)
print("recall : ", recall)

In [None]:
# Threshold the final model's test-set probabilities at 0.5 and report the
# confusion matrix and per-class metrics.
test_scores = model.predict(X_test)
y_pred = (test_scores > 0.5).astype("int32")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Save the final model to 'model_churn.h5' in the current working directory.
# NOTE(review): the .h5 extension selects the HDF5 format; confirm the consumer expects it.
model.save('model_churn.h5')

# Prediction

In [None]:
# Take the first row of the encoded frame, without the target, as an example customer.
single_customer = df.drop('Exited', axis=1).iloc[0]
single_customer

In [None]:
# Scale the example customer with the scaler fitted on the training data.
# reshape(1, -1) builds a one-row 2-D array from however many features the
# Series holds, instead of hard-coding 11. The explicit float cast avoids
# handing an object-dtype array to the scaler when the row mixes dtypes.
single_customer = scaler.transform(single_customer.to_numpy(dtype=float).reshape(1, -1))
single_customer

In [None]:
# Predict the churn probability of the example customer and convert it to a
# binary label with a 0.5 threshold.
single_customer_score = model.predict(single_customer)
y_pred = (single_customer_score > 0.5).astype("int32")
y_pred

In [None]:
# Actual target value of the same first row, for comparison with the prediction above.
df["Exited"].iloc[0]

# Comparison with ML

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression comparison model with balanced class weights, fitted on
# the scaled training data and evaluated on the test set.
log_model = LogisticRegression(class_weight='balanced').fit(X_train, y_train)
y_pred = log_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random-forest comparison model with balanced class weights.
# random_state reuses the notebook-wide seed so the forest, and therefore the
# reported metrics, are identical on every run.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=seed)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))