<a href="https://www.kaggle.com/code/abhijeetrao/beginner-binary-classification-bank-churn-data?scriptVersionId=160560165" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

In [None]:
# Load train.csv from the Kaggle input directory into the working DataFrame.
TRAIN_CSV_PATH = "/kaggle/input/playground-series-s4e1/train.csv"
dat = pd.read_csv(TRAIN_CSV_PATH)

### EDA

In [None]:
# Per-column flag: True where the column holds at least one missing value.
dat.isna().any()

In [None]:
# Only the last expression of a notebook cell is rendered, so the column
# names are printed explicitly; the dtypes Series (indexed by column name)
# is then shown as the cell output.
print(dat.columns.tolist())
dat.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame.
dat.describe()

In [None]:
# Peek at the first five rows (head() defaults to n=5).
dat.head()

In [None]:
# Keep a handle on the customer identifiers before they are dropped.
# Named `customer_ids` so Python's builtin `id` is not shadowed.
customer_ids = dat["CustomerId"]

# Remove the identifier and surname columns in place.
dat.drop(columns=["CustomerId", "Surname", "id"], inplace=True)

In [None]:
# Distinct values present in the Geography column.
dat["Geography"].unique()

In [None]:
# Distinct values present in the Gender column.
dat["Gender"].unique()

In [None]:
# Distinct values present in the Tenure column.
dat["Tenure"].unique()

In [None]:
# Distinct values of the target column Exited.
dat["Exited"].unique()

In [None]:
# One list of CreditScore values per Exited class, in the order the classes
# first appear in the data (the order returned by unique()).
exit_classes = dat["Exited"].unique()
box_CreditScore = [
    dat.loc[dat["Exited"] == exit_class, "CreditScore"].tolist()
    for exit_class in exit_classes
]

In [None]:
# Box plot of CreditScore, one box per Exited class.
# Tick labels are set through plt.xticks rather than the `labels=` keyword of
# boxplot, which newer Matplotlib releases deprecate in favour of
# `tick_labels=`; xticks works on both old and new versions.
plt.boxplot(box_CreditScore)
# boxplot places the boxes at x = 1..N, so the labels go on those positions.
plt.xticks(range(1, len(box_CreditScore) + 1), dat["Exited"].unique())
plt.xlabel("Exited")
plt.ylabel("CreditScore")
plt.show()

In [None]:
# Contingency table: Geography (rows) against Exited (columns).
pd.crosstab(index=dat["Geography"], columns=dat["Exited"])

In [None]:
# Contingency table: Gender (rows) against Exited (columns).
pd.crosstab(index=dat["Gender"], columns=dat["Exited"])

In [None]:
# Contingency table: HasCrCard (rows) against Exited (columns).
pd.crosstab(index=dat["HasCrCard"], columns=dat["Exited"])

In [None]:
# Contingency table: IsActiveMember (rows) against Exited (columns).
pd.crosstab(index=dat["IsActiveMember"], columns=dat["Exited"])

In [None]:
# Contingency table: Gender (rows) against HasCrCard (columns).
pd.crosstab(index=dat["Gender"], columns=dat["HasCrCard"])

In [None]:
# Contingency table: Tenure (rows) against Exited (columns).
pd.crosstab(index=dat["Tenure"], columns=dat["Exited"])

In [None]:
# Contingency table: Geography (rows) against Gender (columns).
pd.crosstab(index=dat["Geography"], columns=dat["Gender"])

In [None]:
# Pairwise correlation matrix, restricted to the numeric columns.
numeric_part = dat.select_dtypes(include=["number"])
numeric_part.corr()

### Data transformation (OHE)

In [None]:
# One-hot encode the listed columns as integer 0/1 indicator columns; every
# level of each column gets its own indicator (no level is dropped).
onehot_columns = [
    "Geography",
    "Tenure",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
]
encoded = pd.get_dummies(dat, columns=onehot_columns, dtype=int)

In [None]:
# Encode Gender as integer indicators; drop_first=True omits the indicator
# for the first level so the remaining column(s) are not redundant.
encoded = pd.get_dummies(
    encoded,
    columns=["Gender"],
    drop_first=True,
    dtype=int,
)

In [None]:
# Display the encoded frame.
encoded

In [None]:
# Pairwise correlation matrix of the encoded frame's columns.
encoded.corr()

EDA and data preparation are done.

### Model Creation with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
# Target vector: a standalone NumPy copy of the Exited column.
Y_train = encoded["Exited"].to_numpy().copy()
Y_train

In [None]:
# Preview the feature values: every column except Exited, as a NumPy array.
encoded.drop("Exited", axis = 1).values

In [None]:
# Feature matrix: every column of `encoded` except the target Exited.
# The frame is already two-dimensional, so no reshape is needed. The previous
# hard-coded reshape(-1, 27) was a no-op at exactly 27 feature columns and
# would raise, or silently scramble rows, for any other column count.
X_train = encoded.drop(columns="Exited").to_numpy()

In [None]:
# (rows, feature columns) of the training matrix.
X_train.shape

In [None]:
# Standardise every feature column: learn the per-column statistics from
# X_train, then apply them to the same matrix.
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled_transformed = scaler.transform(X_train)

In [None]:
# Fit a default-configured logistic regression on the standardised features.
# fit() returns the estimator itself, so construction and fitting are chained.
Logistic_Model = LogisticRegression().fit(X_scaled_transformed, Y_train)
Logistic_Model

In [None]:
# Score of the logistic model on X_scaled_transformed / Y_train.
# NOTE(review): these look like the same arrays the model was fitted on, so
# this is a training score, not a held-out estimate — confirm.
Logistic_Model.score(X_scaled_transformed, Y_train)

### Model Creation (Neural Network with TensorFlow/Keras)

In [None]:
# Preview the first four feature columns of X_train.
X_train[:,0:4]

In [None]:
# Standardise only the first four feature columns with a Keras Normalization
# layer; the layer is kept in `normalization_m` so it can be reused later.
X_train_intermediary = X_train[:, :4]

normalization_m = tf.keras.layers.Normalization(axis=-1)
# adapt() computes the layer's statistics from the data it is given.
normalization_m.adapt(X_train_intermediary)

tensor_Xtrain_normalized_columns = normalization_m(X_train_intermediary)

In [None]:
# Preview the feature columns from index 4 onward.
X_train[:,4:]

In [None]:
# Join the normalised first four columns with the untouched remaining columns
# (column-wise), then wrap the result as a constant tensor.
train_features_np = np.concatenate(
    [tensor_Xtrain_normalized_columns, X_train[:, 4:]],
    axis=1,
)
tensor_Xtrain = tf.constant(train_features_np)
tensor_Xtrain

In [None]:
# Targets as a column tensor of shape (number of samples, 1).
tensor_Ytrain = tf.constant(Y_train.reshape(-1, 1))

In [None]:
# Display the target tensor.
tensor_Ytrain

In [None]:
# Fully connected binary classifier: three ReLU hidden layers (128 -> 32 -> 8)
# followed by a single sigmoid output unit.
# The input width is read from the training tensor rather than hard-coded, so
# the network still matches the data if the number of encoded columns changes.
n_features = tensor_Xtrain.shape[1]

model = Sequential(
    [
        tf.keras.Input(shape=(n_features,)),
        Dense(units=128, activation="relu", name="L1"),
        Dense(units=32, activation="relu", name="L2"),
        Dense(units=8, activation="relu", name="L3"),
        Dense(units=1, activation="sigmoid", name="OutputLayer"),
    ]
)

In [None]:
# Binary cross-entropy loss, Adam with learning rate 0.01, accuracy reported
# during training.
bce_loss = BinaryCrossentropy()
adam_optimizer = Adam(learning_rate=0.01)
model.compile(optimizer=adam_optimizer, loss=bce_loss, metrics=["accuracy"])

In [None]:
# Train for 10 epochs, with 20% of the supplied rows held out for validation.
model.fit(
    x=tensor_Xtrain,
    y=tensor_Ytrain,
    epochs=10,
    validation_split=0.20,
)

In [None]:
# Network outputs for tensor_Xtrain.
# NOTE(review): this appears to be the same tensor passed to model.fit, so
# these are in-sample predictions — confirm.
predictions = model.predict(tensor_Xtrain)

In [None]:
# Flatten a copy of the prediction array to one dimension.
vals = np.array(predictions).ravel()

In [None]:
# Flat one-dimensional copy of `vals`.
final_vals = np.array(vals, copy=True).ravel()

In [None]:
# Load test.csv from the Kaggle input directory.
TEST_CSV_PATH = "/kaggle/input/playground-series-s4e1/test.csv"
dat_test = pd.read_csv(TEST_CSV_PATH)

In [None]:
# One-hot encode the listed columns of the test frame as integer indicators.
test_onehot_columns = [
    "Geography",
    "Tenure",
    "NumOfProducts",
    "HasCrCard",
    "IsActiveMember",
]
X_test = pd.get_dummies(dat_test, columns=test_onehot_columns, dtype=int)

In [None]:
# pop() hands back the "id" column and removes it from the frame in one step;
# the ids are kept for the submission file.
final_ids = X_test.pop("id")
# The remaining identifier and surname columns are removed in place.
X_test.drop(columns=["CustomerId", "Surname"], inplace=True)

In [None]:
# Encode Gender as integer indicators, omitting the indicator for the first
# level (drop_first=True).
X_test = pd.get_dummies(
    X_test,
    columns=["Gender"],
    dtype=int,
    drop_first=True,
)

In [None]:
# Names of the encoded training columns, in order, without the target Exited.
cols = encoded.columns.drop("Exited").tolist()

In [None]:
# Align the test frame to the training feature layout held in `cols`.
# reindex() puts the columns in training order, drops any column not in
# `cols`, and creates an all-zero column for any name in `cols` that the test
# frame lacks (e.g. a category value that never occurs in the test data),
# where plain X_test[cols] would raise KeyError.
X_test = X_test.reindex(columns=cols, fill_value=0)

In [None]:
# Display the test feature frame.
X_test

In [None]:
# Apply the already-adapted Normalization layer to the first four test
# columns. The slice is converted to a NumPy array first so the layer receives
# the same kind of input it was adapted on (a NumPy slice of X_train) instead
# of a pandas DataFrame.
X_normalized = normalization_m(X_test.iloc[:, 0:4].to_numpy())

In [None]:
# Test columns from index 4 onward, copied into a NumPy array.
X_test_intermediary = X_test.iloc[:, 4:].to_numpy(copy=True)

In [None]:
# Column-wise join of the normalised block and the remaining test columns.
test_feature_blocks = [X_normalized, X_test_intermediary]
X_test_final = np.concatenate(test_feature_blocks, axis=1)

In [None]:
# Rebind X_test_final as a constant TensorFlow tensor.
X_test_final = tf.constant(X_test_final)

In [None]:
# Display the test feature tensor.
X_test_final

In [None]:
# Run the network on the test features and flatten a copy of its output to
# one dimension.
raw_test_output = model.predict(X_test_final)
test_predictions = np.array(raw_test_output).ravel()

In [None]:
# Wrap the flat predictions in a Series named "Exited" and display it.
test_predictions = pd.Series(data=test_predictions, name="Exited")
test_predictions

In [None]:
# Display the ids kept from the test frame.
final_ids

In [None]:
# Two-column submission frame: the ids next to the predictions, aligned on
# index, each column named after its Series.
submission_df = pd.DataFrame(
    {
        final_ids.name: final_ids,
        test_predictions.name: test_predictions,
    }
)
submission_df

In [None]:
# Write the submission frame to submission.csv without the index column.
submission_df.to_csv("submission.csv", index = False)

### Plotting metrics

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, auc

In [None]:
# Display the target vector.
Y_train

In [None]:
# ROC AUC of `predictions` against Y_train.
# NOTE(review): `predictions` appears to come from model.predict on the
# training tensor, so this looks like an in-sample AUC — confirm before
# treating it as a generalisation estimate.
roc_auc_score(Y_train, predictions)

In [None]:
# Points of the ROC curve for `predictions` against Y_train, and the area
# under that curve.
fpr, tpr, thresholds = roc_curve(y_true=Y_train, y_score=predictions)
roc_auc = auc(x=fpr, y=tpr)

In [None]:
# Draw the ROC curve on a 10x10 inch figure, with the diagonal as reference.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(fpr, tpr, color="red", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
# Dashed diagonal from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color="black", lw=2, linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver Operating Characteristic (ROC) Curve")
ax.legend(loc="upper left")
plt.show()