Imports

In [None]:
import numpy as np
import pandas as pd
import io
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Requires the Google Colab runtime (google.colab is not available elsewhere).
# `uploaded` is later indexed by filename and wrapped in io.BytesIO, i.e. it is
# used as a mapping of uploaded filename -> raw file bytes.
from google.colab import files
uploaded = files.upload()

# Reading dataset and visualization

In [None]:
def visualize(df):
    """Print summary tables and draw exploratory plots for a churn dataset.

    The frame must contain a numeric (or boolean) ``Churn`` column: it is used
    as the label for the distribution plot and as the sort key of the
    correlation heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It is only read, never modified.

    Returns
    -------
    None
        Everything is emitted as printed text and matplotlib/seaborn figures.
    """
    print('HEAD\n')
    print(df.head())
    print('\n\nDATA TYPES\n')
    print(df.dtypes)
    print('\n\nINFORMATION\n')
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()

    # numerical statistics
    print('\n\nSTATISTICS FOR NUMERICAL DATA:\n')
    print(df.describe().T)
    # categorical statistics
    print('\n\nSTATISTICS FOR CATEGORICAL DATA:\n')
    cat_types = ['bool', 'category', 'object']
    # describe(include=...) raises when no column matches, so guard the call.
    if df.select_dtypes(include=cat_types).shape[1] > 0:
        print(df.describe(include=cat_types).T)
    else:
        print('(no categorical columns)')
    print('\n')

    # Display label distribution. Do we have class imbalance?
    graph = sns.displot(df, x='Churn', hue='Churn')
    graph.set_axis_labels('Churn', 'Number in each label')
    graph.set_titles('label distribution')

    # Visualize the correlation between each feature and the label.
    # Restrict to numeric/boolean columns explicitly: recent pandas versions
    # no longer drop non-numeric columns silently in DataFrame.corr() and
    # raise on string data instead.
    correlations = df.select_dtypes(include=['number', 'bool']).corr()
    sort_corr_cols = correlations['Churn'].sort_values(ascending=False).index
    sorted_corr = correlations.loc[sort_corr_cols, sort_corr_cols]
    print(sorted_corr)
    # Hide the upper triangle (diagonal included): the matrix is symmetric,
    # so those cells only repeat what the lower triangle already shows.
    corr_mask = np.triu(np.ones_like(sorted_corr, dtype=bool))
    # Make the figsize 9x9
    plt.figure(figsize=(9, 9))
    # Plot heatmap of annotated correlations (shown as percentages)
    sns.heatmap(sorted_corr * 100,
                cmap='RdBu',
                annot=True,
                fmt='.0f',
                mask=corr_mask,
                cbar=False)
    plt.title('Correlations by Churn', fontsize=14)
    plt.yticks(rotation=0)
    plt.show()

    # Visualize the distribution for all continuous value features using histogram
    df.hist(figsize=(19, 19))
    plt.show()

# Preprocess data

One-hot encode categorical features using pandas

In [None]:
def encode_and_bind(og_df, feature):
    """Replace one column of ``og_df`` with its one-hot indicator columns.

    The indicator columns produced by ``pd.get_dummies`` are appended to the
    right of the frame, and every column named ``feature`` is then removed.
    The input frame is left untouched; a new frame is returned.
    """
    indicator_cols = pd.get_dummies(og_df[[feature]])
    combined = pd.concat([og_df, indicator_cols], axis=1)
    return combined.drop(columns=[feature])

Clean the data

In [None]:
def prepdata(data):
    """Clean a feature frame and one-hot encode its categorical columns.

    Steps, in order:
      1. drop identifier / unused columns,
      2. drop rows containing missing values,
      3. drop exact duplicate rows,
      4. one-hot encode every bool / object / category column.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame. Must contain all of the columns dropped in step 1
        (a missing one raises ``KeyError``). The caller's frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned, fully numeric frame. Rows may have been removed; the
        returned index identifies the rows that survived, so any labels
        held separately must be aligned to it by the caller.
    """
    # drop unnecessary columns
    data = data.drop(columns=['Unnamed: 0', 'CLIENTNUM', 'Avg_Open_To_Buy', 'Months_on_book', 'Total_Trans_Ct', 'Total_Revolving_Bal'])
    # drop rows with nulls, if any. dropna() returns a new frame, so the
    # result has to be assigned back or the call has no effect.
    data = data.dropna()
    # drop duplicate rows, if any
    data = data.drop_duplicates()

    cat_types = ['bool', 'object', 'category']
    cat_columns = data.select_dtypes(cat_types).columns.to_list()
    # Cast every categorical-like column to the 'category' dtype before encoding.
    clean_data = data.astype({col: 'category' for col in cat_columns})
    for feature in cat_columns:
        clean_data = encode_and_bind(clean_data, feature)

    return clean_data

In [None]:
def _align_labels(labels, index):
    """Keep only the labels whose index is still present in ``index``."""
    if isinstance(labels, (pd.Series, pd.DataFrame)):
        return labels.loc[index]
    # Positional containers (lists, arrays) carry no index to align on.
    return labels


def preprocess_data(train_X, test_X, train_y, test_y):
    """Clean, encode and scale the train/test features; align the labels.

    Parameters
    ----------
    train_X, test_X : pandas.DataFrame
        Raw feature frames of the train and test partitions.
    train_y, test_y : pandas.Series or array-like
        Labels. When they are pandas objects they are re-indexed to the rows
        that survive cleaning, so features and labels keep the same length.

    Returns
    -------
    tuple
        ``(p_train_x, p_test_x, p_train_y, p_test_y)`` where the feature
        parts are NumPy arrays scaled with a ``MinMaxScaler`` fitted on the
        training partition only.
    """
    # train data
    p_train_x = prepdata(train_X)

    # test data
    p_test_x = prepdata(test_X)

    # prepdata may drop rows (nulls / duplicates); keep labels in step with
    # the surviving rows. This must happen before scaling, which returns
    # plain arrays and loses the index.
    p_train_y = _align_labels(train_y, p_train_x.index)
    p_test_y = _align_labels(test_y, p_test_x.index)

    # One-hot encoding each partition separately can yield different columns
    # (a category absent from one partition). Force the test frame onto the
    # training layout: missing indicators become 0, unseen ones are dropped.
    p_test_x = p_test_x.reindex(columns=p_train_x.columns, fill_value=0)

    # scaling all the data [Numerical]
    scaler = MinMaxScaler()
    p_train_x = scaler.fit_transform(p_train_x)
    # Only transform the test set: re-fitting on it would leak test
    # statistics and scale the two partitions inconsistently. Test values
    # may therefore fall slightly outside [0, 1].
    p_test_x = scaler.transform(p_test_x)

    return p_train_x, p_test_x, p_train_y, p_test_y

# Models

In [None]:
def plot_loss(model):
    """Plot the training loss recorded by a fitted model.

    Parameters
    ----------
    model : estimator
        Fitted estimator exposing a ``loss_curve_`` sequence (one loss value
        per training iteration). ``AttributeError`` is raised if the
        attribute is missing.
    """
    plt.plot(model.loss_curve_)
    # Label the figure so it can be read on its own when skimming the notebook.
    plt.title('Training loss curve')
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.show()

In [None]:
def predict_evalute(x_test, y_test, model):
    """Score ``model`` on held-out data: text report plus confusion matrix.

    Predictions for ``x_test`` are compared against ``y_test``. A
    classification report (classes named '0' and '1') is printed and a
    confusion-matrix figure is shown. Nothing is returned.
    """
    predictions = model.predict(x_test)

    print('Classification Report')
    print(classification_report(y_test, predictions, target_names=['0','1']))

    # The display object is not needed afterwards; the call draws the figure.
    ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=predictions)
    plt.title("Confusion Matrix")
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

MLP

In [None]:
def train_mlp(x_test, y_test, inpu_dim):
    """Grid-search a single-hidden-layer MLP classifier and return the best fit.

    NOTE: despite their names, ``x_test`` / ``y_test`` are the data the
    search is *fitted* on (the caller in this notebook passes the training
    split). The parameter names are kept unchanged so existing callers do
    not break.

    Parameters
    ----------
    x_test : array-like
        Feature matrix to fit on.
    y_test : array-like
        Labels matching ``x_test``.
    inpu_dim : int
        Unused; accepted only for backward compatibility.

    Returns
    -------
    MLPClassifier
        The estimator refitted with the best parameter combination.
    """
    # max_iter=10 keeps the 10-fold grid search cheap; the networks will
    # usually stop before converging.
    mlp = MLPClassifier(max_iter=10)

    parameter_space = {
        # three candidates, each a single hidden layer of that width
        'hidden_layer_sizes': [16, 32, 64],
        'activation': ['logistic', 'relu'],
        'solver': ['sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive'],
    }

    gscv = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=10)
    # Fit on the data passed in, not on notebook globals that merely happen
    # to be called x_train / y_train.
    gscv.fit(x_test, y_test)
    print('Best parameters found:\n', gscv.best_params_)
    print('Best achieved score:\n', gscv.best_score_)

    plot_loss(gscv.best_estimator_)

    return gscv.best_estimator_

Suggested Model

In [None]:
def train(x_train, y_train):
    """Grid-search the deeper five-layer MLP and return the best estimator.

    The architecture is fixed at hidden layers of 16, 32, 64, 128 and 256
    units. Only the regularisation strength and the learning-rate schedule
    are searched, using 10-fold cross-validation on the given data. The best
    parameters and score are printed and the winner's loss curve is plotted.
    """
    search_grid = dict(
        activation=['relu'],
        solver=['sgd'],
        alpha=[1e-2, 1e-3, 1e-4],
        learning_rate=['constant', 'adaptive'],
    )
    base_net = MLPClassifier(max_iter=10, hidden_layer_sizes=(16, 32, 64, 128, 256))

    search = GridSearchCV(base_net, search_grid, n_jobs=-1, cv=10)
    search.fit(x_train, y_train)
    winner = search.best_estimator_

    print('Best parameters found:\n', search.best_params_)
    print('Best achieved score:\n', search.best_score_)

    plot_loss(winner)

    return winner

In [None]:
# Load the CSV that was uploaded earlier in the notebook.
df = pd.read_csv(io.BytesIO(uploaded['BankChurners1.csv']))

visualize(df)

# Separate the label from the features.
y = df['Churn']
x = df.drop(columns='Churn')

# A fixed seed makes the split (and therefore the reported metrics)
# repeatable across runs; stratifying on the label keeps the churn ratio the
# same in both partitions.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, shuffle=True, random_state=SPLIT_SEED, stratify=y)
x_train, x_test, y_train, y_test = preprocess_data(x_train, x_test, y_train, y_test)

# Baseline: grid-searched single-hidden-layer MLP.
mlp_model = train_mlp(x_train, y_train, x_train.shape[1])
print('\n\nMLP:\n')
predict_evalute(x_test, y_test, mlp_model)

# Suggested model: deeper fixed architecture.
ann_model = train(x_train, y_train)
print('\n\nANN:\n')
predict_evalute(x_test, y_test, ann_model)