In [None]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.pyplot import figure
import seaborn as sns
# import library for preprocessing
from sklearn.preprocessing import StandardScaler

from utilities.utlity import * 

from sklearn.neural_network import MLPClassifier

# import libraries for cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# import evaluation metrics
from sklearn.metrics import accuracy_score,recall_score,precision_recall_curve, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix


import warnings
warnings.filterwarnings('ignore')



## import the dataset

In [None]:
# import the dataset

# Read the CSV using ';' as the field separator.
dataset = pd.read_csv('data/bank-additional/bank-additional-full.csv', sep=';')
# Attach a label to the frame as a plain attribute.
# NOTE(review): presumably read by the helpers in utilities.utlity — confirm;
# ad-hoc attributes like this are generally not carried over to derived frames.
dataset.name = 'dataset'

In [None]:
# Preview the first rows of the raw dataset.
dataset.head()

# Exploring the dataset

#### Check the shape and size of the imported dataset

In [None]:
# shape() and size() come from the star import of utilities.utlity.
# NOTE(review): presumably they report the frame's dimensions and element
# count — confirm against the helper module.
shape(dataset)
size(dataset)

Our dataset has 4119 rows and 21 columns

#### Check the schema of the dataset

In [None]:
# check_info() is a helper from utilities.utlity.
# NOTE(review): presumably prints column names, dtypes and non-null counts — confirm.
check_info(dataset)

#### Check the statistical data of the dataset

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
dataset.describe().T

#### Unique values of each column. 

In [None]:
# get_unique_values() is a helper from utilities.utlity.
# NOTE(review): presumably lists the distinct values of every column — confirm.
get_unique_values(dataset)

#### Check each column and the number of rows with no value

In [None]:
# check_missing_val() is a helper from utilities.utlity.
# NOTE(review): presumably reports the number of missing values per column — confirm.
check_missing_val(dataset)

The result shows that the dataset doesn't have missing values

### Explore the columns that have categorical data type

In [None]:
# Keep only the columns whose dtype is not numeric (the categorical ones)
categorical_data = dataset.select_dtypes(exclude='number')
# label the selection with a plain attribute, as done for `dataset`
categorical_data.name = "categorical_data"
# preview the first rows
categorical_data.head()

#### 11 out of 21 columns have a non-numerical data type

In [None]:
# Keep only the columns whose dtype is numeric
numberical_data = dataset.select_dtypes(include='number')
# label the selection with a plain attribute, as done for `dataset`
numberical_data.name = "numberical_data"
# preview the first rows
numberical_data.head()

#### 10 out of 21 columns have a numerical data type

## Correlation

In [None]:
# Visualize the correlation between the columns that have a numerical data type.
# The correlation is computed on the numeric selection (numberical_data) rather
# than on the full frame: `dataset` also holds string columns, and
# DataFrame.corr() raises on those in pandas >= 2.0 instead of dropping them.
sns.heatmap(numberical_data.corr(), annot=True, fmt='.1g',
            vmin=-1, vmax=1, center=0);

### Outliers

In [None]:
# Copying the dataset to a new df to handle outliers.
# A deep copy is taken so that the outlier treatment applied to dataset_new
# later on leaves the original `dataset` untouched.
dataset_new = dataset.copy(deep=True)
dataset_new.head()

In [None]:
# Draw one boxplot per numerical column to spot outliers visually
for column_name in numberical_data.columns:
    ax = sns.boxplot(numberical_data[column_name])
    # save(f"{column_name}")
    plt.show()

### Class Imbalance

In [None]:
# Absolute number of rows per class of the target variable 'y'
y_counts = dataset['y'].value_counts()
print("Value count in y\n-----------------\n", y_counts)

# Same distribution expressed as a percentage of all rows
y_percentages = dataset['y'].value_counts(normalize=True) * 100
print("\nPercentage of value count in y\n------------------------------\n",
      y_percentages)

### Replacing outlier datapoints with nan

In [None]:
# Treating outliers with the IQR rule: for every numerical column, values
# outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are replaced with NaN in dataset_new.
# The fences are computed on the untouched numeric selection (numberical_data).
for count, col in enumerate(numberical_data.columns, start=1):
    Q1 = numberical_data[col].quantile(0.25)
    Q3 = numberical_data[col].quantile(0.75)
    IQR = Q3 - Q1
    print(f'column {count}: {col}\n------------------------')
    print('1st quantile => ',Q1)
    print('3rd quantile => ',Q3)
    print('IQR =>',IQR)

    fence_low  = Q1-(1.5*IQR)
    print('fence_low => ' + str(fence_low))

    fence_high = Q3+(1.5*IQR)
    print('fence_high => ' + str(fence_high))
    print("\n------------------------")

    # Replace outliers with NaN. The masked column is assigned back to the frame
    # instead of writing through `dataset_new[col][mask] = ...`: that chained
    # assignment targets an intermediate Series and is not guaranteed to reach
    # dataset_new (it is a no-op under pandas Copy-on-Write). Series.mask also
    # upcasts integer columns so that they can hold NaN.
    is_outlier = (dataset_new[col] < fence_low) | (dataset_new[col] > fence_high)
    dataset_new[col] = dataset_new[col].mask(is_outlier)

In [None]:
# Checking the columns in which outliers were replaced with NaN:
# prints the number of null values per numerical column of dataset_new.
print(dataset_new.select_dtypes(include='number').isnull().sum())

In [None]:
# replace_outliers_with_nan(numberical_data, dataset_new)

In [None]:
# get_column_with_nan_values() is a helper from utilities.utlity.
# NOTE(review): presumably lists the columns of dataset_new that contain NaN — confirm.
get_column_with_nan_values(dataset_new)

In [None]:
# Dealing with the NaN values in dataset_new.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on `dataset_new[col]`: the in-place form operates on an
# intermediate Series and does not update dataset_new under pandas Copy-on-Write.

# mode: fill with the most frequent value of the column
columns_mode = ['age', 'pdays']
for col in columns_mode:
    dataset_new[col] = dataset_new[col].fillna(dataset_new[col].mode()[0])

# median: fill with the median of the column
columns_median = ['duration', 'campaign', 'previous', 'cons.conf.idx']
for col in columns_median:
    dataset_new[col] = dataset_new[col].fillna(dataset_new[col].median())

In [None]:
# Re-draw the boxplots on the treated data to check whether outliers were removed
numeric_columns_new = dataset_new.select_dtypes(include='number')
for col in numeric_columns_new:
    ax = sns.boxplot(numeric_columns_new[col])
    # save(f"{col}2")
    plt.show()

In [None]:
# export_to_csv() is a helper from utilities.utlity.
# NOTE(review): presumably writes dataset_new to the given CSV path without the
# index — confirm. The preprocessing section reads this same file name back.
export_to_csv(dataset_new, 'bank-addition-full-without-outliers.csv', index=False)

## Data preprocessing

In [None]:
# import the dataset without outliers
# (rebinds dataset_new to the frame read back from the exported CSV)
dataset_new = pd.read_csv('bank-addition-full-without-outliers.csv')
# label the frame with a plain attribute, as done for the raw dataset
dataset_new.name = 'New dataset'
print("New Dataset\n-------------------------")
print(dataset_new.head())

In [None]:
# Preprocessing - using the new dataset, i.e. the data without outliers.
# Collapse the three education levels basic.4y, basic.6y and basic.9y into a
# single 'basic' category.
dataset_new['education'] = dataset_new['education'].replace(['basic.4y', 'basic.6y', 'basic.9y'], 'basic')

In [None]:
# Binary output variable for classification: 1 where y is 'yes', 0 otherwise
is_subscribed = dataset_new['y'] == 'yes'
dataset_new['subscribed'] = is_subscribed.astype('int')


In [None]:
# encoding categorical columns
# encode() is a helper from utilities.utlity.
# NOTE(review): encoded_data is only printed here; the next step calls
# preprocessed(dataset_new), not preprocessed(encoded_data) — confirm intended.
encoded_data = encode(dataset_new)
print("Encoded Data\n-------------------------")
print(encoded_data.head())

In [None]:
# preprocessed data
# preprocessed() is a helper from utilities.utlity.
# NOTE(review): it receives dataset_new (not encoded_data), so it presumably
# performs its own encoding/cleanup — confirm against the helper module.
preprocessed_data = preprocessed(dataset_new)
print("Preprocessed Data\n-------------------------")
print(preprocessed_data.head())

In [None]:
# Rescale numerical columns
# rescale() is a helper from utilities.utlity.
# NOTE(review): presumably standardizes the numeric features (StandardScaler is
# imported at the top of the notebook) — confirm.
rescaled_data = rescale(preprocessed_data)
print("Rescaled Data\n-------------------------")
print(rescaled_data.head())

In [None]:
# input and target variables
# split_input_output_variables() is a helper from utilities.utlity that returns
# the feature matrix X and the target y.
# NOTE(review): which column is used as the target is not visible here — confirm.
X, y = split_input_output_variables(rescaled_data)


In [None]:
# splitting the data into training and test parts
# split_data() is a helper from utilities.utlity.
# NOTE(review): test size, stratification and random seed are set inside the
# helper and are not visible here — confirm.
X_train,X_test,y_train,y_test = split_data(X, y)

In [None]:
# Sanity check on the feature matrix before modelling.
# NOTE(review): presumably lists the columns of X that still contain NaN — confirm.
get_column_with_nan_values(X)

# Dimensionality Reduction

In [None]:
# dimensionality reduction
# dimension_reduction() is a helper from utilities.utlity; it returns the reduced
# training and test feature sets.
# NOTE(review): presumably 'PCA' selects the method and 20 is the number of
# components, fitted on X_train only — confirm.
X_train_reduced, X_test_reduced = dimension_reduction('PCA', 20, X_train, X_test)

### Dealing with imbalanced data

In [None]:
# dealing with imbalanced class
# class_imbalance() is a helper from utilities.utlity; only the training data is
# passed to it, the test data is left as is.
# NOTE(review): the variable names suggest SMOTE oversampling — confirm.
X_train_smote, y_train_smote = class_imbalance(X_train_reduced, y_train)

After solving class imbalance

In [None]:
# Summary statistics of the resampled training target.
y_train_smote.describe().T

## Model

In [None]:
# Scorer names handed to model(), which passes them as `scoring` to cross_validate.
metrics = ['accuracy', 'roc_auc', 'f1', 'precision', 'recall']

In [None]:
def _print_cv_scores(cv_scores):
    """Print the mean of every entry of a cross_validate result dict."""
    for metric, values in cv_scores.items():
        if metric.startswith('test_'):
            # scores are fractions: report them as percentages
            mean_score = values.mean()*100
            print(metric+':', '%.2f%%' % mean_score)
        else:
            # fit_time / score_time are durations in seconds, not scores
            print(metric+':', '%.2fs' % values.mean())
        print('')


def model(model, cross_validation_method, metrics, X_train, X_test, y_train):
    """Cross-validate a classifier on the training data and print the mean scores.

    Parameters
    ----------
    model : str
        Name of the classifier to build. Only 'MLP' (MLPClassifier) is supported.
    cross_validation_method : str
        'KFold' or 'StratifiedKFold'; both use 10 shuffled splits with
        random_state=100.
    metrics : list of str
        Scorer names passed to cross_validate as `scoring`.
    X_train, y_train
        Training features and target used for the cross validation.
    X_test
        Unused; kept so that existing calls keep working.

    Returns
    -------
    The classifier instance that was cross-validated. This function does not
    call fit() on it; prediction() does that.

    Raises
    ------
    ValueError
        If `model` or `cross_validation_method` is not one of the supported names.
    """
    # Validate both names up front. Previously an unknown name left `model_inst`
    # or `cv` undefined and surfaced later as a confusing NameError.
    if model != 'MLP':
        raise ValueError(f"Model not found: {model!r} (supported: 'MLP')")
    if cross_validation_method not in ('KFold', 'StratifiedKFold'):
        raise ValueError(
            'Cross validation method not found! '
            f"Got {cross_validation_method!r} (supported: 'KFold', 'StratifiedKFold')")

    # creating an instance of the classifier
    model_inst = MLPClassifier()
    print('Multi Layer Perceptron\n----------------------')

    # cross validation
    if cross_validation_method == 'KFold':
        print('Cross validation: KFold\n--------------------------')
        cv = KFold(n_splits=10, random_state=100, shuffle=True)
    else:
        print('Cross validation: StratifiedKFold\n-----------------')
        cv = StratifiedKFold(n_splits=10, random_state=100, shuffle=True)

    try:
        cv_scores = cross_validate(model_inst, X_train, y_train,
                                   cv=cv, scoring=metrics)
    except ValueError:
        # Fall back to the metrics that do not need probability scores (drops
        # 'roc_auc'). Only ValueError is handled, so unrelated failures are no
        # longer silently swallowed by a bare except.
        print('Scoring with the requested metrics failed; retrying without roc_auc')
        metrics = ['accuracy', 'f1', 'precision', 'recall']
        cv_scores = cross_validate(model_inst, X_train, y_train,
                                   cv=cv, scoring=metrics)

    # displaying evaluation metric scores
    _print_cv_scores(cv_scores)
    return model_inst
        
        

In [None]:
# function to make predictions
def prediction(model, model_name, X_train, y_train, X_test, y_test):
    """Fit `model` on the training data and plot its test-set confusion matrix.

    The heatmap is titled with `model_name`, saved as 'conf_<model_name>.png'
    and then shown. Nothing is returned.
    """
    # train on the full training set, then predict the held-out samples
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)

    # confusion matrix of true vs. predicted labels, drawn as an annotated heatmap
    sns.heatmap(confusion_matrix(y_test, predicted_labels), annot=True, fmt='.0f')
    plt.title(f'{model_name} Confusion Matrix')
    plt.savefig(f'conf_{model_name}.png')
    plt.show()

In [None]:
# 3. Multi Layer Perceptron
# KFold cross validation
# model() cross-validates the classifier on the resampled, reduced training data
# and returns the classifier instance.
model_res = model('MLP', 'KFold', metrics, X_train_smote, X_test_reduced, y_train_smote)
# StratifiedKFold cross validation
# model_res = model('MLP', 'StratifiedKFold', metrics, X_train_smote, X_test_reduced, y_train_smote)
# make prediction
# prediction() fits the returned classifier on the training data and plots the
# confusion matrix for the reduced test set against y_test.
prediction(model_res, 'Multi Layer Perceptron', X_train_smote, y_train_smote, X_test_reduced, y_test)