In [1]:
# Mount the Google Drive

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Load packages 
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pre processing packages 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Modelling packages 
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier

# Evaluation packages 
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import  RandomizedSearchCV, GridSearchCV, cross_val_score

plt.rcParams["figure.figsize"] = (10,7)
file_location = '/content/drive/MyDrive/SW_CodingPractice/RIS/'
os.chdir(file_location)

In [3]:
data = pd.read_csv('bank-additional/bank-additional-full.csv', sep = ';')

# Data Preparation 

Now we have a great deal of understanding about the dataset. For building the model we need to do some data preparation so we can feed it to the model. From the previous analysis I know there are duplicate entries. Let me remove those first. 

In [None]:
data = data.drop_duplicates().reset_index(drop=True)

There are several categorical features in our dataset. There are couple of ways to handle them. <br>
1. One Hot encoding : If the variables have no order (Nominal variables) then assign a vector of size equal to the number of categories and then binary [0,1] based on the presence or absence of that category for each row/data sample. The issue with this approach is that it introduces sparseness. It becomes challenging when we have very high cardinality.
2. Numerical Encoding: If there is an inherent order to the variable(Ordinal variables) then assign a numerical value. For instance T shirt size Small, Medium and Large.
3. Binary Encoding: We can first convert the categories into numerical using an ordinal encoder. Then transform the numerical value in the binary number. This binary number is then split into different columns. This way we do not get a sparse matrix like we do in One hot encoding. This technique is generally preferred when the cardinality is high.
4. Bin or combine the categories if that is feasible for the dataset and then convert the bins to numbers.




I will also remove the feature `duration` as discussed in the EDA.

In [None]:
data = data.drop(columns = 'duration')

We know there were 96% clients who were not previously contacted. The column pdays has 96% 999 values and rest are small values in magnitude. I will convert this column into two categories the Clients who were not contacted previosuly i.e pdays = 999 as Category A and the rest as Category B.

In [None]:
data.pdays = np.where(data.pdays==999,'A','B')
data.pdays = data.pdays.astype('object')

Let me rename the y column to target and assign numerical values instead of 'yes' and 'no'.

In [None]:
data = data.rename(columns = {'y':'target'})
map_values = {'yes':1,'no':0}
data.target = data.target.replace(map_values)

In [None]:
data.shape[0]

41176

For modelling purpose, My intention is the keep some portion of dataset for final evaluation. It behaves as unseen data, I will call this Test dataset. Besides that I need data to train and validate my results. I will split the data into 3 parts. This practice reduces the risk of Data Leakage.
This technique of data splitting has been stressed a lot by Prof Andrew Ng. Youtube video from the course on Machine Learning I attended: https://www.youtube.com/watch?v=MyBSkmUeIEs

Also, I know our dataset is imbalanced I need to make sure there is sufficient representation of minority class in each of the dataset. 

In [None]:
# Split the data in Train and Test(HoldOut)
train_validation, test_data = train_test_split(data, test_size = 0.10, random_state = 123)

# Next I split the Train data further into Train and Validation(To check in case we are overfitting)
train_data, validation_data = train_test_split(train_validation, test_size = 0.15, random_state = 123)

train_data = train_data.reset_index(drop=True)
validation_data= validation_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

What is the porportion of minority class in each dataset?

In [None]:
train_data.target.value_counts(normalize = True)

0    0.886758
1    0.113242
Name: target, dtype: float64

In [None]:
validation_data.target.value_counts(normalize = True)

0    0.890808
1    0.109192
Name: target, dtype: float64

In [None]:
test_data.target.value_counts(normalize = True)

0    0.887081
1    0.112919
Name: target, dtype: float64

In [None]:
print('Sample size for training dataset: {0}'.format(train_data.shape[0]))
print('Sample size for validation dataset: {0}'.format(validation_data.shape[0]))
print('Sample size for test dataset: {0}'.format(test_data.shape[0]))

Sample size for training dataset: 31499
Sample size for validation dataset: 5559
Sample size for test dataset: 4118


Great, we have proportion of both the categories in all the 3 datasets.

# One Hot Encoding 

Only the columns with object category need to be encoded.

Let us use One hot encoding from sklearn. I am implementing this step by step for each of the datasets. This is not the most optimized way but I am interested in checking the outputs of each step. If I have to write a script I will use functions instead of doing the following tasks step by step.

In [None]:
dtype_groups = train_data.drop(columns='target').columns.to_series().groupby(train_data.drop(columns='target').dtypes).groups
feature_types = {k.name: list(v) for k, v in dtype_groups.items()}

In [None]:
feature_types['object']

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'pdays',
 'poutcome']

## Training data encoding 

In [None]:
train_data_categorical = train_data.loc[:,feature_types['object']]
train_data_numeric = train_data.loc[:,feature_types['float64'] + feature_types['int64']]

In [None]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=int)

In [None]:
ohe.fit(train_data_categorical)

OneHotEncoder(dtype=<class 'int'>, handle_unknown='ignore', sparse=False)

In [None]:
ohe_array_train = ohe.transform(train_data_categorical)
ohe_array_train_df = pd.DataFrame(ohe_array_train, columns = ohe.get_feature_names_out(feature_types['object']))
train_data_encoded = pd.concat([train_data_numeric,ohe_array_train_df], axis =1)

## Validation data encoding

In [None]:
validation_data_categorical = validation_data.loc[:,feature_types['object']]
validation_data_numeric = validation_data.loc[:,feature_types['float64'] + feature_types['int64']]

In [None]:
ohe_array_valid = ohe.transform(validation_data_categorical)
ohe_array_valid_df = pd.DataFrame(ohe_array_valid, columns = ohe.get_feature_names_out(feature_types['object']))
valid_data_encoded = pd.concat([validation_data_numeric,ohe_array_valid_df], axis =1)

## Test data encoding 

In [None]:
test_data_categorical = test_data.loc[:,feature_types['object']]
test_data_numeric = test_data.loc[:,feature_types['float64'] + feature_types['int64']]

In [None]:
ohe_array_test = ohe.transform(test_data_categorical)
ohe_array_test_df = pd.DataFrame(ohe_array_test, columns = ohe.get_feature_names_out(feature_types['object']))
test_data_encoded = pd.concat([test_data_numeric,ohe_array_test_df], axis =1)

In [None]:
pd.concat([train_data_encoded,train_data.target], axis =1).to_csv('train_data_encoded.csv', index = False)
pd.concat([valid_data_encoded,validation_data.target], axis =1).to_csv('validation_data_encoded.csv', index = False)
pd.concat([test_data_encoded,test_data.target], axis =1).to_csv('test_data_encoded.csv', index = False)

# Scaling 

We can see the column values are at different scales. For any algorithm based on calculating the distance we need the features to be on same scale also for gradient descent the convergence will be faster. Let us use the standard scaler from sklearn to scale the features.

In [None]:
scaler = StandardScaler()

# Fit on the training dataset
scaler.fit(train_data_encoded)

# Transform the training and validation datasets
X_train_scaled = scaler.transform(train_data_encoded)
X_validation_scaled = scaler.transform(valid_data_encoded)

In [None]:
y_train = train_data.target
y_validation = validation_data.target


# Modelling 

At this point we have the dataset ready to be put into any ML predictive algorithm. But its important to make a choice of which algorithms to go forward with. My intention is to first use the baseline models and later dig into hyperparameter tuning to find the best set of hyperparameters of the model. I will not use the Test dataset till the last step.

We have a binary classification problem where the target class is highly imbalanced. Following models can be used for prediction:

- SVM
- Naive Bayes
- Logistic Regression
- Tree based algorithm (Decision Tree)
- Ensemble techniques (Bagging - Random forest) and (Boosting - XGBoost Ada boost etc.)
- Neural networks (MLP) <br>
There are pros and cons of each of the algorithm, for instance if we are intersted in the interpretation of the results Logistic regression is a better choice but it will be able to fit a linear hyperplane only. SVM and Neural networks can model more complex non linear hyperplanes but they are not so interpretable.

## Logistic regression


In [None]:
# Fit the model 
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)

LogisticRegression()

In [None]:
# Predictions on Training and Validation datasets
y_predict_train = lr.predict_proba(X_train_scaled)
y_predict_valid = lr.predict_proba(X_validation_scaled)

In [None]:
# Evaluation with AUC 
train_score_lr = roc_auc_score(y_train, y_predict_train[:,1])
valid_score_lr = roc_auc_score(y_validation, y_predict_valid[:,1])
print("Training ROC-AUC score of baseline Logistic Regression model: ", train_score_lr)
print("Validation ROC-AUC score of baseline Logistic Regression model: ", valid_score_lr)

Training ROC-AUC score of baseline Logistic Regression model:  0.7962220697700664
Validation ROC-AUC score of baseline Logistic Regression model:  0.787747549456662


## Random Forest

In [None]:
# Fit the model 
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_scaled, y_train)

RandomForestClassifier(random_state=1)

In [None]:
# Predictions on Training and Validation datasets
y_predict_train = rf.predict_proba(X_train_scaled)
y_predict_valid = rf.predict_proba(X_validation_scaled)

In [None]:
# Evaluation with AUC 
train_score_rf = roc_auc_score(y_train, y_predict_train[:,1])
valid_score_rf = roc_auc_score(y_validation, y_predict_valid[:,1])
print("Training ROC-AUC score of Random Forest model: ", train_score_rf)
print("Validation ROC-AUC score of Random Forest model: ", valid_score_rf)

Training ROC-AUC score of Random Forest model:  0.9998147409217331
Validation ROC-AUC score of Random Forest model:  0.7723518096627127


## Xgboost 

In [None]:
# Fit the model 
xgbc = xgb.XGBClassifier(random_state = 1, verbosity=0)


In [None]:
xgbc.fit(X_train_scaled, y_train)

XGBClassifier(random_state=1, verbosity=0)

In [None]:
# Predictions on Training and Validation datasets
y_predict_train = xgbc.predict_proba(X_train_scaled)
y_predict_valid = xgbc.predict_proba(X_validation_scaled)

In [None]:
# Evaluation with AUC 
train_score_xgb = roc_auc_score(y_train, y_predict_train[:,1])
valid_score_xgb = roc_auc_score(y_validation, y_predict_valid[:,1])
print("Training ROC-AUC score of XGBoost model: ", train_score_xgb)
print("Validation ROC-AUC score of XGBoost model: ", valid_score_xgb)

Training ROC-AUC score of XGBoost model:  0.8154911969117519
Validation ROC-AUC score of XGBoost model:  0.7948942467124263


## Linear SVM

In [None]:
# Fit the model 
lsvc = SGDClassifier(random_state = 42)
clsvc = CalibratedClassifierCV(lsvc)
clsvc.fit(X_train_scaled, y_train)

CalibratedClassifierCV(base_estimator=SGDClassifier(random_state=42))

In [None]:
# Predictions on Training and Validation datasets
y_predict_train = clsvc.predict_proba(X_train_scaled)
y_predict_valid = clsvc.predict_proba(X_validation_scaled)

In [None]:
# Evaluation with AUC 
train_score_lsvc = roc_auc_score(y_train, y_predict_train[:,1])
valid_score_lsvc = roc_auc_score(y_validation, y_predict_valid[:,1])
print("Training ROC-AUC score of Linear SVM model: ", train_score_lsvc)
print("Validation ROC-AUC score of Linear SVM model: ", valid_score_lsvc)

Training ROC-AUC score of Linear SVM model:  0.7702768158852363
Validation ROC-AUC score of Linear SVM model:  0.7646037877961211


## Naive Bayes

In [None]:
# Fit the model 
nb = GaussianNB()
nb.fit(X_train_scaled, y_train)

GaussianNB()

In [None]:
# Predictions on Training and Validation datasets
y_predict_train = nb.predict_proba(X_train_scaled)
y_predict_valid = nb.predict_proba(X_validation_scaled)

In [None]:
# Evaluation with AUC 
train_score_nb = roc_auc_score(y_train, y_predict_train[:,1])
valid_score_nb = roc_auc_score(y_validation, y_predict_valid[:,1])
print("Training ROC-AUC score of Linear SVM model: ", train_score_nb)
print("Validation ROC-AUC score of Linear SVM model: ", valid_score_nb)

Training ROC-AUC score of Linear SVM model:  0.7721882272783825
Validation ROC-AUC score of Linear SVM model:  0.7606162487723995


In [None]:
Model_comparison = {
    'models':['Logistic Regression','Linear SVM', 'Naive Bayes', 'Random Forest',  'XGBoost' ],
    'Train ROC-AUC score':[train_score_lr, train_score_lsvc, train_score_nb, train_score_rf,  train_score_xgb ],
    'Validation ROC-AUC score':[valid_score_lr, valid_score_lsvc, valid_score_nb,valid_score_rf,  valid_score_xgb]
}

pd.DataFrame(Model_comparison)

Unnamed: 0,models,Train ROC-AUC score,Validation ROC-AUC score
0,Logistic Regression,0.796222,0.787748
1,Linear SVM,0.770277,0.764604
2,Naive Bayes,0.772188,0.760616
3,Random Forest,0.999815,0.772352
4,XGBoost,0.815491,0.794894


Random forest is overfitting. The Bias for training data is very low but the variance is very high. We need to find a balance.
Logistic regression seems to be more robust. Naive bayes and Linear SVM also seem okay but the score is low.
For Naive Bayes, most of its assumptions do not hold true in current scenario. For instance the features are not independent nor Gaussian and do not contribute equally to the target prediction. Linear SVM is also performing poorly. XGBoost looks promising. 

For further analysis, I will take Logistic regression(Linear model of classification), XGBoost (Ensemble - Boosting) and Random Forest(Ensemble - Bagging) for tuning. I am curious to see if we can reduce the overfitting for Random forest. 

# END