## BANK MARKETING CAMPAIGN - HYPERPARAMETER TUNING

Data taken from : https://archive.ics.uci.edu/ml/datasets/bank+marketing

In this section we will try to find the best hyperparameter to best tune the respected models. This data related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be ('yes') or not ('no') subscribed. 

### IMPORT LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn import tree
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, classification_report, recall_score
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('bank_HP.csv')

In [3]:
df.head()

Unnamed: 0,duration,day,balance,age,campaign,contact_unknown,poutcome,education,housing_yes,pdays,deposit
0,1042,5,2343,59,1,1,3,1,1,0,1
1,1467,5,45,56,1,1,3,1,0,0,1
2,1389,5,1270,41,1,1,3,1,1,0,1
3,579,5,2476,55,1,1,3,1,1,0,1
4,673,5,184,54,2,1,3,2,0,0,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   duration         11162 non-null  int64
 1   day              11162 non-null  int64
 2   balance          11162 non-null  int64
 3   age              11162 non-null  int64
 4   campaign         11162 non-null  int64
 5   contact_unknown  11162 non-null  int64
 6   poutcome         11162 non-null  int64
 7   education        11162 non-null  int64
 8   housing_yes      11162 non-null  int64
 9   pdays            11162 non-null  int64
 10  deposit          11162 non-null  int64
dtypes: int64(11)
memory usage: 959.4 KB


### PARAMETERS

In [5]:
# Random Forest Classifier

max_depth = [10, 20, 40, 'None']
min_samples_leaf = [2, 4, 8]
min_samples_split = [2, 10, 100]
n_estimators = [10, 100, 500]

RFC_param = {'max_depth' : max_depth, 
             'min_samples_leaf': min_samples_leaf, 
             'min_samples_split' : min_samples_split, 
             'n_estimators' : n_estimators}

In [6]:
# Support Vector Machine Classifier

C = [0.01, 0.1, 1, 10, 100]
gamma = [1, 0.1, 0.01, 0.001, 'scale', 'auto']

SVM_param= {'C': C, 
            'gamma': gamma}

### SPLIT DATA

In [7]:
# Split target predictors

X = df.drop(['deposit'], axis = 1)
y = df['deposit']

In [8]:
# Split 80% train data

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = .8, shuffle = False)

In [9]:
X_train.head()

Unnamed: 0,duration,day,balance,age,campaign,contact_unknown,poutcome,education,housing_yes,pdays
0,1042,5,2343,59,1,1,3,1,1,0
1,1467,5,45,56,1,1,3,1,0,0
2,1389,5,1270,41,1,1,3,1,1,0
3,579,5,2476,55,1,1,3,1,1,0
4,673,5,184,54,2,1,3,2,0,0


In [10]:
X_test.head()

Unnamed: 0,duration,day,balance,age,campaign,contact_unknown,poutcome,education,housing_yes,pdays
8929,203,3,183,48,9,1,3,1,1,0
8930,1075,2,86,24,2,1,3,1,0,0
8931,217,20,0,59,4,1,3,1,1,0
8932,317,11,2388,45,2,0,3,0,0,0
8933,156,5,160,39,1,0,3,1,1,0


### MODEL

In [11]:
# Random Forest Classifier

RFC = RandomForestClassifier().fit(X_train, y_train)

# Support Vector Machine

SVM = SVC(kernel='rbf').fit(X_train, y_train)

### HYPERPARAMETER TUNING

In [12]:
# Random Forest Classifier

def CVRFC (est, xtr, ytr):
    result = RandomizedSearchCV(estimator = est, 
                                param_distributions = RFC_param, 
                                cv=5, scoring = 'accuracy').fit(xtr, ytr)
    
    return result

# Support Vector Machine Classifier

def CVSVM (est, xtr, ytr):
    result = RandomizedSearchCV(estimator = est, 
                                param_distributions = SVM_param, 
                                cv=5, scoring = 'accuracy').fit(xtr, ytr)
    
    return result

In [13]:
# Random Forest Classifier

for i in range(1,4):
    cv_rfc = CVRFC(RFC, X_train, y_train)
    print('Hyper Model', i, cv_rfc.best_params_)

Hyper Model 1 {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 40}
Hyper Model 2 {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_depth': 40}
Hyper Model 3 {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 8, 'max_depth': 40}


In [14]:
# Support Vector Machine Classifier

for i in range(1,4):
    cv_svm = CVSVM(SVM, X_train, y_train)
    print('Hyper Model', i, cv_svm.best_params_)

Hyper Model 1 {'gamma': 'scale', 'C': 100}
Hyper Model 2 {'gamma': 'scale', 'C': 100}
Hyper Model 3 {'gamma': 'scale', 'C': 10}


### APPLY TUNED PARAMETER

In [16]:
# Random Forest Classifier

hyper_RFC1 = RandomForestClassifier(n_estimators = 500, min_samples_split = 10, min_samples_leaf = 4, max_depth = 40).fit(X_train, y_train)
hyper_RFC2 = RandomForestClassifier(n_estimators = 100, min_samples_split = 2, min_samples_leaf = 8, max_depth = 10).fit(X_train, y_train)
hyper_RFC3 = RandomForestClassifier(n_estimators = 100, min_samples_split = 10, min_samples_leaf = 8, max_depth = 10).fit(X_train, y_train)

In [17]:
# Support Vector Machine Classifier

hyper_SVM1 = SVC(kernel='rbf', gamma = 'scale', C = 100).fit(X_train, y_train)
hyper_SVM2 = SVC(kernel='rbf', gamma = 'scale', C = 100).fit(X_train, y_train)
hyper_SVM3 = SVC(kernel='rbf', gamma = 'scale', C = 10).fit(X_train, y_train)

In [18]:
# y_predict for Random Forest Classifier

yp_RFC = RFC.predict(X_test)
yp_hyper_RFC1 = hyper_RFC1.predict(X_test)
yp_hyper_RFC2 = hyper_RFC2.predict(X_test)
yp_hyper_RFC3 = hyper_RFC3.predict(X_test)

# y_predict for Support Vector Machine Classifier

yp_SVM = SVM.predict(X_test)
yp_hyper_SVM1 = hyper_SVM1.predict(X_test)
yp_hyper_SVM2 = hyper_SVM2.predict(X_test)
yp_hyper_SVM3 = hyper_SVM3.predict(X_test)

### MEASURE THE DEFAULT VS THE HYPERPARAMETER TUNED

In [19]:
# Measure the default vs hyperparameter tuned model score for Random Forest Classifier Model

RFC_acc = RFC.score(X_test, y_test)
hyper_RFC1_acc = hyper_RFC1.score(X_test, y_test)
hyper_RFC2_acc = hyper_RFC2.score(X_test, y_test)
hyper_RFC3_acc = hyper_RFC3.score(X_test, y_test)

In [20]:
model_RFC_score = pd.DataFrame({'Random Forest Classifier' : ['Default', 'Hyper Test 1', 'Hyper Test 2', 'Hyper Test 3'], 
                                'Accuracy Score': [RFC_acc, hyper_RFC1_acc, hyper_RFC2_acc, hyper_RFC3_acc]})

In [21]:
model_RFC_score

Unnamed: 0,Random Forest Classifier,Accuracy Score
0,Default,0.742499
1,Hyper Test 1,0.740708
2,Hyper Test 2,0.738916
3,Hyper Test 3,0.737573


In [22]:
# Measure the default vs hyperparameter tuned model score for Support Vector Machine Classifier Model

SVM_acc = SVM.score(X_test, y_test)
hyper_SVM1_acc = hyper_SVM1.score(X_test, y_test)
hyper_SVM2_acc = hyper_SVM2.score(X_test, y_test)
hyper_SVM3_acc = hyper_SVM3.score(X_test, y_test)

In [25]:
model_SVM_score = pd.DataFrame({'Support Vector Classifier' : ['Default', 'Hyper Test 1', 'Hyper Test 2', 'Hyper Test 3'], 
                                'Accuracy Score': [SVM_acc, hyper_SVM1_acc, hyper_SVM2_acc, hyper_SVM3_acc]})

In [26]:
model_SVM_score

Unnamed: 0,Support Vector Classifier,Accuracy Score
0,Default,0.640394
1,Hyper Test 1,0.652485
2,Hyper Test 2,0.652485
3,Hyper Test 3,0.672638
