# Classifier: Ensemble: Voting Classifier

## 1.a. Import: Libraries

In [1]:
#data organizing
import pandas #storage
import numpy as np #data-type conversion
from os import getcwd

#scaling and encoding
from sklearn.preprocessing import StandardScaler

#dimentionality reduction/feature selection
from sklearn.feature_selection import SelectKBest #count of k best features chi2
from sklearn.feature_selection import mutual_info_classif

#smote for imbalanced classes
from imblearn.over_sampling import SMOTENC

#preprocessing - data splitting
from sklearn.model_selection import train_test_split

#classifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

from sklearn.ensemble import VotingClassifier

#classification result - statistical
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

#hyper-parameter optimization
from sklearn.model_selection import GridSearchCV

from sklearn.exceptions import *

## 1.b. Import: Dataset

In [2]:
#dtype changed from int64 to int32 to save space and speed up computation, no data was lost
def cvDefPay(prediction):
    mapper = {0: False, 1: True}
    return mapper.get(prediction)

url = getcwd() + '\\default of credit card clients.xls'
ccd = pandas.read_excel(io = url, \
                        sheet_name='Data', header = 1, index_col = 0, \
                        dtype = {'LIMIT_BAL': np.int32, 'AGE': np.int32, 'BILL_AMT1': np.int32, 'BILL_AMT2': np.int32, 'BILL_AMT3': np.int32, 'BILL_AMT4': np.int32, 'BILL_AMT5': np.int32, 'BILL_AMT6': np.int32, 'PAY_AMT1': np.int32, 'PAY_AMT2': np.int32, 'PAY_AMT3': np.int32, 'PAY_AMT4': np.int32, 'PAY_AMT5': np.int32, 'PAY_AMT6': np.int32}, \
                       converters = {'default payment next month': cvDefPay})

In [3]:
ccd.rename(columns = {'PAY_0': 'PAY_1'}, inplace = True)
ccd.rename(columns = {'default payment next month': 'default_payment_next_month'}, inplace = True)

## 2.a Feature Engineering

#### 1. PAY_1 to PAY_6

In [4]:
ccdr = pandas.read_excel(io = url, 
                        sheet_name='Data', header = 1, index_col = 0)
ccdr.rename(columns = {'PAY_0': 'PAY_1'}, inplace = True)

In [5]:
ccdrHistory = ccdr[['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]
ccdrHistoryMode = ccdrHistory.mode(axis = 'columns')
ccdrHistorySeverest = ccdrHistoryMode.apply(func = max, axis = 'columns')
ccd['PAY_MODE_SEVEREST'] = ccdrHistorySeverest

#### 2. BILL_AMT1 to BILL_AMT6

In [6]:
ccdSpent = ccd[['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']]
ccd['BILL_AMT_MEAN'] = np.int32(ccdSpent.mean(axis = 'columns').round())

#### 3. PAY_AMT1 to PAY_AMT6

In [7]:
ccdSettled = ccd[['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]
ccd['PAY_AMT_MEAN'] = np.int32(ccdSettled.mean(axis = 'columns').round())

## 2.b. Normalization

Scaling: Only to reduce the effect of very large continuous variables (in distance based esimators).

Normalization: Also reduce the effect of skewness in variables.

No need in Decision Trees

In [8]:
varsToScale = ['LIMIT_BAL', 'AGE', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 
               'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'BILL_AMT_MEAN', 'PAY_AMT_MEAN']
scaler = StandardScaler(copy = True)

In [9]:
for var in varsToScale:
    ccd[var] = scaler.fit_transform(ccd[var].values.reshape(-1, 1))


## 2.c. Feature Selection

In [10]:
ccdY = pandas.DataFrame(ccd['default_payment_next_month'])
ccdX = ccd.drop(['default_payment_next_month'], axis = 'columns')

In [11]:
featureFilter = SelectKBest(score_func = mutual_info_classif, k = np.int32(len(ccdX.columns) * 0.75))
featureFilter.fit(X = ccdX, y = ccdY.values.ravel())
filteredColumnsIndices = featureFilter.get_support(indices = True)

ccdXdr = ccdX.iloc[:, filteredColumnsIndices]

## 2.d. Encoding

OneHotEncoding should be done after dimentionality reduction to ensure that one of the categories of a variable isn't dropped during feature selection, which could have been the case if OneHotEncoder was used before Feature Selection.

No need to run any encoder since:

1. The dataset's relevant categorical variables are pre-encoded via OrdinalEncoder.
2. All of the categorical variables' categories have some difference in distribution in correlation with the target variable, so OneHotEncoder should not be used.

## 3.a. Data Splitting

Data is split before oversampling to avoid synthetic datapoints in test dataset.

Test dataset is separated even though GridSearchCV uses Stratified K-Fold cross-validation so that model's accuracy can be tested independently.

In [12]:
trainX, testX, trainY, testY = train_test_split(ccdXdr, ccdY, test_size = 0.25, stratify = ccdY, random_state = 44)

## 3.b. Oversampling

In [13]:
categoricalVars = {'LIMIT_BAL': False, 'SEX': True, 'EDUCATION': True, 'MARRIAGE': True, 'AGE': False, 
                   'PAY_1': True, 'PAY_2': True, 'PAY_3': True, 'PAY_4': True, 'PAY_5': True, 'PAY_6': True,
                   'BILL_AMT1': False, 'BILL_AMT2': False, 'BILL_AMT3': False, 'BILL_AMT4': False, 'BILL_AMT5': False, 'BILL_AMT6': False,
                   'PAY_AMT1': False, 'PAY_AMT2': False, 'PAY_AMT3': False, 'PAY_AMT4': False, 'PAY_AMT5': False, 'PAY_AMT6': False,
                   'PAY_MODE_SEVEREST': True, 'BILL_AMT_MEAN': False, 'PAY_AMT_MEAN': False}

def getSelectedCatBool(catVars, dfSelectedX):
    boolList = []
    for varName in dfSelectedX:
        if varName in list(catVars.keys()):
            boolList.append(catVars.get(varName))
    return boolList

trainXcat = getSelectedCatBool(categoricalVars, trainX.columns)

In [14]:
oversampler = SMOTENC(categorical_features = trainXcat, sampling_strategy = 'minority', random_state = 44, n_jobs = -1)

In [15]:
trainXoversampled, trainYoversampled = oversampler.fit_resample(trainX, trainY)

## 4. VotingClassifier

In [16]:
clfRF = RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight='balanced',
                               criterion='entropy', max_depth=9.5, max_features='auto',
                               max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0,
                               min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
                               min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                               oob_score=False, random_state=39, verbose=0, warm_start=False)

clfLG = LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=-1, penalty='none',
                   random_state=44, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

clfTrue = GaussianNB(priors=None, var_smoothing=1e-09)

clfFalse = SVC(cache_size = 500, max_iter = -1, random_state = 44, kernel = 'linear', C = 10,
               class_weight = {True: 1.25, False: 1.0}, probability = True)

### 4.a. Soft Vote

In [17]:
classifier = VotingClassifier(estimators = [('clfLG', clfLG), 
                                            ('clfTrue', clfTrue),
                                            ('clfFalse', clfFalse)],
                              n_jobs = -1, voting = 'soft')

#### 4.a.i Oversampling: False

In [18]:
classifier.fit(trainX, trainY.values.ravel())

VotingClassifier(estimators=[('clfLG',
                              LogisticRegression(C=1.0, class_weight='balanced',
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto', n_jobs=-1,
                                                 penalty='none',
                                                 random_state=44,
                                                 solver='newton-cg', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('clfTrue',
                              GaussianNB(priors=None, var_smoothing=1e-09)),
                             ('clfFalse',
                              SVC(C=10, break_ties=False, cache_size=500,
                                  class_weig

In [19]:
classifier.score(trainX, trainY)

0.7866222222222222

In [20]:
classifier.score(testX, testY)

0.782

In [21]:
predictY = classifier.predict(testX)
print(classification_report(testY, predictY))

              precision    recall  f1-score   support

       False       0.86      0.86      0.86      5841
        True       0.51      0.52      0.51      1659

    accuracy                           0.78      7500
   macro avg       0.68      0.69      0.69      7500
weighted avg       0.78      0.78      0.78      7500



In [22]:
roc_auc_score(testY, predictY)

0.6878449805933179

In [23]:
pandas.DataFrame(
    confusion_matrix(testY, predictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,5004,837
Defaulter,798,861


#### 4.a.ii Oversampling: True

In [24]:
classifier.fit(trainXoversampled, trainYoversampled.values.ravel())

VotingClassifier(estimators=[('clfLG',
                              LogisticRegression(C=1.0, class_weight='balanced',
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto', n_jobs=-1,
                                                 penalty='none',
                                                 random_state=44,
                                                 solver='newton-cg', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('clfTrue',
                              GaussianNB(priors=None, var_smoothing=1e-09)),
                             ('clfFalse',
                              SVC(C=10, break_ties=False, cache_size=500,
                                  class_weig

In [25]:
classifier.score(trainXoversampled, trainYoversampled)

0.6093134737202534

In [26]:
classifier.score(testX, testY)

0.5558666666666666

In [27]:
predictY = classifier.predict(testX)
print(classification_report(testY, predictY))

              precision    recall  f1-score   support

       False       0.89      0.49      0.63      5841
        True       0.30      0.79      0.44      1659

    accuracy                           0.56      7500
   macro avg       0.60      0.64      0.54      7500
weighted avg       0.76      0.56      0.59      7500



In [28]:
roc_auc_score(testY, predictY)

0.6380411526302966

In [29]:
pandas.DataFrame(
    confusion_matrix(testY, predictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,2866,2975
Defaulter,356,1303


### 4.b. Hard Vote

In [30]:
classifier = VotingClassifier(estimators = [('clfLG', clfLG), 
                                            ('clfTrue', clfTrue),
                                            ('clfFalse', clfFalse)],
                              n_jobs = -1, voting = 'hard')

#### 4.b.i. Oversampling: False

In [31]:
classifier.fit(trainX, trainY.values.ravel())

VotingClassifier(estimators=[('clfLG',
                              LogisticRegression(C=1.0, class_weight='balanced',
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto', n_jobs=-1,
                                                 penalty='none',
                                                 random_state=44,
                                                 solver='newton-cg', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('clfTrue',
                              GaussianNB(priors=None, var_smoothing=1e-09)),
                             ('clfFalse',
                              SVC(C=10, break_ties=False, cache_size=500,
                                  class_weig

In [32]:
classifier.score(trainX, trainY)

0.7788888888888889

In [33]:
classifier.score(testX, testY)

0.7757333333333334

In [34]:
predictY = classifier.predict(testX)
print(classification_report(testY, predictY))

              precision    recall  f1-score   support

       False       0.87      0.84      0.85      5841
        True       0.49      0.54      0.52      1659

    accuracy                           0.78      7500
   macro avg       0.68      0.69      0.69      7500
weighted avg       0.78      0.78      0.78      7500



In [35]:
roc_auc_score(testY, predictY)

0.6920215115881281

In [36]:
pandas.DataFrame(
    confusion_matrix(testY, predictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,4919,922
Defaulter,760,899


#### 4.b.ii. Oversampling: True

In [37]:
classifier.fit(trainXoversampled, trainYoversampled.values.ravel())

VotingClassifier(estimators=[('clfLG',
                              LogisticRegression(C=1.0, class_weight='balanced',
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto', n_jobs=-1,
                                                 penalty='none',
                                                 random_state=44,
                                                 solver='newton-cg', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('clfTrue',
                              GaussianNB(priors=None, var_smoothing=1e-09)),
                             ('clfFalse',
                              SVC(C=10, break_ties=False, cache_size=500,
                                  class_weig

In [38]:
classifier.score(trainXoversampled, trainYoversampled)

0.6294013582149175

In [39]:
classifier.score(testX, testY)

0.6030666666666666

In [40]:
predictY = classifier.predict(testX)
print(classification_report(testY, predictY))

              precision    recall  f1-score   support

       False       0.88      0.57      0.69      5841
        True       0.32      0.74      0.45      1659

    accuracy                           0.60      7500
   macro avg       0.60      0.65      0.57      7500
weighted avg       0.76      0.60      0.64      7500



In [41]:
roc_auc_score(testY, predictY)

0.6508656305910114

In [42]:
pandas.DataFrame(
    confusion_matrix(testY, predictY),
    columns=['Predicted | Not Defaulter', 'Defaulter'],
    index=['Correct | Not Defaulter', 'Defaulter'])

Unnamed: 0,Predicted | Not Defaulter,Defaulter
Correct | Not Defaulter,3301,2540
Defaulter,437,1222
