# Model Ensemble

## 1. Collect and Explore the Data
Take a look at these data first.

In [337]:
import pandas as pd

data_train = pd.read_csv("data/data_train.csv")
data_test = pd.read_csv("data/data_test.csv")
print(data_train.shape)
print(data_test.shape)

(32561, 15)
(16281, 15)


In [338]:
data_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [339]:
data_train.describe()

Unnamed: 0,age,fnlwgt,educational_num,capital-gain,capital-loss,hours-per-week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,0.24081
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,0.427581
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [340]:
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,United-States,0


In [341]:
data_test.describe()

Unnamed: 0,age,fnlwgt,educational_num,capital-gain,capital-loss,hours-per-week,income
count,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0
mean,38.767459,189435.7,10.072907,1081.905104,87.899269,40.392236,0.236226
std,13.849187,105714.9,2.567545,7583.935968,403.105286,12.479332,0.424776
min,17.0,13492.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,116736.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,177831.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,238384.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1490400.0,16.0,99999.0,3770.0,99.0,1.0


In [342]:
print(data_train.dtypes)

age                 int64
workclass          object
fnlwgt              int64
education          object
educational_num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income              int64
dtype: object


In [343]:
print(data_test.dtypes)

age                 int64
workclass          object
fnlwgt              int64
education          object
educational_num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income              int64
dtype: object


## 2. Preprocessing
As everyone might have different preprocess actions towards the training dataset, it is necessary to perform corresponding preprocess actions to the testing set and then it can be used to test different models. 

In [344]:
# Read test data
import pandas as pd
from sklearn.preprocessing import LabelEncoder

### 2.1 Decision Tree

#### 2.1.1 Prepare train and validation data for the decision trees

In [400]:
from sklearn.model_selection import train_test_split

# Load test data
data_train = pd.read_csv('data/data_train.csv')

feature_names = ['age', 'workclass', 'fnlwgt', 'educational_num', 'marital-status', 'occupation', 'relationship',
                 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

encoder = LabelEncoder()
data_train['workclass'] = encoder.fit_transform(data_train['workclass'])
data_train['marital-status'] = encoder.fit_transform(
    data_train['marital-status'])
data_train['occupation'] = encoder.fit_transform(data_train['occupation'])
data_train['relationship'] = encoder.fit_transform(data_train['relationship'])
data_train['race'] = encoder.fit_transform(data_train['race'])
data_train['gender'] = encoder.fit_transform(data_train['gender'])
data_train['native-country'] = encoder.fit_transform(
    data_train['native-country'])

# Preprocessed test set for decision tree
x_train = data_train[feature_names]
y_train = data_train['income']

# Spliting the training set and validation set by 20%
x_train_tree, x_val_tree, y_train_tree, y_val_tree = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# print(x_train_tree.shape)
# print(x_val_tree.shape)




#### 2.1.2 Prepare test data for the decision trees

In [437]:
# Load test data
data_test = pd.read_csv('data/data_test.csv')

feature_names = ['age', 'workclass', 'fnlwgt', 'educational_num', 'marital-status', 'occupation', 'relationship',
                 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

encoder = LabelEncoder()
data_test['workclass'] = encoder.fit_transform(data_test['workclass'])
data_test['marital-status'] = encoder.fit_transform(
    data_test['marital-status'])
data_test['occupation'] = encoder.fit_transform(data_test['occupation'])
data_test['relationship'] = encoder.fit_transform(data_test['relationship'])
data_test['race'] = encoder.fit_transform(data_test['race'])
data_test['gender'] = encoder.fit_transform(data_test['gender'])
data_test['native-country'] = encoder.fit_transform(
    data_test['native-country'])

# Preprocessed test set for decision tree
x_test_tree = data_test[feature_names]
y_test_tree = data_test['income']

# Preprocessed test set
# print(x_test_tree)
# print(y_test_tree)

### 2.2 K-NN

#### 2.2.1 Prepare train and validation data for the k-NN
Load the data and encode categorical features.

In [439]:
# Load test data
data_train = pd.read_csv('data/data_train.csv')

# transformation 
from sklearn.preprocessing import LabelEncoder

# Select the categorical columns to encode
cat_columns = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]

# Encode categorical columns using Label Encoding for data_train
le = LabelEncoder()
test_knn = []
for col in cat_columns:
    data_train[col] = le.fit_transform(data_train[col])

# Split the test set
x_train = data_train.drop(columns =['income'])
y_train = data_train['income']

# Spliting the training set and validation set by 20%
x_train_knn, x_val_knn, y_train_knn, y_val_knn = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# print(x_train_knn.shape)
# print(x_val_knn.shape)

#Standard Scaling
from sklearn import preprocessing

x_train_knn = preprocessing.StandardScaler().fit(x_train_knn).transform(x_train_knn.astype(float))
print(x_train_knn)

[[-0.40875606 -1.88422155  0.08005085 ... -0.21799808  0.77946024
   0.26333357]
 [-0.1888573  -0.0841701  -0.98165286 ...  4.45716784  0.77946024
   0.26333357]
 [ 1.42373357  1.71588136  0.126197   ... -0.21799808 -0.03151042
   0.26333357]
 ...
 [-1.50824984 -0.0841701   0.25206312 ... -0.21799808 -1.65345173
   0.26333357]
 [ 0.83733689  1.71588136 -1.28762772 ... -0.21799808  3.53676046
   0.26333357]
 [-0.33545648  0.81585563 -0.59020877 ... -0.21799808  1.59043089
   0.26333357]]


#### 2.2.2 Prepare test data for the k-NN


In [440]:
# Load test data
data_test = pd.read_csv('data/data_test.csv')

# transformation 
from sklearn.preprocessing import LabelEncoder

# Select the categorical columns to encode
cat_columns = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]

# Encode categorical columns using Label Encoding for data_test
le = LabelEncoder()
test_knn = []
for col in cat_columns:
    data_test[col] = le.fit_transform(data_test[col])

# Split the test set
x_test_knn = data_test.drop(columns =['income'])
y_test_knn = data_test['income']

# Print the first 5 rows of the transformed dataset
# print(x_test_knn.head())
# print(y_test_knn.head())

#Standard Scaling
from sklearn import preprocessing

x_test_knn = preprocessing.StandardScaler().fit(x_test_knn).transform(x_test_knn.astype(float))
# print(x_test_knn)

### 2.3 Neural Network

#### 2.3.1 Prepare train and validation data for the neural network


In [419]:
from sklearn.preprocessing import MinMaxScaler
import category_encoders as ce

# Load test data
data_train = pd.read_csv('data/data_train.csv', header=0)

# feature transformation
for col in data_train:
    if data_train[col].dtype == 'object':
        data_train[col] = encoder.fit_transform(data_train[col].astype(str))

# feature scaling
scaler = MinMaxScaler()
for col in data_train.columns:
    data_train[col] = scaler.fit_transform(data_train[[col]])

x_train = data_train.iloc[:, :-1]
y_train = data_train.iloc[:, -1]
# print(x_test_nn)
# print(y_test_nn)

# Spliting the training set and validation set by 20%
x_train_nn, x_val_nn, y_train_nn, y_val_nn = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

print(x_train_nn.shape)
print(x_val_nn.shape)


(26048, 14)
(6513, 14)


#### 2.3.2 Prepare test data for the neural network


In [441]:
from sklearn.preprocessing import MinMaxScaler
import category_encoders as ce

# Load test data
data_test = pd.read_csv('data/data_test.csv', header=0)

# feature transformation
for col in data_test:
    if data_test[col].dtype == 'object':
        data_test[col] = encoder.fit_transform(data_test[col].astype(str))

# feature scaling
scaler = MinMaxScaler()
for col in data_test.columns:
    data_test[col] = scaler.fit_transform(data_test[[col]])

x_test_nn = data_test.iloc[:, :-1]
y_test_nn = data_test.iloc[:, -1]
# print(x_test_nn)
# print(y_test_nn)

### 2.4 Bayesian Leanring

#### 2.4.1 Prepare train data and validation for the Bayesian learning


In [430]:
#Importing the tain data
import pandas as pd
data_test = pd.read_csv('data/data_train.csv')
data_test = data_test.reset_index()
xs_test = data_test.drop(['income'], axis=1)
ys_test = data_test['income']

In [431]:
#Store all the categorical features
categorical = [var for var in xs_test.columns if xs_test[var].dtype=='O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :\n\n', categorical)

#Store all the numerical features
numerical = [var for var in xs_test.columns if xs_test[var].dtype!='O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :\n\n', numerical)

There are 8 categorical variables

The categorical variables are :

 ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
There are 7 numerical variables

The numerical variables are :

 ['index', 'age', 'fnlwgt', 'educational_num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [432]:
##Discretization the numerical features##
from sklearn.preprocessing import KBinsDiscretizer
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
#Age
age_t = xs_test['age']
age_t=age_t.values.reshape(-1,1)
age_trans_t = kbins.fit_transform(age_t)

#Final weight
fw_t = xs_test['fnlwgt']
fw_t=fw_t.values.reshape(-1,1)
fw_trans_t = kbins.fit_transform(fw_t)

#educational_num
edunum_t = xs_test['educational_num']
edunum_t=edunum_t.values.reshape(-1,1)
edunum_trans_t = kbins.fit_transform(edunum_t)

#capital gain
cg_t = xs_test['capital-gain']
cg_t=cg_t.values.reshape(-1,1)
cg_trans_t = kbins.fit_transform(cg_t)

#capital loss
cl_t = xs_test['capital-loss']
cl_t=cl_t.values.reshape(-1,1)
cl_trans_t = kbins.fit_transform(cl_t)

#hours-per-week
hours_t = xs_test['hours-per-week']
hours_t=hours_t.values.reshape(-1,1)
hours_trans_t = kbins.fit_transform(hours_t)

In [433]:
age_t=pd.DataFrame(age_trans_t,columns =['age'])
fw_t=pd.DataFrame(fw_trans_t,columns =['fnlwgt'])
edunum_t=pd.DataFrame(edunum_trans_t,columns =['educational-num'])
cg_t=pd.DataFrame(cg_trans_t,columns =['capital-gain'])
cl_t=pd.DataFrame(cl_trans_t,columns =['capital-loss'])
hours_t=pd.DataFrame(hours_trans_t,columns =['hours-per-week'])


numerical_trans_t = pd.concat([age_t,fw_t,edunum_t,cg_t,cl_t,hours_t],axis=1)

xs_bnb_test = pd.concat([xs_test[categorical],numerical_trans_t],axis=1)

In [434]:
#install encoder library
# !pip install category_encoders
import category_encoders as ce

#import the trained encoder_bnb
import pickle
with open("trained_models/encoder_bnb.pkl", "rb") as f:
    encoder_bnb = pickle.load(f)

#Encode the unseen test data
# xs_bnb_test = encoder_bnb.transform(xs_bnb_test)
# ys_bnb_test = ys_test

x_train = encoder_bnb.transform(xs_bnb_test)
y_train = ys_test

# Spliting the training set and validation set by 20%
x_train_nb, x_val_nb, y_train_nb, y_val_nb = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

print(x_train_nb.shape)
print(x_val_nb.shape)



(26048, 154)
(6513, 154)


#### 2.4.2 Prepare test for the Bayesian learning


In [442]:
#Importing the testing data
import pandas as pd
data_test = pd.read_csv('data/data_test.csv')
data_test = data_test.reset_index()
xs_test = data_test.drop(['income'], axis=1)
ys_test = data_test['income']

#Store all the categorical features
categorical = [var for var in xs_test.columns if xs_test[var].dtype=='O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :\n\n', categorical)

#Store all the numerical features
numerical = [var for var in xs_test.columns if xs_test[var].dtype!='O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :\n\n', numerical)

# Discretization the numerical features##
from sklearn.preprocessing import KBinsDiscretizer
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
#Age
age_t = xs_test['age']
age_t=age_t.values.reshape(-1,1)
age_trans_t = kbins.fit_transform(age_t)

#Final weight
fw_t = xs_test['fnlwgt']
fw_t=fw_t.values.reshape(-1,1)
fw_trans_t = kbins.fit_transform(fw_t)

#educational_num
edunum_t = xs_test['educational_num']
edunum_t=edunum_t.values.reshape(-1,1)
edunum_trans_t = kbins.fit_transform(edunum_t)

#capital gain
cg_t = xs_test['capital-gain']
cg_t=cg_t.values.reshape(-1,1)
cg_trans_t = kbins.fit_transform(cg_t)

#capital loss
cl_t = xs_test['capital-loss']
cl_t=cl_t.values.reshape(-1,1)
cl_trans_t = kbins.fit_transform(cl_t)

#hours-per-week
hours_t = xs_test['hours-per-week']
hours_t=hours_t.values.reshape(-1,1)
hours_trans_t = kbins.fit_transform(hours_t)

age_t=pd.DataFrame(age_trans_t,columns =['age'])
fw_t=pd.DataFrame(fw_trans_t,columns =['fnlwgt'])
edunum_t=pd.DataFrame(edunum_trans_t,columns =['educational-num'])
cg_t=pd.DataFrame(cg_trans_t,columns =['capital-gain'])
cl_t=pd.DataFrame(cl_trans_t,columns =['capital-loss'])
hours_t=pd.DataFrame(hours_trans_t,columns =['hours-per-week'])


numerical_trans_t = pd.concat([age_t,fw_t,edunum_t,cg_t,cl_t,hours_t],axis=1)

xs_bnb_test = pd.concat([xs_test[categorical],numerical_trans_t],axis=1)


There are 8 categorical variables

The categorical variables are :

 ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
There are 7 numerical variables

The numerical variables are :

 ['index', 'age', 'fnlwgt', 'educational_num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [443]:
import category_encoders as ce

# import the trained encoder_bnb
import pickle
with open("trained_models/encoder_bnb.pkl", "rb") as f:
    encoder_bnb = pickle.load(f)

#Encode the unseen test data
# xs_bnb_test = encoder_bnb.transform(xs_bnb_test)
# ys_bnb_test = ys_test

x_test_nb = encoder_bnb.transform(xs_bnb_test)
y_test_nb = ys_test

## 3. Predict the Result

### 3.1 Result of decision tree

In [447]:
# Load the decision tree
import pickle
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# Load model
with open('trained_models/pruned_decision_tree.pkl', 'rb') as f:
    decision_tree = pickle.load(f)

print(decision_tree)

# Make predictions on the training data
y_pred_tree_proba = decision_tree.predict_proba(x_train_tree)[:, 1]  # probability of ">50k"
# print(y_pred_tree_proba)
# print(y_pred_tree_proba.shape)
y_pred_tree = decision_tree.predict(x_train_tree)  # binary
# print(y_pred_tree)
# print(y_pred_tree.shape)


# Make predictions on the validation data
y_pred_val_tree_proba = decision_tree.predict_proba(x_val_tree)[:, 1]  # probability of ">50k"
# print(y_pred_val_tree_proba)
# print(y_pred_val_tree_proba.shape)
y_pred_val_tree = decision_tree.predict(x_val_tree)  # binary
# print(y_pred_val_tree)
# print(y_pred_val_tree.shape)


# Make predictions on the testing data
y_pred_test_tree_proba = decision_tree.predict_proba(x_test_tree)[:, 1]  # probability of ">50k"
# print(y_pred_test_tree_proba)
# print(y_pred_test_tree_proba.shape)
y_pred_test_tree = decision_tree.predict(x_test_tree)  # binary
# print(y_pred_test_tree)
# print(y_pred_test_tree.shape)


# Evaluate training
# print(classification_report(y_train_tree,y_pred_tree_proba > 0.5))
# auc = roc_auc_score(y_train_tree, y_pred_tree_proba > 0.5)  # ROC score
# print('ROC_AUC score: {:.4f}'.format(auc))


DecisionTreeClassifier(max_depth=9, max_features=8, min_samples_leaf=10,
                       min_samples_split=8)
[0 0 0 ... 1 0 1]
(16281,)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


### 3.2 Result of k-NN

In [409]:
import joblib

# Load k-NN model
knn_model = joblib.load('trained_models/kNN.pkl')
print(knn_model)

# Produce results
y_pred_knn_proba = knn_model.predict_proba(x_train_knn)[:, 1]  # probability of ">50k"
# print(y_pred_knn_proba)
# print(y_pred_knn_proba.shape)

y_pred_knn = knn_model.predict(x_train_knn)
# print(y_pred_knn)
# print(y_pred_knn.shape)

# Evaluate the the model
print(classification_report(y_train_knn,y_pred_knn))
auc = roc_auc_score(y_train_knn,y_pred_knn)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

KNeighborsClassifier(leaf_size=16, metric='manhattan', n_neighbors=29)
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     19778
           1       0.73      0.60      0.66      6270

    accuracy                           0.85     26048
   macro avg       0.81      0.76      0.78     26048
weighted avg       0.84      0.85      0.84     26048

ROC_AUC score: 0.7638


### 3.3 Result of neural networks

In [429]:
from tensorflow import keras
import numpy as np


# Load neutral network model
nn_model = keras.models.load_model('trained_models/NeuralNetwork.h5')
# nn_model.summary()

# print(x_train_nn)
# print(x_train_nn.shape)

# Produce results
y_pred_nn_proba = nn_model.predict(x_train_nn)  # probability of ">50k"
y_pred_nn_proba = np.squeeze(y_pred_nn_proba)  # to 1-d array
print(y_pred_nn_prob)
print(y_pred_nn_prob.shape)

y_pred_nn = np.where(y_pred_nn_proba > 0.5, 1, 0)  # probability to 0/1
# print(y_pred_nn)
# print(y_pred_nn.shape)


# Evaluate the model
print(classification_report(y_train_nn,y_pred_nn))
auc = roc_auc_score(y_train_nn, y_pred_nn)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

(26048, 14)
[0 0 0 ... 1 0 1]
(16281,)
              precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     19778
         1.0       0.77      0.57      0.65      6270

    accuracy                           0.85     26048
   macro avg       0.82      0.76      0.78     26048
weighted avg       0.85      0.85      0.85     26048

ROC_AUC score: 0.7561


### 3.4 Result of Bayesian learning

In [436]:
nb_model = pickle.load(open('trained_models/BernoullilNaiveBayes.sav', 'rb'))


y_pred_nb_proba = nb_model.predict_proba(x_train_nb)[:, 1]  # probability of ">50k"
# print(y_pred_nb_proba)
# print(y_pred_nb_proba.shape)

y_pred_nb = nb_model.predict(x_train_nb)
# print(y_pred_nb)
# print(y_pred_nb.shape)

# Evaluate the model
print(classification_report(y_train_nb,y_pred_nb))
auc = roc_auc_score(y_train_nb, y_pred_nb)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88     19778
           1       0.63      0.65      0.64      6270

    accuracy                           0.83     26048
   macro avg       0.76      0.77      0.76     26048
weighted avg       0.83      0.83      0.83     26048

ROC_AUC score: 0.7667


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## 4. Majority Voting

### 4.1 Custom Esemble Model

In [376]:
from scipy.stats import mode

class EnsembleModel():

    def __init__(self, predictions, voting):
        self.predictions = predictions  # prediction of base classifiers; shape: (num_of_base_models, num_of_test_instances)
        self.voting = voting
        

    def predict(self, weights=[]):
        # Stack the predictions into a single array
        predictions_stack = np.stack(predictions)
        
        # print(predictions.shape)
        if (self.voting == "soft"):
            # Soft Voting
            # Compute the weighted average of the predictions along the first axis (i.e., across each column)
            if (weights == []):
                # Default: weights are equal
                soft_pred_prob = np.average(predictions_stack, axis=0)
            else:
                # Use passing weights
                soft_pred_prob = np.average(predictions_stack, axis=0, weights=weights)

            soft_pred = np.where(soft_pred_prob > 0.5, 1, 0)  # probability to 0/1
            return soft_pred
        else:
            # Hard Voting
            # Compute the mode of the predictions along the first axis (i.e., across each column)
            mode_pred = mode(predictions_stack, axis=0).mode
            mode_pred = np.transpose(mode_pred) 
            mode_pred = np.squeeze(mode_pred)
            # print(mode_pred)
            # print(mode_pred.shape)
            return mode_pred

### 4.2 Hard Voting VS. Soft Voting

#### 4.2.1 Hard voting

In [381]:
predictions = [y_pred_tree, y_pred_knn, y_pred_nn, y_pred_nb]  # binary predictions

e = EnsembleModel(predictions, "hard")
hard_pred = e.predict()
# print(hard_pred)
# print(hard_pred.shape)

# Evaluate the model
print(classification_report(y_test,hard_pred))
auc = roc_auc_score(y_test, hard_pred)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

  mode_pred = mode(predictions_stack, axis=0).mode


              precision    recall  f1-score   support

           0       0.87      0.95      0.91     12435
           1       0.78      0.52      0.62      3846

    accuracy                           0.85     16281
   macro avg       0.82      0.74      0.77     16281
weighted avg       0.84      0.85      0.84     16281

ROC_AUC score: 0.7366


#### 4.2.2 Soft voting

In [374]:
predictions = [y_pred_tree_proba, y_pred_knn_proba, y_pred_nn_proba, y_pred_nb_proba]  # probability predictions
# weights = [1, 1, 1, 1]

e = EnsembleModel(predictions, "soft")
soft_pred = e.predict()

# Evaluate the model
print(classification_report(y_test,soft_pred))
auc = roc_auc_score(y_test, soft_pred)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91     12435
           1       0.75      0.57      0.65      3846

    accuracy                           0.86     16281
   macro avg       0.82      0.76      0.78     16281
weighted avg       0.85      0.86      0.85     16281

ROC_AUC score: 0.7579


### 4.3 Hyper-parameter Tuning

#### 4.2.3 Test domination of different models

In [360]:
# Soft Voting
from scipy.stats import mode

# Stack the predictions into a single array
# predictions = np.stack((y_pred_tree_proba, y_pred_knn_proba, y_pred_nn_proba, y_pred_nb_proba))

weights1 = [0.7, 0.1, 0.1, 0.1]
weights2 = [0.1, 0.7, 0.1, 0.1]
weights3 = [0.1, 0.1, 0.7, 0.1]
weights4 = [0.1, 0.1, 0.1, 0.7]

soft_pred_prob1 = np.average(predictions, axis=0, weights=weights1)
soft_pred_prob2 = np.average(predictions, axis=0, weights=weights2)
soft_pred_prob3 = np.average(predictions, axis=0, weights=weights3)
soft_pred_prob4 = np.average(predictions, axis=0, weights=weights4)

# print(soft_pred_prob)
# print(soft_pred_prob.shape)

# Evaluate the model
print("When decision tree is dominate:")
print(classification_report(y_test,soft_pred_prob1>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob1>0.5)))

print("\n\nWhen k-NN is dominate:")
print(classification_report(y_test,soft_pred_prob2>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob2>0.5)))

print("\n\nWhen neutral network is dominate:")
print(classification_report(y_test,soft_pred_prob3>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob3>0.5)))

print("\n\nWhen Bernoullil Naive Bayes (BNB) is dominate:")
print(classification_report(y_test,soft_pred_prob4>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob4>0.5)))


When decision tree is dominate:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     12435
           1       0.77      0.54      0.64      3846

    accuracy                           0.85     16281
   macro avg       0.82      0.75      0.77     16281
weighted avg       0.85      0.85      0.84     16281

ROC_AUC score: 0.7459


When k-NN is dominate:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     12435
           1       0.70      0.57      0.63      3846

    accuracy                           0.84     16281
   macro avg       0.79      0.75      0.76     16281
weighted avg       0.83      0.84      0.84     16281

ROC_AUC score: 0.7463


When neutral network is dominate:
              precision    recall  f1-score   support

           0       0.87      0.94      0.90     12435
           1       0.74      0.55      0.63      3846

    accuracy                           0.85  

#### 4.2.3 Attemp improving the preformance
Now that we know when Bernoullil Naive Bayes is dominate, the performance is better. Next, we want to try lifting and decrising its weight, and see if we can further improve.

In [361]:

weights1 = [0.067, 0.067, 0.066, 0.8]
weights2 = [0.034, 0.034, 0.034, 0.9]
weights3 = [0, 0, 0, 1]
weights4 = [0.3, 0.3, 0.3, 0.1]

soft_pred_prob1 = np.average(predictions, axis=0, weights=weights1)
soft_pred_prob2 = np.average(predictions, axis=0, weights=weights2)
soft_pred_prob3 = np.average(predictions, axis=0, weights=weights3)
soft_pred_prob4 = np.average(predictions, axis=0, weights=weights4)

# print(soft_pred_prob)
# print(soft_pred_prob.shape)

# Evaluate the model
print("Give weight 0.8 to BNB:")
print(classification_report(y_test,soft_pred_prob1>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob1>0.5)))

print("\n\nGive weight 0.9 to BNB:")
print(classification_report(y_test,soft_pred_prob2>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob2>0.5)))

print("\n\nGive weight 1 to BNB:")
print(classification_report(y_test,soft_pred_prob3>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob3>0.5)))

print("\n\nGive weight 0.1 to BNB and 0.3 to other models:")
print(classification_report(y_test,soft_pred_prob4>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob4>0.5)))


Give weight 0.8 to BNB:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     12435
           1       0.63      0.65      0.64      3846

    accuracy                           0.83     16281
   macro avg       0.76      0.76      0.76     16281
weighted avg       0.83      0.83      0.83     16281

ROC_AUC score: 0.7647


Give weight 0.9 to BNB:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     12435
           1       0.63      0.65      0.64      3846

    accuracy                           0.83     16281
   macro avg       0.76      0.76      0.76     16281
weighted avg       0.83      0.83      0.83     16281

ROC_AUC score: 0.7647


Give weight 1 to BNB:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     12435
           1       0.63      0.65      0.64      3846

    accuracy                           0.83     16281
   macro a

## 5. Result Visualization