# Model Ensemble

## 1. Collect and Explore the Data
Take a look at these data first.

In [234]:
import pandas as pd

data_train = pd.read_csv("data/data_train.csv")
data_test = pd.read_csv("data/data_test.csv")
print(data_train.shape)
print(data_test.shape)

(32561, 15)
(16281, 15)


In [235]:
data_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [236]:
data_train.describe()

Unnamed: 0,age,fnlwgt,educational_num,capital-gain,capital-loss,hours-per-week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,0.24081
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,0.427581
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [237]:
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,United-States,0


In [238]:
data_test.describe()

Unnamed: 0,age,fnlwgt,educational_num,capital-gain,capital-loss,hours-per-week,income
count,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0
mean,38.767459,189435.7,10.072907,1081.905104,87.899269,40.392236,0.236226
std,13.849187,105714.9,2.567545,7583.935968,403.105286,12.479332,0.424776
min,17.0,13492.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,116736.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,177831.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,238384.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1490400.0,16.0,99999.0,3770.0,99.0,1.0


In [239]:
print(data_train.dtypes)

age                 int64
workclass          object
fnlwgt              int64
education          object
educational_num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income              int64
dtype: object


In [240]:
print(data_test.dtypes)

age                 int64
workclass          object
fnlwgt              int64
education          object
educational_num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income              int64
dtype: object


## 2. Preprocessing
As everyone might have different preprocess actions towards the training dataset, it is necessary to perform corresponding preprocess actions to the testing set and then it can be used to test different models. 

In [241]:
# Read test data
import pandas as pd
from sklearn.preprocessing import LabelEncoder

### 2.1 Prepare test data for the decision trees

In [242]:
# Load test data
data_test = pd.read_csv('data/data_test.csv')

feature_names = ['age', 'workclass', 'fnlwgt', 'educational_num', 'marital-status', 'occupation', 'relationship',
                 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

encoder = LabelEncoder()
data_test['workclass'] = encoder.fit_transform(data_test['workclass'])
data_test['marital-status'] = encoder.fit_transform(
    data_test['marital-status'])
data_test['occupation'] = encoder.fit_transform(data_test['occupation'])
data_test['relationship'] = encoder.fit_transform(data_test['relationship'])
data_test['race'] = encoder.fit_transform(data_test['race'])
data_test['gender'] = encoder.fit_transform(data_test['gender'])
data_test['native-country'] = encoder.fit_transform(
    data_test['native-country'])

# Preprocessed test set for decision tree
x_test_tree = data_test[feature_names]
y_test_tree = data_test['income']

# Preprocessed test set
# print(x_test_tree)
# print(y_test_tree)

### 2.2 Prepare test data for the k-NN
Load the data and encode categorical features.

In [243]:
# Load test data
data_test = pd.read_csv('data/data_test.csv')

# transformation 
from sklearn.preprocessing import LabelEncoder

# Select the categorical columns to encode
cat_columns = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]

# Encode categorical columns using Label Encoding for data_test
le = LabelEncoder()
test_knn = []
for col in cat_columns:
    data_test[col] = le.fit_transform(data_test[col])

# Split the test set
x_test_knn = data_test.drop(columns =['income'])
y_test_knn = data_test['income']

# Print the first 5 rows of the transformed dataset
print(x_test_knn.head())
print(y_test_knn.head())

   age  workclass  fnlwgt  education  educational_num  marital-status  \
0   25          3  226802          1                7               4   
1   38          3   89814         11                9               2   
2   28          1  336951          7               12               2   
3   44          3  160323         15               10               2   
4   18          3  103497         15               10               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1             0             0   
1           4             0     4       1             0             0   
2          10             0     4       1             0             0   
3           6             0     2       1          7688             0   
4           9             3     4       0             0             0   

   hours-per-week  native-country  
0              40              37  
1              50              37  
2             

Make sure all features are on the same scale.

In [244]:
#Standard Scaling
from sklearn import preprocessing

x_test_knn = preprocessing.StandardScaler().fit(x_test_knn).transform(x_test_knn.astype(float))
print(x_test_knn)


[[-0.99412926 -0.09851079  0.35347399 ... -0.21806206 -0.03143184
   0.25775643]
 [-0.05541716 -0.09851079 -0.94239062 ... -0.21806206  0.7699177
   0.25775643]
 [-0.77750339 -1.88752825  1.39544986 ... -0.21806206 -0.03143184
   0.25775643]
 ...
 [-0.05541716 -0.09851079  1.75522095 ... -0.21806206  0.7699177
   0.25775643]
 [ 0.37783458 -0.09851079 -0.99842039 ... -0.21806206 -0.03143184
   0.25775643]
 [-0.27204303  0.79599794 -0.0689392  ... -0.21806206  1.57126723
   0.25775643]]


### 2.3 Prepare test data for the neural network


In [245]:
from sklearn.preprocessing import MinMaxScaler
import category_encoders as ce

# Load test data
data_test = pd.read_csv('data/data_test.csv', header=0)

# feature transformation
for col in data_test:
    if data_test[col].dtype == 'object':
        data_test[col] = encoder.fit_transform(data_test[col].astype(str))

# feature scaling
scaler = MinMaxScaler()
for col in data_test.columns:
    data_test[col] = scaler.fit_transform(data_test[[col]])

x_test_nn = data_test.iloc[:, :-1]
y_test_nn = data_test.iloc[:, -1]
# print(x_test_nn)
# print(y_test_nn)

### 2.4 Prepare test data for the Bayesian learning


In [246]:
#Importing the testing data
import pandas as pd
data_test = pd.read_csv('data/data_test.csv')
data_test = data_test.reset_index()
xs_test = data_test.drop(['income'], axis=1)
ys_test = data_test['income']

In [247]:
#Store all the categorical features
categorical = [var for var in xs_test.columns if xs_test[var].dtype=='O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :\n\n', categorical)

#Store all the numerical features
numerical = [var for var in xs_test.columns if xs_test[var].dtype!='O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :\n\n', numerical)

There are 8 categorical variables

The categorical variables are :

 ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
There are 7 numerical variables

The numerical variables are :

 ['index', 'age', 'fnlwgt', 'educational_num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [248]:
##Discretization the numerical features##
from sklearn.preprocessing import KBinsDiscretizer
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
#Age
age_t = xs_test['age']
age_t=age_t.values.reshape(-1,1)
age_trans_t = kbins.fit_transform(age_t)

#Final weight
fw_t = xs_test['fnlwgt']
fw_t=fw_t.values.reshape(-1,1)
fw_trans_t = kbins.fit_transform(fw_t)

#educational_num
edunum_t = xs_test['educational_num']
edunum_t=edunum_t.values.reshape(-1,1)
edunum_trans_t = kbins.fit_transform(edunum_t)

#capital gain
cg_t = xs_test['capital-gain']
cg_t=cg_t.values.reshape(-1,1)
cg_trans_t = kbins.fit_transform(cg_t)

#capital loss
cl_t = xs_test['capital-loss']
cl_t=cl_t.values.reshape(-1,1)
cl_trans_t = kbins.fit_transform(cl_t)

#hours-per-week
hours_t = xs_test['hours-per-week']
hours_t=hours_t.values.reshape(-1,1)
hours_trans_t = kbins.fit_transform(hours_t)

In [249]:
age_t=pd.DataFrame(age_trans_t,columns =['age'])
fw_t=pd.DataFrame(fw_trans_t,columns =['fnlwgt'])
edunum_t=pd.DataFrame(edunum_trans_t,columns =['educational-num'])
cg_t=pd.DataFrame(cg_trans_t,columns =['capital-gain'])
cl_t=pd.DataFrame(cl_trans_t,columns =['capital-loss'])
hours_t=pd.DataFrame(hours_trans_t,columns =['hours-per-week'])


numerical_trans_t = pd.concat([age_t,fw_t,edunum_t,cg_t,cl_t,hours_t],axis=1)

xs_bnb_test = pd.concat([xs_test[categorical],numerical_trans_t],axis=1)

In [250]:
#install encoder library
!pip install category_encoders
import category_encoders as ce

#import the trained encoder_bnb
import pickle
with open("trained_models/encoder_bnb.pkl", "rb") as f:
    encoder_bnb = pickle.load(f)

#Encode the unseen test data
# xs_bnb_test = encoder_bnb.transform(xs_bnb_test)
# ys_bnb_test = ys_test

x_test_nb = encoder_bnb.transform(xs_bnb_test)
y_test_nb = ys_test



## 3. Predict the Result

### 3.1 Result of decision tree

In [278]:
# Load the decision tree
import pickle
from sklearn.metrics import classification_report

with open('trained_models/pruned_decision_tree.pkl', 'rb') as f:
    decision_tree = pickle.load(f)

print(decision_tree)

# Produce results and evaluate 
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
# Make predictions on the testing data
y_pred_tree_proba = decision_tree.predict_proba(x_test_tree)[:, 1]  # probability of ">50k"
# print(y_pred_tree)
# print(y_pred_tree.shape)

y_pred_tree = decision_tree.predict(x_test_tree)
# print(y_pred_tree)
# print(y_pred_tree.shape)

# Evaluate the model
print(classification_report(y_test_tree,y_pred_prob_tree > 0.5))
auc = roc_auc_score(y_test_tree, y_pred_prob_tree > 0.5)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))


DecisionTreeClassifier(max_depth=9, max_features=8, min_samples_leaf=10,
                       min_samples_split=8)
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     12435
           1       0.77      0.54      0.64      3846

    accuracy                           0.85     16281
   macro avg       0.82      0.75      0.77     16281
weighted avg       0.85      0.85      0.84     16281

ROC_AUC score: 0.7459


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


### 3.2 Result of k-NN

In [291]:
import joblib

# Load k-NN model
knn_model = joblib.load('trained_models/kNN.pkl')
print(knn_model)

# Produce results
y_pred_knn_proba = knn_model.predict_proba(x_test_knn)[:, 1]  # probability of ">50k"
# print(y_pred_knn_proba)
# print(y_pred_knn_proba.shape)

y_pred_knn = knn_model.predict(x_test_knn)
# print(y_pred_knn)
# print(y_pred_knn.shape)

# Evaluate the the model
print(classification_report(y_test_knn,y_pred_knn))
auc = roc_auc_score(y_test_knn,y_pred_knn)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

KNeighborsClassifier(leaf_size=16, metric='manhattan', n_neighbors=29)
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     12435
           1       0.70      0.57      0.63      3846

    accuracy                           0.84     16281
   macro avg       0.79      0.75      0.76     16281
weighted avg       0.83      0.84      0.84     16281

ROC_AUC score: 0.7463


### 3.3 Result of neural networks

In [290]:
from tensorflow import keras
import numpy as np


# Load neutral network model
nn_model = keras.models.load_model('trained_models/NeuralNetwork.h5')
# nn_model.summary()

# Produce results
y_pred_nn_proba = nn_model.predict(x_test_nn)  # probability of ">50k"
y_pred_nn_proba = np.squeeze(y_pred_nn)  # to 1-d array
# print(y_pred_nn_prob)
# print(y_pred_nn_prob.shape)

y_pred_nn = np.where(y_pred_nn_proba > 0.5, 1, 0)  # probability to 0/1
# print(y_pred_nn)
# print(y_pred_nn.shape)


# Evaluate the model
print(classification_report(y_test_nn,y_pred_nn))
auc = roc_auc_score(y_test_nn, y_pred_nn)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

              precision    recall  f1-score   support

         0.0       0.87      0.94      0.90     12435
         1.0       0.74      0.55      0.63      3846

    accuracy                           0.85     16281
   macro avg       0.80      0.74      0.77     16281
weighted avg       0.84      0.85      0.84     16281

ROC_AUC score: 0.7430


### 3.4 Result of Bayesian learning

In [289]:
nb_model = pickle.load(open('trained_models/BernoullilNaiveBayes.sav', 'rb'))

# ys_pred_nb = nb_model.predict(x_test_nb)
# print(ys_pred_nb)

y_pred_nb_proba = nb_model.predict_proba(x_test_nb)[:, 1]  # probability of ">50k"
# print(y_pred_nb_proba)
# print(y_pred_nb_proba.shape)

y_pred_nb = nb_model.predict(x_test_nb)
# print(y_pred_nb)
# print(y_pred_nb.shape)

# Evaluate the model
print(classification_report(y_test_nb,y_pred_nb))
auc = roc_auc_score(y_test_nb, y_pred_nb)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

              precision    recall  f1-score   support

           0       0.89      0.88      0.89     12435
           1       0.63      0.65      0.64      3846

    accuracy                           0.83     16281
   macro avg       0.76      0.76      0.76     16281
weighted avg       0.83      0.83      0.83     16281

ROC_AUC score: 0.7647


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## 4. Majority Voting

### 4.1 Soft Voting VS. Hard Voting

In [284]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[('dt', decision_tree), ('knn', knn_model), ('nn', nn_model), ('nb', NB_model)],
    voting='hard')

In [285]:
data_test = pd.read_csv("data/data_test.csv")
# Split the test set
x_test = data_test.drop(columns =['income'])
y_test = data_test['income']


In [302]:
# Soft Voting
from scipy.stats import mode

# Stack the predictions into a single array
predictions = np.stack((y_pred_tree_proba, y_pred_knn_proba, y_pred_nn_proba, y_pred_nb_proba))
# print(predictions.shape)

# Compute the weighted average of the predictions along the first axis (i.e., across each column)
weights = [1, 1, 1, 1]
soft_pred_prob = np.average(predictions, axis=0, weights=weights)
# print(soft_pred_prob)
# print(soft_pred_prob.shape)

# Evaluate the model
print(classification_report(y_test,soft_pred_prob > 0.5))
auc = roc_auc_score(y_test, soft_pred_prob > 0.5)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

(4, 16281)
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     12435
           1       0.75      0.57      0.65      3846

    accuracy                           0.86     16281
   macro avg       0.82      0.76      0.78     16281
weighted avg       0.85      0.86      0.85     16281

ROC_AUC score: 0.7579


In [293]:
# Hard Voting

# Stack the predictions into a single array
predictions = np.stack((y_pred_tree, y_pred_knn, y_pred_nn, y_pred_nb))
# print(predictions.shape)

# Compute the mode of the predictions along the first axis (i.e., across each column)
mode_pred = mode(predictions, axis=0).mode
mode_pred = np.transpose(mode_pred) 
mode_pred = np.squeeze(mode_pred)
# print(mode_pred)
# print(mode_pred.shape)

# Evaluate the model
print(classification_report(y_test, mode_pred))
auc = roc_auc_score(y_test, mode_pred)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

  mode_pred = mode(predictions, axis=0).mode


              precision    recall  f1-score   support

           0       0.87      0.95      0.91     12435
           1       0.78      0.52      0.62      3846

    accuracy                           0.85     16281
   macro avg       0.82      0.74      0.77     16281
weighted avg       0.84      0.85      0.84     16281

ROC_AUC score: 0.7366


### 4.2 Hyper-parameter Tuning

#### 4.2.1 Test domination of different models

In [297]:
# Soft Voting
from scipy.stats import mode

# Stack the predictions into a single array
# predictions = np.stack((y_pred_tree_proba, y_pred_knn_proba, y_pred_nn_proba, y_pred_nb_proba))

weights1 = [0.7, 0.1, 0.1, 0.1]
weights2 = [0.1, 0.7, 0.1, 0.1]
weights3 = [0.1, 0.1, 0.7, 0.1]
weights4 = [0.1, 0.1, 0.1, 0.7]

soft_pred_prob1 = np.average(predictions, axis=0, weights=weights1)
soft_pred_prob2 = np.average(predictions, axis=0, weights=weights2)
soft_pred_prob3 = np.average(predictions, axis=0, weights=weights3)
soft_pred_prob4 = np.average(predictions, axis=0, weights=weights4)

# print(soft_pred_prob)
# print(soft_pred_prob.shape)

# Evaluate the model
print("When decision tree is dominate:")
print(classification_report(y_test,soft_pred_prob1>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob1>0.5)))

print("\n\nWhen k-NN is dominate:")
print(classification_report(y_test,soft_pred_prob2>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob2>0.5)))

print("\n\nWhen neutral network is dominate:")
print(classification_report(y_test,soft_pred_prob3>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob3>0.5)))

print("\n\nWhen Bernoullil Naive Bayes (BNB) is dominate:")
print(classification_report(y_test,soft_pred_prob4>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob4>0.5)))


When decision tree is dominate:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     12435
           1       0.77      0.54      0.64      3846

    accuracy                           0.85     16281
   macro avg       0.82      0.75      0.77     16281
weighted avg       0.85      0.85      0.84     16281

ROC_AUC score: 0.7459


When k-NN is dominate:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     12435
           1       0.70      0.57      0.63      3846

    accuracy                           0.84     16281
   macro avg       0.79      0.75      0.76     16281
weighted avg       0.83      0.84      0.84     16281

ROC_AUC score: 0.7463


When neutral network is dominate:
              precision    recall  f1-score   support

           0       0.87      0.94      0.90     12435
           1       0.74      0.55      0.63      3846

    accuracy                           0.85  

#### 4.2.2 Attemp improving the preformance
Now that we know when Bernoullil Naive Bayes is dominate, the performance is better. Next, we want to try lifting and decrising its weight, and see if we can further improve.

In [301]:

weights1 = [0.067, 0.067, 0.066, 0.8]
weights2 = [0.034, 0.034, 0.034, 0.9]
weights3 = [0, 0, 0, 1]
weights4 = [0.3, 0.3, 0.3, 0.1]

soft_pred_prob1 = np.average(predictions, axis=0, weights=weights1)
soft_pred_prob2 = np.average(predictions, axis=0, weights=weights2)
soft_pred_prob3 = np.average(predictions, axis=0, weights=weights3)
soft_pred_prob4 = np.average(predictions, axis=0, weights=weights4)

# print(soft_pred_prob)
# print(soft_pred_prob.shape)

# Evaluate the model
print("Give weight 0.8 to BNB:")
print(classification_report(y_test,soft_pred_prob1>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob1>0.5)))

print("\n\nGive weight 0.9 to BNB:")
print(classification_report(y_test,soft_pred_prob2>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob2>0.5)))

print("\n\nGive weight 1 to BNB:")
print(classification_report(y_test,soft_pred_prob3>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob3>0.5)))

print("\n\nGive weight 0.1 to BNB and 0.3 to other models:")
print(classification_report(y_test,soft_pred_prob4>0.5))
print('ROC_AUC score: {:.4f}'.format(roc_auc_score(y_test,soft_pred_prob4>0.5)))


Give weight 0.8 to BNB:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     12435
           1       0.63      0.65      0.64      3846

    accuracy                           0.83     16281
   macro avg       0.76      0.76      0.76     16281
weighted avg       0.83      0.83      0.83     16281

ROC_AUC score: 0.7647


Give weight 0.9 to BNB:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     12435
           1       0.63      0.65      0.64      3846

    accuracy                           0.83     16281
   macro avg       0.76      0.76      0.76     16281
weighted avg       0.83      0.83      0.83     16281

ROC_AUC score: 0.7647


Give weight 1 to BNB:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     12435
           1       0.63      0.65      0.64      3846

    accuracy                           0.83     16281
   macro a

### Custom Esamble Model

In [332]:
class EnsembleModel():

    def __init__(self, predictions, voting):
        self.predictions = predictions  # prediction of base classifiers; shape: (num_of_base_models, num_of_test_instances)
        self.voting = voting
        

    def predict(self):
        if (self.voting == "soft"):
            # Stack the predictions into a single array
            predictions = np.stack((y_pred_tree_proba, y_pred_knn_proba, y_pred_nn_proba, y_pred_nb_proba))
            # print(predictions.shape)

            # Compute the weighted average of the predictions along the first axis (i.e., across each column)
            soft_pred_prob = np.average(predictions, axis=0)
            soft_pred = np.where(soft_pred_prob > 0.5, 1, 0)  # probability to 0/1
            return soft_pred
        # else:


    def predict(self, weights):
        if (self.voting == "soft"):
            # Stack the predictions into a single array
            predictions = np.stack((y_pred_tree_proba, y_pred_knn_proba, y_pred_nn_proba, y_pred_nb_proba))
            # print(predictions.shape)

            # Compute the weighted average of the predictions along the first axis (i.e., across each column)
            # weights = [1, 1, 1, 1]
            soft_pred_prob = np.average(predictions, axis=0, weights=weights)
            soft_pred = np.where(soft_pred_prob > 0.5, 1, 0)  # probability to 0/1
            return soft_pred
        # else:
            

        



In [333]:
e = EnsembleModel(predictions, "soft")
soft_pred = e.predict([0.1, 0.1, 0.1, 0.7])

# Evaluate the model
print(classification_report(y_test,soft_pred))
auc = roc_auc_score(y_test, soft_pred)  # ROC score
print('ROC_AUC score: {:.4f}'.format(auc))

              precision    recall  f1-score   support

           0       0.89      0.90      0.89     12435
           1       0.65      0.63      0.64      3846

    accuracy                           0.83     16281
   macro avg       0.77      0.76      0.77     16281
weighted avg       0.83      0.83      0.83     16281

ROC_AUC score: 0.7643


              precision    recall  f1-score   support

           0       0.89      0.90      0.89     12435
           1       0.65      0.63      0.64      3846

    accuracy                           0.83     16281
   macro avg       0.77      0.76      0.77     16281
weighted avg       0.83      0.83      0.83     16281

ROC_AUC score: 0.7643
