# Model Ensemble

## 1. Collect and Explore the Data
First, take a look at the training and test data.

In [88]:
import pandas as pd

# Load both splits up front; each one is a CSV file under data/.
data_train = pd.read_csv("data/data_train.csv")
data_test = pd.read_csv("data/data_test.csv")

# Report (rows, columns) for the training split first, then the test split.
for split_frame in (data_train, data_test):
    print(split_frame.shape)

(32561, 15)
(16281, 15)


In [89]:
# Preview the first rows of the training split (column names and sample values).
data_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [90]:
# Summary statistics (count, mean, std, quartiles, min/max) for the training split.
data_train.describe()

Unnamed: 0,age,fnlwgt,educational_num,capital-gain,capital-loss,hours-per-week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,0.24081
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,0.427581
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [91]:
# Preview the first rows of the test split; it has the same columns as the training split.
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,United-States,0


In [92]:
# Summary statistics for the test split, for comparison with the training split.
data_test.describe()

Unnamed: 0,age,fnlwgt,educational_num,capital-gain,capital-loss,hours-per-week,income
count,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0
mean,38.767459,189435.7,10.072907,1081.905104,87.899269,40.392236,0.236226
std,13.849187,105714.9,2.567545,7583.935968,403.105286,12.479332,0.424776
min,17.0,13492.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,116736.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,177831.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,238384.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1490400.0,16.0,99999.0,3770.0,99.0,1.0


In [93]:
# List the dtype of every training column; 'object' columns hold strings and need encoding later.
print(data_train.dtypes)

age                 int64
workclass          object
fnlwgt              int64
education          object
educational_num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income              int64
dtype: object


In [94]:
# List the dtype of every test column to check that it matches the training split.
print(data_test.dtypes)

age                 int64
workclass          object
fnlwgt              int64
education          object
educational_num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income              int64
dtype: object


## 2. Preprocessing
Each model was trained with its own preprocessing of the training dataset, so the test set must go through the corresponding preprocessing steps before it can be used to evaluate that model. The subsections below prepare one version of the test set per model.

In [95]:
# Imports shared by the preprocessing cells below
import pandas as pd
from sklearn.preprocessing import LabelEncoder

### 2.1 Prepare test data for the decision trees

In [96]:
# Load a fresh copy of the test split so this section starts from the raw CSV.
data_test = pd.read_csv('data/data_test.csv')

# Columns fed to the decision tree ('education' is left out; 'educational_num' is kept).
feature_names = ['age', 'workclass', 'fnlwgt', 'educational_num', 'marital-status', 'occupation', 'relationship',
                 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

# String-valued features that must be turned into integer codes for the tree.
tree_cat_columns = ['workclass', 'marital-status', 'occupation', 'relationship',
                    'race', 'gender', 'native-country']

# fit_transform re-fits the encoder on every call, so one instance is reused for each column.
# NOTE(review): the codes are learned from the test split itself. They only match the codes
# used at training time if both splits contain the same set of categories per column --
# confirm against the training notebook.
encoder = LabelEncoder()
for column in tree_cat_columns:
    data_test[column] = encoder.fit_transform(data_test[column])

# Preprocessed test set for decision tree
x_test_tree = data_test[feature_names]
y_test_tree = data_test['income']

### 2.2 Prepare test data for the k-NN
Load the data and encode categorical features.

In [97]:
# Start again from the raw test CSV; the previous section encoded its own copy.
data_test = pd.read_csv('data/data_test.csv')

# transformation
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are replaced by integer codes
cat_columns = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]

# Label-encode each categorical column of data_test.
# fit_transform re-fits `le` on every call, so a single encoder is reused.
test_knn = []  # not used by any cell visible in this notebook section
le = LabelEncoder()
for col in cat_columns:
    encoded_codes = le.fit_transform(data_test[col])
    data_test[col] = encoded_codes

# Separate the 'income' label from the feature columns
y_test_knn = data_test['income']
x_test_knn = data_test.drop(columns =['income'])

# Show the first 5 rows of the features, then of the label
for preview in (x_test_knn, y_test_knn):
    print(preview.head())

   age  workclass  fnlwgt  education  educational_num  marital-status  \
0   25          3  226802          1                7               4   
1   38          3   89814         11                9               2   
2   28          1  336951          7               12               2   
3   44          3  160323         15               10               2   
4   18          3  103497         15               10               4   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0           6             3     2       1             0             0   
1           4             0     4       1             0             0   
2          10             0     4       1             0             0   
3           6             0     2       1          7688             0   
4           9             3     4       0             0             0   

   hours-per-week  native-country  
0              40              37  
1              50              37  
2             

Make sure all features are on the same scale.

In [98]:
# Standard scaling
from sklearn import preprocessing

# Standardise every feature column with a freshly fitted StandardScaler.
# NOTE(review): the scaler is fitted on the test features themselves -- confirm this
# matches how the k-NN training features were scaled.
knn_features_float = x_test_knn.astype(float)
x_test_knn = preprocessing.StandardScaler().fit_transform(knn_features_float)
print(x_test_knn)


[[-0.99412926 -0.09851079  0.35347399 ... -0.21806206 -0.03143184
   0.25775643]
 [-0.05541716 -0.09851079 -0.94239062 ... -0.21806206  0.7699177
   0.25775643]
 [-0.77750339 -1.88752825  1.39544986 ... -0.21806206 -0.03143184
   0.25775643]
 ...
 [-0.05541716 -0.09851079  1.75522095 ... -0.21806206  0.7699177
   0.25775643]
 [ 0.37783458 -0.09851079 -0.99842039 ... -0.21806206 -0.03143184
   0.25775643]
 [-0.27204303  0.79599794 -0.0689392  ... -0.21806206  1.57126723
   0.25775643]]


### 2.3 Prepare test data for the neural network


In [99]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load test data
data_test = pd.read_csv('data/data_test.csv', header=0)

# Feature transformation: label-encode every string (object-dtype) column.
# The encoder is created here so this cell does not depend on the `encoder`
# object left behind by the decision-tree preprocessing cell.
encoder = LabelEncoder()
for col in data_test:
    if data_test[col].dtype == 'object':
        data_test[col] = encoder.fit_transform(data_test[col].astype(str))

# Feature scaling: min-max scale each column independently. The loop covers every
# column, including the 'income' label in the last position, so the label becomes
# a float (0.0 / 1.0).
scaler = MinMaxScaler()
for col in data_test.columns:
    data_test[col] = scaler.fit_transform(data_test[[col]])

# All columns except the last are features; the last column is the label.
x_test_nn = data_test.iloc[:, :-1]
y_test_nn = data_test.iloc[:, -1]

### 2.4 Prepare test data for the Bayesian learning


## 3. Predict the result

### 3.1 Result of decision tree

In [100]:
# Load the pruned decision tree and evaluate it on the tree-specific test set
import pickle
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# NOTE: only unpickle model files that come from a trusted source.
with open('trained_models/pruned_decision_tree.pkl', 'rb') as model_file:
    decision_tree = pickle.load(model_file)

print(decision_tree)

# Make predictions on the testing data: column 1 of predict_proba
# is the probability of ">50k"
tree_proba = decision_tree.predict_proba(x_test_tree)
y_pred_prob_tree = tree_proba[:, 1]

# Evaluate the model, turning probabilities into labels with a 0.5 threshold
print(classification_report(y_test_tree, y_pred_prob_tree > 0.5))


DecisionTreeClassifier(max_depth=9, max_features=8, min_samples_leaf=10,
                       min_samples_split=8)
              precision    recall  f1-score   support

           0       0.87      0.95      0.91     12435
           1       0.77      0.54      0.64      3846

    accuracy                           0.85     16281
   macro avg       0.82      0.75      0.77     16281
weighted avg       0.85      0.85      0.84     16281



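Note: pickled models should only be loaded from trusted sources. See the scikit-learn notes on the security and maintainability limitations of model persistence: https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations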


### 3.2 Result of k-NN

In [101]:
import joblib

# Restore the trained k-NN classifier from disk
knn_model = joblib.load('trained_models/kNN.pkl')
print(knn_model)

# Produce results: column 1 of predict_proba is the probability of ">50k"
knn_proba = knn_model.predict_proba(x_test_knn)
y_pred_knn = knn_proba[:, 1]
print(y_pred_knn)

# Evaluate the model, turning probabilities into labels with a 0.5 threshold
print(classification_report(y_test_knn, y_pred_knn > 0.5))


KNeighborsClassifier(leaf_size=16, metric='manhattan', n_neighbors=29)
[0.         0.24137931 0.62068966 ... 0.79310345 0.27586207 0.75862069]
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     12435
           1       0.70      0.57      0.63      3846

    accuracy                           0.84     16281
   macro avg       0.79      0.75      0.76     16281
weighted avg       0.83      0.84      0.84     16281



### 3.3 Result of neural networks

In [102]:
from tensorflow import keras

# Load the trained neural network model from its HDF5 file
nn_model = keras.models.load_model('trained_models/NeuralNetwork.h5')
# nn_model.summary()

# Produce results
y_pred_nn = nn_model.predict(x_test_nn)  # probability of ">50k"

# Evaluate the model, turning probabilities into labels with a 0.5 threshold
nn_pred_labels = y_pred_nn > 0.5
print(classification_report(y_test_nn, nn_pred_labels))

              precision    recall  f1-score   support

         0.0       0.87      0.94      0.90     12435
         1.0       0.74      0.55      0.63      3846

    accuracy                           0.85     16281
   macro avg       0.80      0.74      0.77     16281
weighted avg       0.84      0.85      0.84     16281



### 3.4 Result of Bayesian learning