# Model Comparison

## 1. Preprocessing

### 1.1 Prepare test data for the decision trees

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the held-out test split from disk.
data_test = pd.read_csv('data/data_test.csv')

# Columns handed to the decision tree, in this order.
feature_names = [
    'age', 'workclass', 'fnlwgt', 'educational_num', 'marital-status',
    'occupation', 'relationship', 'race', 'gender', 'capital-gain',
    'capital-loss', 'hours-per-week', 'native-country',
]

# Categorical columns that are replaced by integer codes.
tree_categorical_columns = [
    'workclass', 'marital-status', 'occupation', 'relationship',
    'race', 'gender', 'native-country',
]

# A single LabelEncoder is re-fitted for every column, so once the loop is done
# it only remembers the classes of the last column ('native-country').
# NOTE(review): the integer codes are learned from the test split itself;
# confirm they agree with the encoding used when the tree was trained.
encoder = LabelEncoder()
for column_name in tree_categorical_columns:
    data_test[column_name] = encoder.fit_transform(data_test[column_name])

# Features and target for the decision tree.
x_test_tree = data_test[feature_names]
y_test_tree = data_test['income']

### 1.2 Prepare test data for the k-NN

In [8]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Reload the test data so this cell starts from the raw file rather than from
# the frame mutated by an earlier cell.
data_test = pd.read_csv('data/data_test.csv')

# Categorical columns to replace with integer codes.
cat_columns = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]

# Label-encode each categorical column in place. The encoder is re-fitted per
# column, so after the loop it only holds the classes of the last column.
# NOTE(review): codes are learned from the test split itself; confirm they
# agree with the encoding used when the k-NN model was trained.
le = LabelEncoder()
for col in cat_columns:
    data_test[col] = le.fit_transform(data_test[col])

# Separate the target from the features. Features are cast to float once so
# the scaler is fitted and applied on exactly the same values.
y_test_knn = data_test['income']
features_knn = data_test.drop(columns=['income']).astype(float)

# Standard scaling (zero mean, unit variance per column). The result is a
# NumPy array, not a DataFrame.
# NOTE(review): the scaler is fitted on the test split; confirm this matches
# how the training features were scaled.
x_test_knn = preprocessing.StandardScaler().fit_transform(features_knn)

### 1.3 Prepare test data for the neural network


In [9]:
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load test data
data_test = pd.read_csv('data/data_test.csv', header=0)

# Feature transformation: label-encode every object-typed column.
# The encoder is created here so the cell no longer depends on an `encoder`
# variable left behind by another cell; it is re-fitted for each column, so
# afterwards it only holds the classes of the last object column.
encoder = LabelEncoder()
for col in data_test:
    if data_test[col].dtype == 'object':
        data_test[col] = encoder.fit_transform(data_test[col].astype(str))

# Feature scaling: rescale each column independently to [0, 1].
# This loop covers every column, including the last one that becomes
# y_test_nn below.
scaler = MinMaxScaler()
for col in data_test.columns:
    data_test[col] = scaler.fit_transform(data_test[[col]])

# All columns except the last are features; the last column is the target.
# NOTE(review): assumes the target is the final column of the CSV - confirm.
x_test_nn = data_test.iloc[:, :-1]
y_test_nn = data_test.iloc[:, -1]

### 1.4 Prepare test data for Bayesian learning


In [10]:
# Importing the testing data
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

data_test = pd.read_csv('data/data_test.csv')
# drop=True keeps the old index from being inserted as an extra 'index'
# column, which would otherwise show up in xs_test and in `numerical`.
data_test = data_test.reset_index(drop=True)
xs_test = data_test.drop(['income'], axis=1)
ys_test = data_test['income']

# Store all the categorical (object-typed) features
categorical = [var for var in xs_test.columns if xs_test[var].dtype == 'O']

# Store all the numerical (non-object) features
numerical = [var for var in xs_test.columns if xs_test[var].dtype != 'O']

# Discretize the numerical features into 10 equal-width ordinal bins.
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')

# (column in the CSV, column name in the discretized frame), in output order.
# NOTE(review): 'educational_num' is written out as 'educational-num';
# presumably that is the name the stored encoder expects - confirm before
# changing it.
discretized_columns = [
    ('age', 'age'),
    ('fnlwgt', 'fnlwgt'),
    ('educational_num', 'educational-num'),
    ('capital-gain', 'capital-gain'),
    ('capital-loss', 'capital-loss'),
    ('hours-per-week', 'hours-per-week'),
]

# Each column is binned on its own: kbins is re-fitted per column, so bin
# edges come from that column's range in the test split, and after the loop
# kbins holds the fit for the last column only.
numerical_trans_t = pd.concat(
    [
        pd.DataFrame(
            kbins.fit_transform(xs_test[source_col].values.reshape(-1, 1)),
            columns=[output_col],
        )
        for source_col, output_col in discretized_columns
    ],
    axis=1,
)

# Final feature frame: raw categorical columns followed by the binned numerics.
xs_bnb_test = pd.concat([xs_test[categorical], numerical_trans_t], axis=1)


In [11]:
import pickle

# Presumably required so the pickled encoder's classes can be resolved when it
# is loaded - confirm before removing.
import category_encoders as ce

# Load the previously trained encoder_bnb from disk.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load artifacts produced by this project.
with open("output/Bayes_Learning/encoder_bnb.pkl", "rb") as encoder_file:
    encoder_bnb = pickle.load(encoder_file)

# Encode the unseen test features with the loaded encoder; the labels are
# passed through unchanged.
x_test_nb, y_test_nb = encoder_bnb.transform(xs_bnb_test), ys_test