# Model Ensemble

## 1. Collect and Explore the Data
First, take a look at the training and test data.

In [67]:
import pandas as pd

# Read the training and test splits from CSV files in the working directory.
data_train = pd.read_csv("data_train.csv")
data_test = pd.read_csv("data_test.csv")

# Report (rows, columns) for each split, one per line.
print(data_train.shape, data_test.shape, sep="\n")

(32561, 15)
(16281, 15)


In [68]:
# Preview the first rows of the training split.
data_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [69]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training split.
data_train.describe()

Unnamed: 0,age,fnlwgt,educational_num,capital-gain,capital-loss,hours-per-week,income
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,0.24081
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,0.427581
min,17.0,12285.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0


In [70]:
# Preview the first rows of the test split.
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,United-States,0


In [71]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test split.
data_test.describe()

Unnamed: 0,age,fnlwgt,educational_num,capital-gain,capital-loss,hours-per-week,income
count,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0
mean,38.767459,189435.7,10.072907,1081.905104,87.899269,40.392236,0.236226
std,13.849187,105714.9,2.567545,7583.935968,403.105286,12.479332,0.424776
min,17.0,13492.0,1.0,0.0,0.0,1.0,0.0
25%,28.0,116736.0,9.0,0.0,0.0,40.0,0.0
50%,37.0,177831.0,10.0,0.0,0.0,40.0,0.0
75%,48.0,238384.0,12.0,0.0,0.0,45.0,0.0
max,90.0,1490400.0,16.0,99999.0,3770.0,99.0,1.0


In [72]:
# Show the dtype of every column in the training split.
print(data_train.dtypes)

age                 int64
workclass          object
fnlwgt              int64
education          object
educational_num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income              int64
dtype: object


In [73]:
# Show the dtype of every column in the test split.
print(data_test.dtypes)

age                 int64
workclass          object
fnlwgt              int64
education          object
educational_num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income              int64
dtype: object


## 2. Preprocessing
Because each contributor may have preprocessed the training dataset differently, the same preprocessing steps must be applied to the test set before it can be used to evaluate the corresponding model.

### 2.1 Prepare test data for the decision tree

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


data_test = pd.read_csv('data/data_test.csv')

# Features fed to the decision tree ('education' is left out; 'educational_num'
# is kept).
feature_names = ['age', 'workclass', 'fnlwgt', 'educational_num', 'marital-status', 'occupation', 'relationship',
                 'race', 'gender', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

# String-valued feature columns that must be converted to integer codes.
categorical_columns = ['workclass', 'marital-status', 'occupation',
                       'relationship', 'race', 'gender', 'native-country']

# LabelEncoder numbers categories by their sorted position among the values it
# was fitted on. Fitting on the test split alone only reproduces the
# training-time codes when both splits contain exactly the same categories; a
# category missing from the test split silently shifts every later code.
# Fit each encoder on the training split and only transform the test split.
# NOTE(review): assumes the pickled tree was trained on LabelEncoder codes
# fitted on `data_train` (the frame loaded in section 1) -- confirm against the
# training notebook. transform() raises ValueError if the test split contains a
# category that never appears in the training split.
for column in categorical_columns:
    encoder = LabelEncoder().fit(data_train[column])
    data_test[column] = encoder.transform(data_test[column])

# Preprocessed test set for decision tree
x_test_tree = data_test[feature_names]
y_test_tree = data_test['income']

# Preprocessed test set
print(x_test_tree)

       age  workclass  fnlwgt  educational_num  marital-status  occupation  \
0       25          3  226802                7               4           6   
1       38          3   89814                9               2           4   
2       28          1  336951               12               2          10   
3       44          3  160323               10               2           6   
4       18          3  103497               10               4           9   
...    ...        ...     ...              ...             ...         ...   
16276   39          3  215419               13               0           9   
16277   64          3  321403                9               6           9   
16278   38          3  374983               13               2           9   
16279   44          3   83891               13               0           0   
16280   35          4  182148               13               2           3   

       relationship  race  gender  capital-gain  capital-loss  

## 3. Predict the result

### 3.1 Result of decision tree

In [15]:
# Load the decision tree
# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# 'pruned_decision_tree.pkl' must come from a trusted source.
# NOTE(review): the recorded output of this cell links to scikit-learn's
# model-persistence limitations page -- presumably the model was pickled with a
# different scikit-learn version than the one loading it; confirm.
import pickle

with open('pruned_decision_tree.pkl', 'rb') as f:
    decision_tree = pickle.load(f)

# Bare expression so the notebook displays the loaded estimator.
decision_tree

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score


# Probability of the second class (column 1 of predict_proba) for every test
# row -- presumably income == 1; confirm against decision_tree.classes_.
y_pred_prob = decision_tree.predict_proba(x_test_tree)[:, 1]

# Hard labels: a row is predicted positive when its probability exceeds 0.5.
y_pred_label = y_pred_prob > 0.5

# Threshold-based metrics use the hard labels; AUC uses the raw probabilities.
accuracy = accuracy_score(y_test_tree, y_pred_label)
precision = precision_score(y_test_tree, y_pred_label)
recall = recall_score(y_test_tree, y_pred_label)
auc = roc_auc_score(y_test_tree, y_pred_prob)

for label, score in (("Accuracy:", accuracy), ("Precision:", precision),
                     ("Recall:", recall), ("AUC:", auc)):
    print(label, score)

Accuracy: 0.8540015969535041
Precision: 0.772744151503899
Recall: 0.5410816432657306
AUC: 0.9023374799085249


In [22]:
# Score the test rows and keep column 1 of the per-class probability matrix.
class_probabilities = decision_tree.predict_proba(x_test_tree)
y_pred_prob = class_probabilities[:, 1]

# Bare expression so the notebook displays the probability array.
y_pred_prob

array([0.00299307, 0.36458333, 0.25      , ..., 0.6702396 , 0.45      ,
       0.78494624])