In [1]:
import DataDealer as DD
from DecisionTreeGini import DecisionTreeGini as DT

# Column names handed to DD.read_data for both Adult files. The last entry,
# 'income', is the label that the later cells read back out of each row.
# NOTE(review): 'martial-status' looks like a typo for 'marital-status'; it is
# kept verbatim because the name may be used as a key by DataDealer -- confirm
# before renaming.
tHead = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'martial-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'income',
]

# Positions in tHead passed as the numeric columns (age, fnlwgt,
# education-num, capital-gain, capital-loss, hours-per-week). The same list is
# given to the reader here and to the tree constructor in the training cell.
numericIndex = [0, 2, 4, 10, 11, 12]

# Hyper-parameters forwarded to the decision tree constructor.
TREE_MIN_SAMPLE_SPLIT = 10
TREE_MAX_DEPTH = 10

# Both files are read with identical column settings; only the test file sets
# isTest=True.
# NOTE(review): paraRemove=[-2] presumably drops the second-to-last raw column
# so the remaining columns line up with tHead -- confirm in DataDealer.
training_set = DD.read_data(
    './adult/adult.data',
    tHead,
    paraNumeric=numericIndex,
    paraRemove=[-2],
)
evaluation_set = DD.read_data(
    './adult/adult.test',
    tHead,
    paraNumeric=numericIndex,
    paraRemove=[-2],
    isTest=True,
)

# Report the number of loaded rows and the tree configuration in use.
print(f"traing data size: {len(training_set)}\t evaluation data set: {len(evaluation_set)}")
print(f"min_samples_split={TREE_MIN_SAMPLE_SPLIT}, max_depth={TREE_MAX_DEPTH}")

traing data size: 30162	 evaluation data set: 15060
min_samples_split=10, max_depth=10


In [2]:
# Encode the training rows. The second return value is reused later when the
# evaluation set is encoded.
tEncoded_data, tValue_dicts = DD.label_encode(training_set)

# Every encoded row is a mapping: all values except the last one form the
# feature vector, and the 'income' entry is the label. This relies on 'income'
# being the final value in each row.
tFeatures = [[*row.values()][:-1] for row in tEncoded_data]
tLabels = [row['income'] for row in tEncoded_data]
print("========== TRAINING MODEL on Training SET ==========")



In [3]:
# Construct the DecisionTreeGini model from the configured split/depth limits
# and the numeric column indices, then fit it on the encoded training data.
tree = DT(TREE_MIN_SAMPLE_SPLIT, TREE_MAX_DEPTH, numericIndex)
# The value returned by fit() is stored on tree.tree; the tree-printing cell
# reads it back from that attribute.
# NOTE(review): presumably fit() returns the root node -- confirm in DecisionTreeGini.
tree.tree = tree.fit(tFeatures, tLabels)
print("========== TRAINING MODEL FINISHED ==========")



In [4]:
# Score the fitted tree on the same feature rows it was trained on.
print("========== EVALUATION on Training SET ==========")
tPredictions = tree.predict(tFeatures)
# The encoded rows (which still carry the 'income' label) are passed together
# with the predictions to the metrics printer.
# NOTE(review): the recorded run shows very high precision but low recall even
# on the training data; worth checking whether the tree is splitting as
# intended -- confirm in DecisionTreeGini.
DD.print_model_evaluation_result(tEncoded_data, tPredictions)

Accuracy: 0.79773
Precision: 0.99230
Recall: 0.18887
F1 Score: 0.31733


In [5]:
print("========== EVALUATION on Evaluation SET ==========")

# Encode the evaluation rows, passing in the dictionaries produced while
# encoding the training set.
eEncoded_data, eValue_dicts = DD.label_encode(evaluation_set, tValue_dicts)

# Same feature layout as in training: every value of a row except the last.
eFeatures = [[*row.values()][:-1] for row in eEncoded_data]

# Predict with the already-fitted tree and print the metrics for this set.
ePredictions = tree.predict(eFeatures)
DD.print_model_evaluation_result(eEncoded_data, ePredictions)

Accuracy: 0.79907
Precision: 0.99269
Recall: 0.18351
F1 Score: 0.30976


In [6]:
# Pretty-print the fitted tree stored on tree.tree, using the column names and
# the value dictionaries to label the nodes.
print("========== DECISION TREE RESULT ==========")
# NOTE(review): this uses eValue_dicts (returned when encoding the evaluation
# set), so this cell only runs after the evaluation cell. The tree itself was
# fitted on data encoded with tValue_dicts -- confirm both map codes the same way.
# NOTE(review): the recorded output contains a decision node ("age <= 54.0")
# with no children printed beneath it -- check the printer / tree builder.
DD.TreePrinter(tree.tree, tHead, eValue_dicts).print_tree()

|- capital-gain <= 5119.0
L   |- INCOME <=50k
R   |- capital-gain <= 7073.5
R   L   |- capital-gain <= 5316.5
R   L   L   |- INCOME >50k
R   L   R   |- age <= 61.0
R   L   R   L   |- age <= 54.0
R   L   R   R   |- capital-gain <= 6618.5
R   L   R   R   L   |- INCOME >50k
R   L   R   R   R   |- INCOME <=50k
R   R   |- age <= 20.0
R   R   L   |- INCOME <=50k
R   R   R   |- education-num <= 1.5
R   R   R   L   |- INCOME <=50k
R   R   R   R   |- age <= 60.5
R   R   R   R   L   |- fnlwgt <= 24872.5
R   R   R   R   L   L   |- INCOME >50k
R   R   R   R   L   R   |- INCOME >50k
R   R   R   R   R   |- capital-gain <= 10585.5
R   R   R   R   R   L   |- capital-gain <= 10543.0
R   R   R   R   R   L   L   |- INCOME >50k
R   R   R   R   R   L   R   |- INCOME <=50k
R   R   R   R   R   R   |- fnlwgt <= 34367.0
R   R   R   R   R   R   L   |- INCOME <=50k
R   R   R   R   R   R   R   |- INCOME >50k


In [7]:
# Show a per-sample comparison for the first 10 evaluation rows only; both
# the encoded rows and the predictions are sliced so they stay aligned.
print("\n\n========== PREDICT RESULT OF EVALUATION SET ==========")
DD.print_all_results_of_eval(eEncoded_data[:10], ePredictions[:10])



SAMPLE 1	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 2	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 3	: actual result - >50k	 predicted result - <=50k	 predict False.
SAMPLE 4	: actual result - >50k	 predicted result - >50k	 predict True.
SAMPLE 5	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 6	: actual result - >50k	 predicted result - <=50k	 predict False.
SAMPLE 7	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 8	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 9	: actual result - >50k	 predicted result - >50k	 predict True.
SAMPLE 10	: actual result - <=50k	 predicted result - <=50k	 predict True.
