In [1]:
import DataDealer as DD
from DecisionTreeGini import DecisionTreeGini as DT

# Tree hyper-parameters, grouped up front so they are easy to find and tune.
TREE_MIN_SAMPLE_SPLIT = 10
TREE_MAX_DEPTH = 5

# Column names handed to DD.read_data; 'income' (last) is the label column.
# The 'martial-status' spelling is kept as-is: it is a runtime key and it
# appears verbatim in the printed tree.
tHead = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'martial-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'income',
]

# Positions in tHead of the columns passed as paraNumeric.
# Evaluates to [0, 2, 4, 10, 11, 12].
numericIndex = [
    tHead.index(column)
    for column in (
        'age',
        'fnlwgt',
        'education-num',
        'capital-gain',
        'capital-loss',
        'hours-per-week',
    )
]

# NOTE(review): paraRemove=[-2] presumably drops the second-to-last raw column
# of the data file, which would explain why tHead has no name for it --
# confirm against DataDealer.read_data.
training_set = DD.read_data(
    './adult/adult.data',
    tHead,
    paraNumeric=numericIndex,
    paraRemove=[-2],
)
evaluation_set = DD.read_data(
    './adult/adult.test',
    tHead,
    paraNumeric=numericIndex,
    paraRemove=[-2],
    isTest=True,
)

# Report the sizes of both sets and the tree settings used for this run.
# The 'traing' spelling is reproduced on purpose: the recorded cell output
# contains it.
print(f"traing data size: {len(training_set)}\t evaluation data set: {len(evaluation_set)}")
print(f"min_samples_split={TREE_MIN_SAMPLE_SPLIT}, max_depth={TREE_MAX_DEPTH}")

traing data size: 30162	 evaluation data set: 15060
min_samples_split=10, max_depth=5


In [2]:
# Encode the training rows; the returned value dictionaries are reused later
# when the evaluation set is encoded.
tEncoded_data, tValue_dicts = DD.label_encode(training_set)

# Features are every value of an encoded row except the last one; this relies
# on 'income' being the final entry of each row.
tFeatures = [[*encoded_row.values()][:-1] for encoded_row in tEncoded_data]

# The label is read by key rather than by position.
tLabels = [encoded_row['income'] for encoded_row in tEncoded_data]

print("========== TRAINING MODEL on Training SET ==========")



In [3]:
# Build the Gini decision tree. The arguments are positional.
# NOTE(review): presumably (min_samples_split, max_depth, numeric column
# indices) in that order -- confirm against DecisionTreeGini.__init__.
tree = DT(TREE_MIN_SAMPLE_SPLIT, TREE_MAX_DEPTH, numericIndex)
# The value returned by fit() is stored on the instance as `tree.tree`; a later
# cell hands that attribute to DD.TreePrinter.
# NOTE(review): presumably fit() returns the root node and predict() reads
# self.tree -- confirm in DecisionTreeGini.
tree.tree = tree.fit(tFeatures, tLabels)
print("========== TRAINING MODEL FINISHED ==========")



In [4]:
print("========== EVALUATION on Training SET ==========")
# Predict on the same feature rows the tree was fitted on, so the metrics
# printed here are training-set figures rather than held-out ones.
tPredictions = tree.predict(tFeatures)
# The helper receives the full encoded rows (label included) plus predictions.
# NOTE(review): presumably it reads each row's 'income' as the ground truth --
# confirm in DataDealer.print_model_evaluation_result.
DD.print_model_evaluation_result(tEncoded_data, tPredictions)

Accuracy: 0.84275
Precision: 0.79062
Recall: 0.50093
F1 Score: 0.61329


In [5]:
print("========== EVALUATION on Evaluation SET ==========")

# Encode the evaluation rows, passing in the dictionaries produced while
# encoding the training set.
eEncoded_data, eValue_dicts = DD.label_encode(evaluation_set, tValue_dicts)

# As with the training features: every value of a row except the last one.
eFeatures = [[*encoded_row.values()][:-1] for encoded_row in eEncoded_data]

# Score the fitted tree on the evaluation rows and print the metrics.
ePredictions = tree.predict(eFeatures)
DD.print_model_evaluation_result(eEncoded_data, ePredictions)

Accuracy: 0.84064
Precision: 0.77993
Recall: 0.48946
F1 Score: 0.60146


In [6]:
print("========== DECISION TREE RESULT ==========")
# Render the fitted tree using the column names and value dictionaries.
# NOTE(review): this passes eValue_dicts (returned by the evaluation-set
# label_encode call) rather than tValue_dicts, which the training data was
# encoded with. That is only safe if label_encode returns or extends the
# dictionaries it was given -- confirm in DataDealer.label_encode.
DD.TreePrinter(tree.tree, tHead, eValue_dicts).print_tree()

|- martial-status is Married-civ-spouse
L   |- education-num <= 12
L   L   |- capital-gain <= 5013
L   L   L   |- education-num <= 8
L   L   L   L   |- capital-loss <= 1735
L   L   L   R   |- age <= 35
L   L   R   |- age <= 60
L   L   R   L   |- education is Preschool
L   L   R   L   L   |- INCOME <=50k
L   L   R   L   R   |- INCOME >50k
L   L   R   R   |- workclass is Local-gov
L   L   R   R   L   |- INCOME <=50k
L   L   R   R   R   |- INCOME >50k
L   R   |- capital-gain <= 5013
L   R   L   |- capital-loss <= 1740
L   R   L   L   |- hours-per-week <= 30
L   R   L   L   L   |- INCOME <=50k
L   R   L   L   R   |- INCOME >50k
L   R   L   R   |- capital-loss <= 1977
L   R   L   R   L   |- INCOME >50k
L   R   L   R   R   |- INCOME >50k
L   R   R   |- age <= 79
L   R   R   L   |- occupation is Farming-fishing
L   R   R   L   L   |- INCOME >50k
L   R   R   L   R   |- INCOME >50k
L   R   R   R   |- INCOME >50k
R   |- capital-gain <= 6849
R   L   |- education-num <= 12
R   L   L   |- capital-l

In [7]:
print("\n\n========== PREDICT RESULT OF EVALUATION SET ==========")
# Only the first 10 evaluation rows and their predictions are passed, which
# keeps the per-sample listing short.
DD.print_all_results_of_eval(eEncoded_data[:10], ePredictions[:10])



SAMPLE 1	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 2	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 3	: actual result - >50k	 predicted result - <=50k	 predict False.
SAMPLE 4	: actual result - >50k	 predicted result - >50k	 predict True.
SAMPLE 5	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 6	: actual result - >50k	 predicted result - >50k	 predict True.
SAMPLE 7	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 8	: actual result - <=50k	 predicted result - <=50k	 predict True.
SAMPLE 9	: actual result - >50k	 predicted result - >50k	 predict True.
SAMPLE 10	: actual result - <=50k	 predicted result - >50k	 predict False.
