In [1]:
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pandas as pd
from collections import namedtuple
from collections import UserList
from sklearn import tree

# new packages
from collections import defaultdict
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_linnerud
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, MinMaxScaler
import os

# customized class
from Node import Node
from helper import convert_onehot_v1, preprocess
from load_config import load_yaml, dump_yaml, safe_load_yaml
from decision_tree_more_features import Track_DT

## Initialize logger
# Configure root logging at INFO level with a "timestamp :: level :: message"
# line format, then create the named logger 'microsegmenter' for this notebook.
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s :: %(levelname)s :: %(message)s')
logger = logging.getLogger('microsegmenter')

In [2]:
# Read the run configuration, then the dataset at the path it names.
general_config = safe_load_yaml('general_config.yaml')
sample_data = pd.read_csv(general_config['dataset']['data_path'])

# Encoders are created here and handed to the project's `preprocess` helper,
# which returns the feature matrix, target, and column/dtype/target metadata.
ordinal_encoder, onehot_encoder = OrdinalEncoder(), OneHotEncoder()
X, y, column_names, dtypes, target_names = preprocess(
    sample_data,
    ordinal_encoder,
    onehot_encoder,
)

# Fixed seed so the train/test partition is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Decision-tree hyperparameters come from the `model.decision_tree` config section.
dt_settings = general_config['model']['decision_tree']
clf = DecisionTreeClassifier(
    max_leaf_nodes=dt_settings['max_leaf_nodes'],
    random_state=dt_settings['random_state'],
)
clf.fit(X_train, y_train)

# Build one Track_DT per entry of the config's `criteria` section and collect
# each instance's `summary_df`, keyed "summary <i>" by the entry's zero-based
# position in the config.
summary_dic = {}
for i, criteria_value in enumerate(general_config['criteria'].values()):
    interested_class = criteria_value['interested_class']
    min_samples = criteria_value['min_samples']
    lift = criteria_value['lift']
    base_rate = criteria_value['base_rate']
    # Bound to `tracked_tree`, not `tree`: the name `tree` is the `sklearn.tree`
    # module imported in the first cell, and rebinding it here would make later
    # `tree.plot_tree(...)`-style calls fail after this cell has run.
    # A fresh `rule_outcome_map` dict is passed on every iteration.
    tracked_tree = Track_DT(
        clf, column_names, target_names, dtypes,
        interested_class, min_samples, lift, base_rate,
        rule_outcome_map={},
    )
    summary_dic[f"summary {i}"] = tracked_tree.summary_df

2022-07-13 10:23:07,341 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0.12       0.1        0.08666667 0.25333333 0.44      ] , 150
2022-07-13 10:23:07,342 :: INFO :: Distribution(%) of interested class in the node : Na_to_K -  1 : [0.1]
2022-07-13 10:23:07,342 :: INFO :: Evaluation of node to meet the microsegmentation criteria : False
2022-07-13 10:23:07,342 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0.21428571 0.17857143 0.1547619  0.45238095 0.        ] , 84
2022-07-13 10:23:07,343 :: INFO :: Distribution(%) of interested class in the node : HIGH_BP -  1 : [0.17857143]
2022-07-13 10:23:07,343 :: INFO :: Evaluation of node to meet the microsegmentation criteria : False
2022-07-13 10:23:07,343 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0.         0.         0.25490196 0.74509804 0.        ] , 51
2022-07-13 10:23:07,344 :: INFO :: Distribution(%) of interested class in the

2022-07-13 10:23:07,371 :: INFO :: List of rules added to decision path so far
2022-07-13 10:23:07,372 :: INFO :: HIGH_BP > 0.5 , node type : object
2022-07-13 10:23:07,372 :: INFO :: Appending Rule Na_to_K <= 14.828500270843506
2022-07-13 10:23:07,372 :: INFO :: List of rules added to decision path so far
2022-07-13 10:23:07,373 :: INFO :: HIGH_BP > 0.5 , node type : object
2022-07-13 10:23:07,373 :: INFO :: Na_to_K <= 14.828500270843506 , node type : float64
2022-07-13 10:23:07,374 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0.21428571 0.17857143 0.1547619  0.45238095 0.        ] , 84
2022-07-13 10:23:07,374 :: INFO :: Distribution(%) of interested class in the node : HIGH_BP -  1 : [0.17857143]
2022-07-13 10:23:07,375 :: INFO :: Evaluation of node to meet the microsegmentation criteria : False
2022-07-13 10:23:07,375 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0.12       0.1        0.08666667 0.25333333 0.44

2022-07-13 10:23:07,401 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0.12       0.1        0.08666667 0.25333333 0.44      ] , 150
2022-07-13 10:23:07,402 :: INFO :: Distribution(%) of interested class in the node : Na_to_K -  2 : [0.08666667]
2022-07-13 10:23:07,402 :: INFO :: Evaluation of node to meet the microsegmentation criteria : False
2022-07-13 10:23:07,403 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0. 0. 0. 0. 1.] , 66
2022-07-13 10:23:07,403 :: INFO :: Distribution(%) of interested class in the node : HIGH_Cholesterol -  2 : [0.]
2022-07-13 10:23:07,403 :: INFO :: Evaluation of node to meet the microsegmentation criteria : False
2022-07-13 10:23:07,404 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0.12       0.1        0.08666667 0.25333333 0.44      ] , 150
2022-07-13 10:23:07,405 :: INFO :: Distribution(%) of interested class in the node : Na_to_K -  3 : [0.25333

2022-07-13 10:23:07,432 :: INFO :: Evaluation of node to meet the microsegmentation criteria : False
2022-07-13 10:23:07,432 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0. 1. 0. 0. 0.] , 15
2022-07-13 10:23:07,432 :: INFO :: Distribution(%) of interested class in the node : HIGH_Cholesterol -  3 : [0.]
2022-07-13 10:23:07,433 :: INFO :: Evaluation of node to meet the microsegmentation criteria : False
2022-07-13 10:23:07,433 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0.54545455 0.45454545 0.         0.         0.        ] , 33
2022-07-13 10:23:07,434 :: INFO :: Distribution(%) of interested class in the node : Age -  3 : [0.]
2022-07-13 10:23:07,434 :: INFO :: Evaluation of node to meet the microsegmentation criteria : False
2022-07-13 10:23:07,434 :: INFO :: Distribution(%) of classes in the node [0 1 2 3 4] and total samples: [0.21428571 0.17857143 0.1547619  0.45238095 0.        ] , 84
2022-07-13 10:23:07,4

Your desired maximum number of one hot columns is not valid, setting to be the current maximum number of one hot columns.
Branch: root, Node type : root , Feature : Na_to_K ,  Parent : Na_to_K , Left Child : HIGH_BP , Right Child : HIGH_Cholesterol , NodeID : 0
Branch: left, Node type : inter , Feature : HIGH_BP ,  Parent : Na_to_K , Left Child : LOW_BP , Right Child : Age , NodeID : 2
Branch: left, Node type : inter , Feature : LOW_BP ,  Parent : HIGH_BP , Left Child : HIGH_Cholesterol , Right Child : HIGH_Cholesterol , NodeID : 6
Branch: left, Node type : leaf , Feature : HIGH_Cholesterol , Parent : LOW_BP , NodeID : 10
Back Tracking!!
Branch: left, Node type : inter , Feature : LOW_BP ,  Parent : HIGH_BP , Left Child : HIGH_Cholesterol , Right Child : HIGH_Cholesterol , NodeID : 6
Branch: right, Node type : inter , Feature : HIGH_Cholesterol ,  Parent : LOW_BP , Left Child : HIGH_Cholesterol , Right Child : HIGH_Cholesterol , NodeID : 7
Branch: left, Node type : leaf , Feature : HIG

In [3]:
# Display the summary DataFrame stored under key 'summary 2', i.e. the third
# criteria entry (keys are numbered from 0 in config order).
summary_dic['summary 2']

Unnamed: 0,Unnamed: 1,interested_class,base_rate,feature,operator,threshold,total_sample_size,interested_class_sample_size,target_lift,actual_lift,data type
decision 0,constraint 0,3,10,Na_to_K,<=,14.8285,84,38,2,4.52,float64
decision 1,constraint 0,3,10,HIGH_BP,<=,0.5,51,38,2,7.45,object
decision 1,constraint 1,3,10,Na_to_K,<=,14.8285,84,38,2,4.52,float64
