In [39]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, load_breast_cancer, load_wine 
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score

# TODO: remove once the package can be installed with pip.
import sys
# Raw string literal: the Windows path contains backslash sequences (`\p`, `\A`)
# that are not valid escapes in a normal string and trigger an invalid-escape
# warning on current Python versions. The resulting path value is unchanged.
sys.path.insert(0, r'C:\python_projects\AdditiveDecisionTree_project\AdditiveDecisionTree')
from AdditiveDecisionTree import AdditiveDecisionTreeClasssifier

# Seed NumPy's global random number generator with a fixed value so that any
# code drawing from it behaves the same way on every run of the notebook.
np.random.seed(0)

## Constants used to specify the tests below

In [40]:
# Name of the dataset the cells below are run against.
# Set to one of: "iris", "breast_cancer", or "wine"
DATASET_TESTED = "breast_cancer"

In [41]:
def get_iris():
    """Load the iris dataset.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a DataFrame whose columns are the
        dataset's feature names and ``y`` is a Series of the target values.
    """
    bunch = load_iris()
    features = pd.DataFrame(bunch.data, columns=bunch['feature_names'])
    labels = pd.Series(bunch.target)
    return features, labels

def get_breast_cancer():
    """Load the breast cancer dataset as pandas objects.

    Returns:
        A tuple ``(X, y)`` as produced by ``load_breast_cancer`` with
        ``return_X_y=True`` and ``as_frame=True``.
    """
    features, labels = load_breast_cancer(as_frame=True, return_X_y=True)
    return features, labels

def get_wine():
    """Load the wine dataset as pandas objects.

    Returns:
        A tuple ``(X, y)`` as produced by ``load_wine`` with
        ``return_X_y=True`` and ``as_frame=True``.
    """
    features, labels = load_wine(as_frame=True, return_X_y=True)
    return features, labels

## Example using sklearn's Decision Tree and AdditiveDecisionTree on the selected dataset

In [42]:
# Note: this provides only an example of using AdditiveDecisionTree and does not properly test its accuracy.

# Map each supported dataset name to the function that loads it.
DATASET_LOADERS = {
    "iris": get_iris,
    "breast_cancer": get_breast_cancer,
    "wine": get_wine,
}

# Raise explicitly rather than using `assert False`: assertions are removed when
# Python runs with -O, which would let an invalid name fall through and leave
# X and y undefined for the cells below.
if DATASET_TESTED not in DATASET_LOADERS:
    raise ValueError(
        f"Not a valid test dataset: {DATASET_TESTED!r} "
        f"(expected one of {sorted(DATASET_LOADERS)})"
    )

X, y = DATASET_LOADERS[DATASET_TESTED]()

# Split into train and test sets using train_test_split's default proportions;
# the fixed random_state keeps the split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

def evaluate_model(clf, clf_desc):
    """Fit a classifier and print its train/test macro-F1 scores and complexity.

    The classifier is fitted on the module-level ``X_train``/``y_train`` and
    scored on both the training data and ``X_test``/``y_test``.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.
        clf_desc: Label printed in front of the results.

    The reported complexity is ``clf.get_model_complexity()`` when the
    estimator provides that method, otherwise the length of
    ``clf.tree_.feature`` when a ``tree_`` attribute exists, otherwise 0.
    """
    clf.fit(X_train, y_train)

    train_f1 = f1_score(y_train, clf.predict(X_train), average='macro')
    test_f1 = f1_score(y_test, clf.predict(X_test), average='macro')

    # Prefer the estimator's own complexity measure; fall back to the size of
    # the fitted tree's feature array, and to 0 when neither is available.
    if hasattr(clf, "get_model_complexity"):
        size = clf.get_model_complexity()
    elif hasattr(clf, "tree_"):
        size = len(clf.tree_.feature)
    else:
        size = 0

    print(f"{clf_desc}: Training score: {round(train_f1,2)}, Testing score: {round(test_f1,2)}, Complexity: {size}")
  
# Baseline: scikit-learn's standard decision tree with a depth limit of 4.
dt_1 = tree.DecisionTreeClassifier(
    max_depth=4,
    random_state=42,
)
evaluate_model(dt_1, "Standard DT")

# Optional variant with additive nodes switched off (kept disabled):
# edt1 = AdditiveDecisionTreeClasssifier(max_depth=4, allow_additive_nodes=False, verbose_level=0)
# evaluate_model(edt1, "Additive DT (without additive features)")

# Additive decision tree with the same depth limit and additive nodes enabled.
edt = AdditiveDecisionTreeClasssifier(
    max_depth=4,
    allow_additive_nodes=True,
    verbose_level=0,
)
evaluate_model(edt, "Additive DT")

Standard DT: Training score: 0.99, Testing score: 0.92, Complexity: 23
Additive DT: Training score: 0.97, Testing score: 0.93, Complexity: 18


## Summary Output of the AdditiveDecisionTree

In [43]:
# Display the structure of the fitted additive tree using the model's own
# output_tree() helper; its text output appears directly below this cell.
edt.output_tree()


********************************************************
Generated Tree
********************************************************
# Nodes: 9
Left Chidren: [1, 3, 5, -2, -2, 7, -2, -2, -2]
Right Chidren: [2, 4, 6, -2, -2, 8, -2, -2, -2]
Features: [7, 23, 23, -100, -2, 21, -2, -2, -2]
Thresholds: [0.04891999997198582, 952.8999938964844, 785.7999877929688, 21.574999809265137, -2, 23.739999771118164, -2, -2, -2]
Depths: [0, 1, 1, 2, 2, 2, 2, 3, 3]
Class counts: [[159, 267], [13, 247], [146, 20], [7, 245], [6, 2], [13, 17], [133, 3], [0, 14], [13, 3]]
Leaf Class Counts: [[7, 245], [6, 2], [133, 3], [0, 14], [13, 3]]
Can split:  [True, True, True, True, True, True, True, True, True]
Node igr:  [0.4156254639152989, 0.2031712696855239, 0.29661687709662865, 0.18239393289682015, -2, 0.43241893359216155, -2, -2, -2]
********************************************************



## Example Tuning Hyperparameters with a Cross Validated Grid Search

In [13]:
# Candidate hyperparameter values; GridSearchCV evaluates every combination.
parameters = {
    'min_samples_split': (5, 10, 25, 50),
    'min_samples_leaf': (5, 10, 15),
    'max_depth': (4, 5, 6),
    'allow_additive_nodes': (True, False)
}

# Cross-validated grid search on the training split, scored by macro F1.
estimator = AdditiveDecisionTreeClasssifier()
gs_estimator = GridSearchCV(estimator, parameters, scoring='f1_macro')
gs_estimator.fit(X_train, y_train)

# Score the search's chosen model on the held-out test split.
y_pred = gs_estimator.predict(X_test)
# f1_score takes (y_true, y_pred) in that order. The original call passed them
# reversed; macro F1 happens to be unaffected by the swap, but the correct
# order matches evaluate_model and stays right if the averaging mode changes.
test_score = f1_score(y_test, y_pred, average="macro")

print("test_score: ", test_score)
print("best estimator: ", gs_estimator.best_estimator_)

test_score:  0.9537664528176862
best estimator:  min_samples_split: 5, min_samples_leaf: 5, max_depth: 4, allow_additive_nodes: False
