In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, load_breast_cancer, load_wine, load_diabetes, make_regression 
from sklearn import tree
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, mean_squared_error

# If AdditiveDecisionTree.py is not in the current folder, specify the path.
# The path is written as a raw string so that the backslashes of the Windows
# path are never treated as escape sequences (a plain literal containing
# "\p" or "\A" triggers an invalid-escape warning on recent Python versions).
import sys  
sys.path.insert(0, r'C:\python_projects\AdditiveDecisionTree_project\AdditiveDecisionTree') 
from AdditiveDecisionTree import AdditiveDecisionTreeClasssifier, AdditiveDecisionTreeRegressor

np.random.seed(0)

## Methods used to load the toy datasets

In [2]:
# Classification datasets 

def get_iris():
    """Load the iris dataset as pandas objects.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a DataFrame whose columns are the
        dataset's feature names, ``y`` is a Series holding the targets.
    """
    bunch = load_iris()
    features = pd.DataFrame(bunch.data, columns=bunch['feature_names'])
    target = pd.Series(bunch.target)
    return features, target

def get_breast_cancer():
    """Load the breast cancer dataset.

    Returns:
        The ``(X, y)`` pair produced by
        ``load_breast_cancer(return_X_y=True, as_frame=True)``.
    """
    features, target = load_breast_cancer(return_X_y=True, as_frame=True)
    return features, target

def get_wine():
    """Load the wine dataset.

    Returns:
        The ``(X, y)`` pair produced by
        ``load_wine(return_X_y=True, as_frame=True)``.
    """
    features, target = load_wine(return_X_y=True, as_frame=True)
    return features, target

# Regression datasets

def get_diabetes():
    """Load the diabetes regression dataset as pandas objects.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a DataFrame whose columns are the
        dataset's feature names, ``y`` is a Series holding the targets.
    """
    bunch = load_diabetes()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    target = pd.Series(bunch.target)
    return features, target

# def get_linnerud():
#     data = load_linnerud(as_frame=True)
#     X = data.data
#     y = data.target['Weight']
#     return X,y

def get_make_regression():
    """Generate a noise-free synthetic regression dataset.

    NumPy's global random generator is re-seeded with 0 before
    ``make_regression(noise=0.0)`` is called, so each call to this function
    also resets the global random state.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a DataFrame with default integer column
        labels, ``y`` is a Series holding the targets.
    """
    np.random.seed(0)
    features, target = make_regression(noise=0.0)
    return pd.DataFrame(features), pd.Series(target)

## Example using sklearn's Decision Tree and AdditiveDecisionTree on toy datasets

In [3]:
# Note: this provides only an example of using AdditiveDecisionTree and does not 
# properly test its accuracy. We can, though, see that in terms of test scores,
# ADT (Additive Decision Trees) often do about the same as DT (standard Decision
# Trees), but sometimes one or the other does better. 
# Training scores are also shown to give a sense of overfitting.

# To estimate complexity for DTs, we use the number of nodes
# To estimate complexity for ADTs, we call get_model_complexity(),
# which is similar, but considers that additive nodes are more complex.

def evaluate_model(clf, clf_desc, X_train, X_test, y_train, y_test):
    """Fit a classifier and print its macro-F1 scores and complexity.

    Args:
        clf: Model exposing ``fit`` and ``predict``.
        clf_desc: Label used as the prefix of the printed summary line.
        X_train, X_test: Feature sets used for fitting / held-out scoring.
        y_train, y_test: Targets matching ``X_train`` / ``X_test``.

    The complexity is ``clf.get_model_complexity()`` when the model has that
    method, otherwise the length of ``clf.tree_.feature`` when the model has
    a ``tree_`` attribute, otherwise 0. Nothing is returned; the summary is
    printed.
    """
    clf.fit(X_train, y_train)

    # Score the training data first, then the held-out data.
    train_f1 = f1_score(y_train, clf.predict(X_train), average='macro')
    test_f1 = f1_score(y_test, clf.predict(X_test), average='macro')

    if hasattr(clf, "get_model_complexity"):
        complexity = clf.get_model_complexity()
    elif hasattr(clf, "tree_"):
        complexity = len(clf.tree_.feature)
    else:
        complexity = 0

    print(f"{clf_desc}: Training score: {round(train_f1,2)}, Testing score: {round(test_f1,2)}, Complexity: {complexity}")

    
def evaluate_dataset(dataset_name, X, y):
    """Compare a standard and an additive decision tree classifier on one dataset.

    Prints the dataset name, splits the data with ``train_test_split``
    (``random_state=0``), then fits and reports each model via
    ``evaluate_model``.

    Args:
        dataset_name: Heading printed before the two result lines.
        X: Feature table.
        y: Targets.

    Returns:
        The fitted AdditiveDecisionTreeClasssifier.
    """
    print(f"\n{dataset_name}")
    # train_test_split returns [X_train, X_test, y_train, y_test], which is
    # exactly the order of evaluate_model's trailing parameters.
    split = train_test_split(X, y, random_state=0)

    baseline_tree = tree.DecisionTreeClassifier(max_depth=4, random_state=42)
    evaluate_model(baseline_tree, "Standard DT", *split)

    additive_tree = AdditiveDecisionTreeClasssifier(
        max_depth=4,
        allow_additive_nodes=True,
        verbose_level=0,
    )
    evaluate_model(additive_tree, "Additive DT", *split)
    return additive_tree
    
    
# Run the comparison on every classification dataset in turn. After the loop,
# X, y and adt refer to the last dataset (Breast Cancer), which the following
# cells rely on.
for dataset_name, load_dataset in (
    ("Iris", get_iris),
    ("Wine", get_wine),
    ("Breast Cancer", get_breast_cancer),
):
    X, y = load_dataset()
    adt = evaluate_dataset(dataset_name, X, y)


Iris
Standard DT: Training score: 1.0, Testing score: 0.97, Complexity: 13
Additive DT: Training score: 0.96, Testing score: 0.88, Complexity: 5

Wine
Standard DT: Training score: 1.0, Testing score: 0.92, Complexity: 13
Additive DT: Training score: 0.97, Testing score: 0.95, Complexity: 7

Breast Cancer
Standard DT: Training score: 0.99, Testing score: 0.92, Complexity: 23
Additive DT: Training score: 0.97, Testing score: 0.91, Complexity: 11


## Summary Output of the AdditiveDecisionTree

In [4]:
# This continues the example with the Breast Cancer dataset.

# The output explaining an Additive Decision Tree is similar to that of
# scikit-learn decision trees, though it has slightly more information.
# For example, it provides the depth of each node and the class counts 
# in each node. 

# Here node 3 is an additive node. In the Features list, it is specified
# as feature -100. In the "Features in additive nodes" list, we see it
# uses both feature 1 and feature 13. 

adt.output_tree()


********************************************************
Generated Tree
********************************************************

# Nodes: 9

Left Chidren:
[1, 3, 5, -2, -2, 7, -2, -2, -2]

Right Chidren:
[2, 4, 6, -2, -2, 8, -2, -2, -2]

# Rows: 
[426, 260, 166, 252, 8, 30, 136, 14, 16]

Features:
[7, 20, 23, -100, -2, 21, -2, -2, -2]

Features in additive nodes:
[[], [], [], [1, 13], [], [], [], [], []]

Thresholds:
[0.04891999997198582, 17.589999198913574, 785.7999877929688, 21.574999809265137, -2, 23.739999771118164, -2, -2, -2]

Depths:
[0, 1, 1, 2, 2, 2, 2, 3, 3]

Can split: 
[True, True, True, True, True, True, True, True, True]

Class counts:
[[159, 267], [13, 247], [146, 20], [7, 245], [6, 2], [13, 17], [133, 3], [0, 14], [13, 3]]

Leaf Class Counts:
[[7, 245], [6, 2], [133, 3], [0, 14], [13, 3]]

Node igr: 
[0.4156254639152989, 0.2031712696855239, 0.29661687709662865, 0.18239393289682015, -2, 0.43241893359216155, -2, -2, -2]
**************************************************

## Explanations of Predictions

In [5]:
# Explain the predictions for the first five rows; each explanation is
# given in the form of the decision path.

exp_arr = adt.get_explanations(X[:5], y[:5])
for exp in exp_arr:
    # A blank-line separator followed by the explanation itself.
    print("\n", exp, sep="\n")



Initial distribution of classes: [0, 1]: [159, 267]


...............................................................
Prediction for row 0: 0 -- Correct
...............................................................
Path: [0, 2, 6]

mean concave points is greater than 0.04891999997198582 
    (has value: 0.1471) --> (Class distribution: [146, 20]

AND worst area is greater than 785.7999877929688 
    (has value: 2019.0) --> (Class distribution: [133, 3]
where the majority class is: 0


...............................................................
Prediction for row 1: 0 -- Correct
...............................................................
Path: [0, 2, 6]

mean concave points is greater than 0.04891999997198582 
    (has value: 0.07017) --> (Class distribution: [146, 20]

AND worst area is greater than 785.7999877929688 
    (has value: 1956.0) --> (Class distribution: [133, 3]
where the majority class is: 0


...............................................................
Pre

In [6]:
# Row 19 is an example whose decision path passes through node 3,
# which is an additive node.

exp_arr = adt.get_explanations(X.loc[19:19], y.loc[19:19])
for exp in exp_arr:
    # A blank-line separator followed by the explanation itself.
    print("\n", exp, sep="\n")



Initial distribution of classes: [0, 1]: [159, 267]


...............................................................
Prediction for row 0: 1 -- Correct
...............................................................
Path: [0, 1, 3]

mean concave points is less than 0.04891999997198582 
    (has value: 0.04781) --> (Class distribution: [13, 247]

AND worst radius is less than 17.589999198913574 
    (has value: 15.11) --> (Class distribution: [7, 245]

AND vote based on: 
  1: mean texture is less than 21.574999809265137
     (has value 14.36)  --> (class distribution: [1, 209])
  2: area error is less than 42.19000053405762
     (has value 23.56)  --> (class distribution: [4, 243])
The class with the most votes is 1


## Example with Regression

In [7]:
# Note: this provides only an example of using AdditiveDecisionTree and does 
# not properly test its accuracy

# In these examples, the additive decision trees provide slightly lower errors
# but slightly higher complexity.

# In general, Additive Decision Trees tend to work better for classification 
# than for regression, at least with default hyperparameters.


def evaluate_model(clf, clf_desc, X_train, X_test, y_train, y_test):
    """Fit a regressor and print its train/test MSE and complexity.

    Args:
        clf: Model exposing ``fit`` and ``predict``.
        clf_desc: Label used as the prefix of the printed summary line.
        X_train, X_test: Feature sets used for fitting / held-out scoring.
        y_train, y_test: Targets matching ``X_train`` / ``X_test``.

    The complexity is ``clf.get_model_complexity()`` when the model has that
    method, otherwise the length of ``clf.tree_.feature`` when the model has
    a ``tree_`` attribute, otherwise 0. Nothing is returned; the summary is
    printed.
    """
    clf.fit(X_train, y_train)

    # Score the training data first, then the held-out data.
    train_mse = mean_squared_error(y_train, clf.predict(X_train))
    test_mse = mean_squared_error(y_test, clf.predict(X_test))

    if hasattr(clf, "get_model_complexity"):
        complexity = clf.get_model_complexity()
    elif hasattr(clf, "tree_"):
        complexity = len(clf.tree_.feature)
    else:
        complexity = 0

    print(f"{clf_desc}: Training MSE: {round(train_mse,2)}, Testing MSE: {round(test_mse,2)}, Complexity: {complexity}")

    
def evaluate_dataset(dataset_name, X, y):
    """Compare a standard and an additive decision tree regressor on one dataset.

    Prints the dataset name, splits the data with ``train_test_split``
    (``random_state=0``), then fits and reports each model via
    ``evaluate_model``.

    Args:
        dataset_name: Heading printed before the two result lines.
        X: Feature table.
        y: Targets.

    Returns:
        The fitted AdditiveDecisionTreeRegressor.
    """
    print(f"\n{dataset_name}")
    # train_test_split returns [X_train, X_test, y_train, y_test], which is
    # exactly the order of evaluate_model's trailing parameters.
    split = train_test_split(X, y, random_state=0)

    baseline_tree = tree.DecisionTreeRegressor(
        max_depth=4,
        min_samples_leaf=5,
        random_state=42,
    )
    evaluate_model(baseline_tree, "Standard DT", *split)

    additive_tree = AdditiveDecisionTreeRegressor(
        max_depth=4,
        min_samples_leaf=5,
        allow_additive_nodes=True,
        verbose_level=0,
    )
    evaluate_model(additive_tree, "Additive DT", *split)
    return additive_tree
  
    
# Run the comparison on every regression dataset in turn. After the loop,
# X, y and adt refer to the last dataset (Make Regression).
for dataset_name, load_dataset in (
    ("Diabetes", get_diabetes),
    ("Make Regression", get_make_regression),
):
    X, y = load_dataset()
    adt = evaluate_dataset(dataset_name, X, y)


Diabetes
Standard DT: Training MSE: 2281.54, Testing MSE: 4373.97, Complexity: 29
Additive DT: Training MSE: 2159.58, Testing MSE: 4291.76, Complexity: 33

Make Regression
Standard DT: Training MSE: 3487.28, Testing MSE: 23856.35, Complexity: 17
Additive DT: Training MSE: 3302.9, Testing MSE: 21077.32, Complexity: 20


In [8]:
# Print the structure of the most recently fitted regressor: `adt` was last
# assigned from the "Make Regression" run above.
adt.output_tree()


********************************************************
Generated Tree
********************************************************

# Nodes: 13

Left Chidren:
[1, 3, 5, -2, 7, 9, -2, 11, -2, -2, -2, -2, -2]

Right Chidren:
[2, 4, 6, -2, 8, 10, -2, 12, -2, -2, -2, -2, -2]

# Rows: 
[75, 53, 22, 7, 46, 16, 6, 31, 15, 11, 5, 25, 6]

Features:
[57, 53, 46, -2, 43, 41, -2, 14, -100, -100, -2, -2, -2]

Features in additive nodes:
[[], [], [], [], [], [], [], [], [32, 96], [72, 15, 79, 85, 96], [], [], []]

Thresholds:
[0.2633100152015686, -0.9771790504455566, 0.5912367105484009, -2, 0.3434883654117584, 1.1032692193984985, -2, 0.5415648818016052, -0.8923328518867493, -0.13171404972672462, -2, -2, -2]

Depths:
[0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4]

Can split: 
[True, True, True, True, True, True, True, True, True, True, True, True, True]

Average target values:
[-28.94558781986425, -72.30996714891963, 75.5231442001328, 66.46907443748522, -93.42851695554644, 35.158511395759106, 183.16216501179

## Example Tuning Hyperparameters with a Cross-Validated Randomized Search

In [9]:
# Note: this can take several minutes to execute.

X,y = get_diabetes()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Candidate values for each hyperparameter; the randomized search below
# samples combinations from these rather than trying all of them.
parameters = {
    'min_samples_split': (5,10,25,50), 
    'min_samples_leaf': (5,10,15),
    'max_depth': (4,5,6,7),
    'allow_additive_nodes': (True, False),
    'max_added_splits_per_node': (2,3,4,5,10)
}

estimator = AdditiveDecisionTreeRegressor(max_depth=4, min_samples_leaf=5)
# random_state is fixed so that the same candidates are sampled on every run,
# independently of how much of NumPy's global random stream earlier cells
# have consumed.
gs_estimator = RandomizedSearchCV(
    estimator,
    parameters,
    scoring='neg_mean_squared_error',
    n_iter=100,
    random_state=0,
)
gs_estimator.fit(X_train, y_train)
y_pred = gs_estimator.predict(X_test)
# mean_squared_error takes (y_true, y_pred), in that order.
test_score = mean_squared_error(y_test, y_pred)

print("test_score: ", test_score)
print("best estimator: ", gs_estimator.best_estimator_)

test_score:  4277.794998844322
best estimator:  min_samples_split: 25, min_samples_leaf: 15, max_depth: 5, allow_additive_nodes: True, max_added_splits_per_node: 5


In [10]:
# Rebuild the best configuration found during tuning and fit it on the
# training split; the cell's value is the fitted model's complexity.

best_params = {
    'min_samples_split': 25,
    'min_samples_leaf': 15,
    'max_depth': 5,
    'allow_additive_nodes': True,
    'max_added_splits_per_node': 5,
}
adt = AdditiveDecisionTreeRegressor(**best_params)
adt.fit(X_train, y_train)

adt.get_model_complexity()

41