In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, load_breast_cancer, load_wine, load_boston, load_diabetes, load_linnerud 
from sklearn import tree
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, mean_squared_error

# Todo: remove once have pip install
# Until the package can be pip-installed, make the local checkout importable by
# putting its directory at the front of sys.path. The location is
# machine-specific: point it at wherever AdditiveDecisionTree is checked out.
import sys
# Raw string: the backslashes are literal path separators. Without the r-prefix
# '\p' and '\A' are invalid escape sequences that newer Python versions warn about.
sys.path.insert(0, r'C:\python_projects\AdditiveDecisionTree_project\AdditiveDecisionTree')
from AdditiveDecisionTree import AdditiveDecisionTreeClasssifier, AdditiveDecisionTreeRegressor

np.random.seed(0)

## Methods used to load the toy datasets

In [2]:
# Classification datasets 

def get_iris():
    """Load the iris dataset and wrap it in pandas containers.

    Returns:
        A (features, target) pair: a DataFrame whose columns are the dataset's
        feature names, and a Series holding the target values.
    """
    bunch = load_iris()
    features = pd.DataFrame(bunch.data, columns=bunch['feature_names'])
    target = pd.Series(bunch.target)
    return features, target

def get_breast_cancer():
    """Load the breast cancer dataset as pandas objects.

    Returns:
        The (features, target) pair produced by load_breast_cancer when called
        with return_X_y=True and as_frame=True.
    """
    features, target = load_breast_cancer(as_frame=True, return_X_y=True)
    return features, target

def get_wine():
    """Load the wine dataset as pandas objects.

    Returns:
        The (features, target) pair produced by load_wine when called with
        return_X_y=True and as_frame=True.
    """
    features, target = load_wine(as_frame=True, return_X_y=True)
    return features, target

# Regression datasets

def get_boston():
    """Load the Boston housing dataset and wrap it in pandas containers.

    NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
    1.2, so this loader (and the import it relies on) needs an older release.

    Returns:
        A (features, target) pair: a DataFrame whose columns are the dataset's
        feature names, and a Series holding the target values.
    """
    bunch = load_boston()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    target = pd.Series(bunch.target)
    return features, target

def get_diabetes():
    """Load the diabetes dataset and wrap it in pandas containers.

    Returns:
        A (features, target) pair: a DataFrame whose columns are the dataset's
        feature names, and a Series holding the target values.
    """
    bunch = load_diabetes()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    target = pd.Series(bunch.target)
    return features, target

def get_linnerud():
    """Load the linnerud dataset, keeping only the 'Weight' target column.

    Returns:
        A (features, target) pair: the dataset's data frame and the 'Weight'
        column selected from its target frame.
    """
    bunch = load_linnerud(as_frame=True)
    # The target frame has several columns; only 'Weight' is used as y here.
    return bunch.data, bunch.target['Weight']


## Example using sklearn's Decision Tree and AdditiveDecisionTree on toy datasets

In [3]:
# Note: this provides only an example of using AdditiveDecisionTree and does not properly test its accuracy

def evaluate_model(clf, clf_desc, X_train, X_test, y_train, y_test):
    """Fit a classifier and print its macro-F1 scores and a complexity figure.

    Args:
        clf: Estimator exposing fit() and predict().
        clf_desc: Label printed at the start of the summary line.
        X_train, y_train: Data the estimator is fitted on and scored against.
        X_test, y_test: Held-out data used for the testing score.

    The complexity comes from clf.get_model_complexity() when that method
    exists, otherwise from the length of clf.tree_.feature when the estimator
    has a tree_ attribute, and is 0 if neither is available. Nothing is returned.
    """
    clf.fit(X_train, y_train)

    # Training split first, then the test split.
    train_f1 = f1_score(y_train, clf.predict(X_train), average='macro')
    test_f1 = f1_score(y_test, clf.predict(X_test), average='macro')

    if hasattr(clf, "get_model_complexity"):
        complexity = clf.get_model_complexity()
    elif hasattr(clf, "tree_"):
        complexity = len(clf.tree_.feature)
    else:
        complexity = 0

    print(f"{clf_desc}: Training score: {round(train_f1,2)}, Testing score: {round(test_f1,2)}, Complexity: {complexity}")

def evaluate_dataset(dataset_name, X,y):
    """Compare a standard decision tree with an additive one on a single dataset.

    Prints the dataset name, splits X/y with train_test_split(random_state=0),
    then fits and reports both classifiers through evaluate_model.

    Args:
        dataset_name: Heading printed before the two result lines.
        X: Feature data.
        y: Target labels.

    Returns:
        The fitted AdditiveDecisionTreeClasssifier instance.
    """
    print(f"\n{dataset_name}")

    # train_test_split returns [X_train, X_test, y_train, y_test], which is the
    # positional order evaluate_model expects after the description.
    split = train_test_split(X, y, random_state=0)

    baseline = tree.DecisionTreeClassifier(max_depth=4, random_state=42)
    evaluate_model(baseline, "Standard DT", *split)

    additive = AdditiveDecisionTreeClasssifier(max_depth=4, allow_additive_nodes=True, verbose_level=0)
    evaluate_model(additive, "Additive DT", *split)
    return additive
    
# Run the comparison on each classification dataset in turn. Breast Cancer is
# last, so X, y and adt end up bound to its data and its fitted additive tree.
for dataset_name, loader in (
    ("Iris", get_iris),
    ("Wine", get_wine),
    ("Breast Cancer", get_breast_cancer),
):
    X,y = loader()
    adt = evaluate_dataset(dataset_name, X,y)


Iris
Standard DT: Training score: 1.0, Testing score: 0.97, Complexity: 13
Additive DT: Training score: 0.96, Testing score: 0.88, Complexity: 5

Wine
Standard DT: Training score: 1.0, Testing score: 0.92, Complexity: 13
Additive DT: Training score: 0.97, Testing score: 0.95, Complexity: 7

Breast Cancer
Standard DT: Training score: 0.99, Testing score: 0.92, Complexity: 23
Additive DT: Training score: 0.97, Testing score: 0.91, Complexity: 11


## Summary Output of the AdditiveDecisionTree

In [4]:
# Print the structure of the additive tree returned by the last
# evaluate_dataset call above (the Breast Cancer model).
adt.output_tree()


********************************************************
Generated Tree
********************************************************
# Nodes: 9
Left Chidren: [1, 3, 5, -2, -2, 7, -2, -2, -2]
Right Chidren: [2, 4, 6, -2, -2, 8, -2, -2, -2]
# Rows:  [426, 260, 166, 252, 8, 30, 136, 14, 16]
Features: [7, 20, 23, -100, -2, 21, -2, -2, -2]
Features in additive nodes: [[], [], [], [1, 13], [], [], [], [], []]
Thresholds: [0.04891999997198582, 17.589999198913574, 785.7999877929688, 21.574999809265137, -2, 23.739999771118164, -2, -2, -2]
Depths: [0, 1, 1, 2, 2, 2, 2, 3, 3]
Can split:  [True, True, True, True, True, True, True, True, True]
Class counts: [[159, 267], [13, 247], [146, 20], [7, 245], [6, 2], [13, 17], [133, 3], [0, 14], [13, 3]]
Leaf Class Counts: [[7, 245], [6, 2], [133, 3], [0, 14], [13, 3]]
Node igr:  [0.4156254639152989, 0.2031712696855239, 0.29661687709662865, 0.18239393289682015, -2, 0.43241893359216155, -2, -2, -2]
********************************************************



## Explanations of Predictions

In [5]:
# Ask the fitted tree to explain its predictions for the first five rows of
# X and y, then print each explanation preceded by two blank lines.
exp_arr = adt.get_explanations(X.head(5), y.head(5))
for exp in exp_arr:
    print("\n", exp, sep="\n")



Initial distribution of classes: [0, 1]: [159, 267]


Prediction for row 0: 0 -- Correct
Path: [0, 2, 6]
mean concave points is greater than 0.04891999997198582 (has value: 0.1471) --> (Class distribution: [146, 20]
AND worst area is greater than 785.7999877929688 (has value: 2019.0) --> (Class distribution: [133, 3]
where the majority class is: 0


Prediction for row 1: 0 -- Correct
Path: [0, 2, 6]
mean concave points is greater than 0.04891999997198582 (has value: 0.07017) --> (Class distribution: [146, 20]
AND worst area is greater than 785.7999877929688 (has value: 1956.0) --> (Class distribution: [133, 3]
where the majority class is: 0


Prediction for row 2: 0 -- Correct
Path: [0, 2, 6]
mean concave points is greater than 0.04891999997198582 (has value: 0.1279) --> (Class distribution: [146, 20]
AND worst area is greater than 785.7999877929688 (has value: 1709.0) --> (Class distribution: [133, 3]
where the majority class is: 0


Prediction for row 3: 0 -- Correct
Path: [0, 2, 5

## Example with Regression

In [6]:
# Note: this provides only an example of using AdditiveDecisionTree and does not properly test its accuracy

def evaluate_model(clf, clf_desc, X_train, X_test, y_train, y_test):
    """Fit a regressor and print its train/test MSE and a complexity figure.

    This redefinition replaces the classification evaluate_model defined
    earlier in the notebook; from here on the name refers to this MSE version.

    Args:
        clf: Estimator exposing fit() and predict().
        clf_desc: Label printed at the start of the summary line.
        X_train, y_train: Data the estimator is fitted on and scored against.
        X_test, y_test: Held-out data used for the testing MSE.

    The complexity comes from clf.get_model_complexity() when that method
    exists, otherwise from the length of clf.tree_.feature when the estimator
    has a tree_ attribute, and is 0 if neither is available. Nothing is returned.
    """
    clf.fit(X_train, y_train)

    # Training split first, then the test split.
    train_mse = mean_squared_error(y_train, clf.predict(X_train))
    test_mse = mean_squared_error(y_test, clf.predict(X_test))

    if hasattr(clf, "get_model_complexity"):
        complexity = clf.get_model_complexity()
    elif hasattr(clf, "tree_"):
        complexity = len(clf.tree_.feature)
    else:
        complexity = 0

    print(f"{clf_desc}: Training MSE: {round(train_mse,2)}, Testing MSE: {round(test_mse,2)}, Complexity: {complexity}")

def evaluate_dataset(dataset_name, X,y):
    """Compare a standard regression tree with an additive one on a single dataset.

    This redefinition replaces the classification evaluate_dataset defined
    earlier in the notebook. It prints the dataset name, splits X/y with
    train_test_split(random_state=0), then fits and reports both regressors
    through evaluate_model.

    Args:
        dataset_name: Heading printed before the two result lines.
        X: Feature data.
        y: Target values.

    Returns:
        The fitted AdditiveDecisionTreeRegressor instance.
    """
    print(f"\n{dataset_name}")

    # train_test_split returns [X_train, X_test, y_train, y_test], which is the
    # positional order evaluate_model expects after the description.
    split = train_test_split(X, y, random_state=0)

    baseline = tree.DecisionTreeRegressor(max_depth=4, min_samples_leaf=5, random_state=42)
    evaluate_model(baseline, "Standard DT", *split)

    additive = AdditiveDecisionTreeRegressor(max_depth=4, min_samples_leaf=5, allow_additive_nodes=True, verbose_level=0)
    evaluate_model(additive, "Additive DT", *split)
    return additive
    
# Run the regression comparison on the Boston data; adt is rebound to the
# fitted additive regressor returned by evaluate_dataset.
X,y = get_boston()
adt = evaluate_dataset("Boston", X,y)

# Alternative regression datasets, currently disabled. Uncommenting either
# pair would rebind X, y and adt to that dataset instead.
# X,y = get_diabetes()
# adt = evaluate_dataset("Diabetes", X,y)

# X,y = get_linnerud()
# adt = evaluate_dataset("Linnerud", X,y)


Boston
Standard DT: Training MSE: 11.39, Testing MSE: 29.76, Complexity: 29
Additive DT: Training MSE: 9.35, Testing MSE: 27.93, Complexity: 40


In [7]:
# Print the structure of the additive regression tree returned by the
# evaluate_dataset call in the previous cell.
adt.output_tree()


********************************************************
Generated Tree
********************************************************
# Nodes: 19
Left Chidren: [1, 3, 5, 7, -2, 9, 11, -2, -2, 13, -2, 15, 17, -2, -2, -2, -2, -2, -2]
Right Chidren: [2, 4, 6, 8, -2, 10, 12, -2, -2, 14, -2, 16, 18, -2, -2, -2, -2, -2, -2]
# Rows:  [379, 135, 244, 112, 23, 121, 123, 58, 54, 108, 13, 64, 59, 5, 103, 13, 51, 27, 32]
Features: [12, 5, 12, 5, -100, 5, 0, -100, -100, 2, -100, 4, 12, -2, -2, -2, -2, -2, -2]
Features in additive nodes: [[], [], [], [], [0, 5, 10, 4, 8, 12], [], [], [6, 5, 0, 7, 12, 2], [4, 6, 7], [], [4, 6, 7, 0, 11, 10], [], [], [], [], [], [], [], []]
Thresholds: [8.130000114440918, 7.434999942779541, 15.0, 6.6565001010894775, 5.204999923706055, 6.60450005531311, 5.769209861755371, 16.570000171661377, 91.29999923706055, 2.850000023841858, 16.899999618530273, 0.5309999883174896, 20.3149995803833, -2, -2, -2, -2, -2, -2]
Depths: [0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4

## Example Tuning Hyperparameters with a Cross-Validated Randomized Search

In [8]:
# Note: this can take several minutes to execute.

# Reload the Boston data and hold out a test split for the final score.
X,y = get_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Candidate values for each hyperparameter; the search samples combinations
# from these rather than trying all of them.
parameters = {
    'min_samples_split': (5,10,25,50), 
    'min_samples_leaf': (5,10,15),
    'max_depth': (4,5,6,7),
    'allow_additive_nodes': (True, False),
    'max_added_splits_per_node': (2,3,4,5,10)
}

estimator = AdditiveDecisionTreeRegressor(max_depth=4, min_samples_leaf=5)
# n_iter=100 candidates are sampled. random_state is not passed, so the sample
# is presumably drawn from NumPy's global RNG (seeded in the first cell);
# re-running this cell on its own may therefore try different candidates.
gs_estimator = RandomizedSearchCV(estimator, parameters, scoring='neg_mean_squared_error',n_iter=100)
gs_estimator.fit(X_train, y_train)
y_pred = gs_estimator.predict(X_test)
# Pass (y_true, y_pred) in the documented order, matching the evaluate_model
# calls earlier in the notebook. The MSE value itself is unchanged.
test_score = mean_squared_error(y_test, y_pred)

print("test_score: ", test_score)
print("best estimator: ", gs_estimator.best_estimator_)

test_score:  26.448151685940985
best estimator:  min_samples_split: 5, min_samples_leaf: 5, max_depth: 5, allow_additive_nodes: True, max_added_splits_per_node: 3
