# Sample Usage of MetaTree

In [1]:
import sys
sys.path.append('..')

from metatree.model_metatree import LlamaForMetaTree as MetaTree
from metatree.decision_tree_class import DecisionTree, DecisionTreeForest
from metatree.run_train import preprocess_dimension_patch
from transformers import AutoConfig

from sklearn.metrics import accuracy_score
import sklearn

import torch
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import imodels
import random

model_name_or_path = "yzhuang/MetaTree"

config = AutoConfig.from_pretrained(model_name_or_path)
model = MetaTree.from_pretrained(
    model_name_or_path,
    config=config,
)   
decision_tree_forest = DecisionTreeForest()

ensemble_size = 1
seed = 42

  from .autonotebook import tqdm as notebook_tqdm


### Generate the decision tree with MetaTree

We use a dataset from imodels (https://github.com/csinva/imodels) as our example, to install imodels, run *pip install imodels*

In [2]:
X, y, feature_names = imodels.get_clean_dataset('fico', data_source='imodels')

print("Dataset Shapes X={}, y={}, Num of Classes={}".format(X.shape, y.shape, len(set(y))))

train_idx, test_idx = sklearn.model_selection.train_test_split(range(X.shape[0]), test_size=0.3, random_state=seed)

# Dimension 
random.seed(seed)
feature_idx = np.random.choice(X.shape[1], 10, replace=False)
X = X[:, feature_idx]

test_X, test_y = X[test_idx], y[test_idx]

for i in range(ensemble_size):
    # Sample Train and Test Data
    random.seed(seed+i+1)
    subset_idx = random.sample(train_idx, 256)
    train_X, train_y = X[subset_idx], y[subset_idx]
    

    input_x = torch.tensor(train_X, dtype=torch.float32)
    input_y = torch.nn.functional.one_hot(torch.tensor(train_y)).float()

    batch = {"input_x": input_x, "input_y": input_y, "input_y_clean": input_y}
    batch = preprocess_dimension_patch(batch, n_feature=10, n_class=10)
    model.depth = 2
    outputs = model.generate_decision_tree(batch['input_x'], batch['input_y'], depth=model.depth)
    decision_tree_forest.add_tree(DecisionTree(auto_dims=outputs.metatree_dimensions, auto_thresholds=outputs.tentative_splits, input_x=batch['input_x'], input_y=batch['input_y'], depth=model.depth))

    print("Decision Tree Features: ", [x.argmax(dim=-1) for x in outputs.metatree_dimensions])
    print("Decision Tree Threasholds: ", outputs.tentative_splits)

fetching fico from imodels
Dataset Shapes X=(10459, 23), y=(10459,), Num of Classes=2
Decision Tree Features:  [tensor([4]), tensor([8]), tensor([7])]
Decision Tree Threasholds:  [tensor([[72.5000]]), tensor([[64.5000]]), tensor([[97.5000]])]


### Evaluation on the test set

In [3]:
tree_pred = decision_tree_forest.predict(torch.tensor(test_X, dtype=torch.float32))

accuracy = accuracy_score(test_y, tree_pred.argmax(dim=-1).squeeze(0))
print("MetaTree Test Accuracy: ", accuracy)

MetaTree Test Accuracy:  0.6912045889101338


### Fitting CART tree as a comparison

In [4]:
cart_ensemble = []

for i in range(ensemble_size):
    random.seed(seed+i+1)
    subset_idx = random.sample(train_idx, 256)
    train_X, train_y = X[subset_idx], y[subset_idx]
    
    clf = sklearn.tree.DecisionTreeClassifier(max_depth=2, random_state=seed+i+1)
    clf.fit(train_X, train_y)
    cart_ensemble.append(clf)

overall_pred = np.zeros((test_X.shape[0], len(set(test_y))))
for clf in cart_ensemble:
    overall_pred += clf.predict_proba(test_X)
overall_pred = overall_pred / len(cart_ensemble)

accuracy = accuracy_score(test_y, overall_pred.argmax(axis=-1))
print("CART Test Accuracy: ", accuracy)

CART Test Accuracy:  0.6883365200764818
