# Decision Trees

 Name | F1 | F2 | Output|
 | - | - | - | -|
  Ex1 | b | b  | +|
  Ex2 | c | b |  +|
  Ex3 | b|  c  | +|
  Ex4 | a | b  | -|
  Ex5 | c | a |  -|


1. What score would the information gain calculation assign to each of the features?  Be sure to show all your work.

Solution: 

$IG(F1) = \frac{2}{5}\cdot 0 + \frac{2}{5} \cdot 1 + \frac{1}{5} \cdot 0 = 0.4$

$IG(F2) = \frac{3}{5} \cdot I(\frac{2}{3}, \frac{1}{3}) + \frac{1}{5} \cdot 0 + \frac{1}{5} \cdot 0 \approx 0.92 \cdot 0.6 = .552$

2. What would be the recursive calls be (show the specific arguments; use the examples’ names for simplicity)?

Solution:

First, split into 3 groups based on $F1$:

Group 1 - {Ex1, Ex3} (Base Case)

Group 2 - {Ex2, Ex5} (Split again based on $F2$, that takes us to base case for those two nodes)

Group 3 - {Ex4} (Base Case)



In [2]:
import numpy as np

# calculation for I(2/3, 1/3)
-(np.log2(2/3)*(2/3) + np.log2(1/3)*(1/3))

0.9182958340544896

## HW2a - Decision Tree ID3/C4.5 Algorithm (Alpaydin Fig 9.3)

- Implement the earlier-listed ID3 algorithm ten times, train on 9 folds and test on one left out
- Print the 10 learned trees and the min, max, and mean of the 10 testset accuracies

In [3]:
from collections import Counter
import pandas as pd

x = Counter(np.array([1, 1, 2, 3]))
x.get(5, None)

In [4]:
import pandas as pd
import numpy as np
from typing import List

class TreeNode:
    def __init__(self, feature_idx=None):
        self.mapping = {}
        self.is_categorical = None
        self.feature_idx = feature_idx
        self.numeric_split = None
        self.prediction = None

    def predict(self, example: np.array):
        if len(self.mapping) == 0:
            # print(f'raw prediction: {self.prediction}')
            return round(self.prediction)
        if self.is_categorical:
            try:
                return self.mapping[example[self.feature_idx]]
            except:
                # print('missing value during evaluation')
                return self.prediction
        # feature is numeric
        if example[self.feature_idx] <= self.numeric_split:
            return self.mapping['left']
        return self.mapping['right']


    def __str__(self, level=0):
        if self.mapping == {}:
            return repr(self)
        ret = "\t"*level+f'{repr(self)}\n'
        for feature, child in self.mapping.items():
            ret += ("\t"*(level + 1) + f"value {feature} ---" + child.__str__(level+1) + '\n')
        return ret

    def __repr__(self):
        return f'feature idx: {self.feature_idx} is_categorical: {self.is_categorical} numeric_split: {self.numeric_split} prediction: {self.prediction}'


class DecisionTreeClassifier:
    def __init__(self, theta_i=0.5, num_classes=None, num_features=None, max_best_f=5):
        self.theta_i = theta_i
        self.tree = None
        self.num_classes = num_classes
        self.cat_feature_idx = None
        self.count = 0
        self.num_features = num_features
        self.best_features = []
        self.max_best_f = max_best_f
        self.first_split = True

    def get_class_counts(self, y: np.array):
        assert len(y.shape) == 1 # y must be a 1-d array of outputs 0 ... self.num_classes
        assert self.num_classes is not None
        counts = np.zeros(self.num_classes)
        for element in y:
            counts[element] += 1
        
        return counts


    def node_entropy(self, y: np.array): # note that the maximum entropy should be -log2(1/num_classes), not necessarily 1
        counts = self.get_class_counts(y)
        norm_counts = counts / len(y)
        unsummed_entropy = norm_counts * np.log2(norm_counts)
        unsummed_entropy = np.nan_to_num(unsummed_entropy)
        return - np.sum(unsummed_entropy)


    def split_entropy(self, branches: List[List], y: np.array):
        entropy = 0
        # print(branches, y)
        for branch in branches:
            entropy += self.node_entropy(y[branch]) * len(branch) / len(y)
        return entropy


    def split_categorical_feature(self, X_feat: np.array, y: np.array):
        feat_df = pd.DataFrame({'X_feat': X_feat, 'y': y})
        group_indices = {}
        for group_name, group in feat_df.groupby('X_feat'):
            group_indices[group_name] = group.index.to_numpy()
        return group_indices


    def split_attribute(self, X: np.array, y: np.array):
        min_entropy = np.inf
        best_f = None
        is_categorical = False

        if self.num_features is not None:
            np.random.seed()
            features_to_use = np.random.choice(range(X.shape[1]), size=self.num_features,replace=False)
            # print(f'features to use: {features_to_use}')
            # print(f'features to use: {self.features_to_use}')
        else:
            features_to_use = range(X.shape[1])
        for i in features_to_use:
            if i in self.cat_feature_idx:
                feature_partition = self.split_categorical_feature(X[:, i], y)
                # print(f'feature partition: {feature_partition}')
                entropy = self.split_entropy(feature_partition.values(), y)
                if entropy < min_entropy:
                    min_entropy = entropy
                    best_f = i
                    best_mapping = feature_partition
                    is_categorical = True
                    # print(f'best f: {best_f}, entropy: {entropy}, feature_partition: {feature_partition}')
                    cutoff = None
            else:
                # print('in else')
                sort_permutation = X[:, i].argsort()
                for j in range(1, len(y)):
                    entropy = self.split_entropy([sort_permutation[:j], sort_permutation[j:]], y)
                    # print(f'non cat entropy: {entropy}')
                    if entropy < min_entropy:
                        min_entropy = entropy
                        best_mapping = {'left': sort_permutation[:j], 'right': sort_permutation[j:]}
                        best_f = i
                        is_categorical = False
                        cutoff = (X[:, i][sort_permutation[j-1]] + X[:, i][sort_permutation[j]]) / 2 # mean of the two points

                        # print(f'raw: {X[:, i]}')
                        # print(f'best mapping: {best_mapping}')
                        # print(f'bestf: {i}')
                        # print(f'cutoff: {cutoff}')
        
        return best_mapping, best_f, is_categorical, cutoff




    def generate_tree(self, X: np.array, y: np.array, current_node: TreeNode):
        if self.node_entropy(y) <= self.theta_i:
            current_node.prediction = len(y[y == 1]) / len(y)
            return
        best_mapping, best_f, is_categorical, cutoff = self.split_attribute(X, y)
        self.best_features.append(best_f)
        # print(self.best_features[self.max_best_f:])
        # print(self.best_features[self.max_best_f:])
        # if len(self.best_features) > self.max_best_f and len(np.unique(self.best_features[-self.max_best_f:])) == 1:
        #     current_node.prediction = len(y[y == 1]) / len(y)
        #     return    
        for branch_name, branch_idx in best_mapping.items():
            new_node = TreeNode()
            current_node.feature_idx = best_f
            current_node.is_categorical = is_categorical
            current_node.numeric_split = cutoff
            current_node.prediction = len(y[y == 1]) / len(y)
            X_branch = X[branch_idx]
            y_branch = y[branch_idx]
            current_node.mapping[branch_name] = new_node

            # if current_node.is_categorical:
            #     print(f'bestf: {best_f}')
            #     print(X, y)
            self.generate_tree(X_branch, y_branch, new_node)
        
    def fit(self, X: np.array, y: np.array, cat_feature_idx: np.array):
        self.num_classes = len(np.unique(y))
        self.cat_feature_idx = cat_feature_idx
        root = TreeNode()
        self.tree = root
        self.generate_tree(X, y, root)

    def predict(self, X_test: np.array):
        preds = []
        for row in X_test:
            predicted_early = False
            current_node = self.tree
            while len(current_node.mapping) > 0:
                current_node = current_node.predict(row)
                if type(current_node) == float:
                    predicted_early = True
                    prediction = current_node
                    break
            if not predicted_early:
                prediction = current_node.predict(row)
            preds.append(prediction)

        return np.array(preds)

In [5]:
# test 1: confirm classifier works with mock data

X = np.array([[1,1.5], [2,2.3], [5, 5.5]])
y = np.array([1,0,1])

c = DecisionTreeClassifier()
c.fit(X, y, cat_feature_idx=[0])

assert np.array_equal(c.predict(X), y)



In [6]:
# test 2: confirm classifier works with mock data

X = np.array([[6.50, 1.0, 3], [6.9, 1.0, 3]])
y = np.array([1, 0])

c = DecisionTreeClassifier(num_features=2)
c.fit(X, y, cat_feature_idx=[1])



In [7]:
# print tree in ascii

print(str(c.tree))

feature idx: 2 is_categorical: False numeric_split: 3.0 prediction: 0.5
	value left ---feature idx: None is_categorical: None numeric_split: None prediction: 1.0
	value right ---feature idx: None is_categorical: None numeric_split: None prediction: 0.0



# CV

In [8]:
from typing import List
from tqdm import trange

def accuracy_score(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return (y_true == y_pred).sum() / len(y_true)

def load_one_file(filename: str):
    data = np.load(filename, allow_pickle=True)
    X = data['x']
    y = data['y']
    example_names = data['example_names']
    return X, y, example_names

def load_folds(folds: List[int]):
    all_X = []
    all_y = []
    all_example_names = []
    for fold in folds:
        filename = f'/Users/brwang/Desktop/ml_class/hw0/data/heart_fold{fold}.npz'
        X, y, example_names = load_one_file(filename)
        all_X.append(X)
        all_y.append(y)
        all_example_names.append(example_names)

    X_folds = np.concatenate(all_X, axis=0)
    y_folds = np.concatenate(all_y)
    example_names_folds = np.concatenate(all_example_names)
        
    return X_folds, y_folds, example_names_folds

X, y, example_names = load_folds([0, 1])
X.shape, y.shape, example_names.shape

def cv(num_folds=10):
    cv_results = []
    ks = []
    for test_fold in trange(num_folds):
        X_test, y_test, example_names_test = load_folds([test_fold])
        train_folds = [fold for fold in range(num_folds) if fold != test_fold]
        clf = DecisionTreeClassifier(num_classes=2, num_features=5)
        X_train, y_train, example_names_train = load_folds(train_folds)
        clf.fit(X_train, y_train, cat_feature_idx=[1, 2, 5, 6, 8, 10, 11, 12])
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        cv_results.append(acc)
    return cv_results

cv_scores = cv()

print(f'min cv score: {np.min(cv_scores)}')
print(f'average cv score: {np.mean(cv_scores)}')
print(f'max cv score: {np.max(cv_scores)}')

100%|██████████| 10/10 [00:05<00:00,  2.00it/s]

min cv score: 0.7
average cv score: 0.7679310344827586
max cv score: 0.8





In [9]:
cv_scores

[0.7666666666666667,
 0.7333333333333333,
 0.7333333333333333,
 0.7,
 0.8,
 0.7666666666666667,
 0.8,
 0.7931034482758621,
 0.7931034482758621,
 0.7931034482758621]

In [10]:
# debug using 1 fold

X_test, y_test, example_names_test = load_folds([0])
train_folds = [fold for fold in range(10) if fold != 0]
print(train_folds)
# tune_results = get_tune_results(train_folds)
# print(f'tune accuracy matrix, fold {test_fold}: \n{tune_results}')
# k, mean_results = get_optimal_k(tune_results)
# ks.append(k)
# print(f'average accuracy for each k, fold {test_fold}: \n{mean_results}')
# print(f'optimal k for fold {test_fold}: {k}')
clf = DecisionTreeClassifier(num_classes=2, theta_i=0)
X_train, y_train, example_names_train = load_folds(train_folds)
clf.fit(X_train, y_train, cat_feature_idx=[1, 2, 5, 6, 8, 10, 11, 12])
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc

[1, 2, 3, 4, 5, 6, 7, 8, 9]




0.7333333333333333

In [11]:
# printing tree for fold 0

print(clf.tree)

feature idx: 12 is_categorical: True numeric_split: None prediction: 0.47191011235955055
	value 3.0 ---	feature idx: 2 is_categorical: True numeric_split: None prediction: 0.22377622377622378
		value 1.0 ---		feature idx: 0 is_categorical: False numeric_split: 62.5 prediction: 0.2727272727272727
			value left ---			feature idx: 0 is_categorical: False numeric_split: 55.0 prediction: 0.42857142857142855
				value left ---feature idx: None is_categorical: None numeric_split: None prediction: 0.0
				value right ---				feature idx: 1 is_categorical: True numeric_split: None prediction: 0.75
					value 0.0 ---feature idx: None is_categorical: None numeric_split: None prediction: 0.0
					value 1.0 ---feature idx: None is_categorical: None numeric_split: None prediction: 1.0


			value right ---feature idx: None is_categorical: None numeric_split: None prediction: 0.0

		value 2.0 ---		feature idx: 0 is_categorical: False numeric_split: 56.5 prediction: 0.08333333333333333
			value left ---

## HW 2B

1. For each of your ten Heart Disease train sets, create 101 ‘bootstrap’ copies
(if |trainSet| = N, randomly selection with replacement N examples). 
Write these to disk (at least if your ID3 code reads examples from disk)
2. Extend HW2a so that ID3 takes an optional parameter, 
i, the number of candidate features to consider 
do runs with i in {1, 2, 3, 4, 5, 7} and use tuneset to choose the best i per test fold
3. Apply forests to test set, predicting POSITIVE if at least half the trees do
4. Plot mean testset accuracy as a function of size of forest (compare to kNN)
       consider |forest| from 1 to 101   [aside: see ‘early stopping’ later]
optional: learn decision STUMPS (one interior node) and see how they do
                        Or limit each tree depth to, say, 3


In [12]:
from collections import Counter
from tqdm import tqdm

class RandomForestClassifier:
    def __init__(self, num_candidate_features=None, random_seeds=[], trees=None):
        self.num_candidate_features = num_candidate_features
        self.random_seeds = random_seeds
        if trees is not None:
            self.clfs = trees
        else:
            self.clfs = []


    def bootstrap(self, X: np.array, y: np.array, random_seed):
        np.random.seed(random_seed)
        # print(Counter(np.random.randint(X.shape[0], size=X.shape[0])))
        indices = np.random.randint(X.shape[0], size=X.shape[0])
        X_b = X[indices, :]
        y_b = y[indices]
        return X_b, y_b

    def fit(self, X: np.array, y: np.array, cat_feature_idx):
        assert X.shape[0] == y.shape[0]
        for seed in self.random_seeds:
            X_b, y_b = self.bootstrap(X, y, seed)
            if self.num_candidate_features:
                clf = DecisionTreeClassifier(num_features=self.num_candidate_features)
            else:
                clf = DecisionTreeClassifier(num_features=X.shape[0])
            clf.fit(X_b, y_b, cat_feature_idx)
            self.clfs.append(clf)

    def predict(self, X_test: np.array):
        votes = []
        for clf in self.clfs:
            vote = clf.predict(X_test)
            votes.append(vote)

        majority_votes = []
        for all_votes in np.stack(votes).T:
            majority_vote = Counter(all_votes).most_common(1)[0][0]
            majority_votes.append(majority_vote)

        # return majority vote
        return np.array(majority_votes)

In [13]:
# debug using 1 fold

X_test, y_test, example_names_test = load_folds([0])
train_folds = [fold for fold in range(10) if fold != 0]
# tune_results = get_tune_results(train_folds)
# print(f'tune accuracy matrix, fold {test_fold}: \n{tune_results}')
# k, mean_results = get_optimal_k(tune_results)
# ks.append(k)
# print(f'average accuracy for each k, fold {test_fold}: \n{mean_results}')
# print(f'optimal k for fold {test_fold}: {k}')
clf = RandomForestClassifier(num_candidate_features=5, random_seeds=list(range(101)))
X_train, y_train, example_names_train = load_folds(train_folds)
clf.fit(X_train, y_train, cat_feature_idx=[1, 2, 5, 6, 8, 10, 11, 12])
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc



0.8333333333333334

In [14]:
from tqdm import trange
import time
import warnings

# supress warnings
warnings.filterwarnings("ignore")

def get_tune_results(train_tune_folds: List[int], k_values=[1, 2, 3, 4, 5, 7], random_seeds=None):
    results_dict = {k: [] for k in k_values}
    for tune_fold in range(len(train_tune_folds)):
        X_tune, y_tune, example_names_tune = load_folds([tune_fold])
        train_folds = [fold for fold in train_tune_folds if fold != tune_fold]
        X_train, y_train, example_names_train = load_folds(train_folds)

        for k in k_values:
            clf = RandomForestClassifier(num_candidate_features=k, random_seeds=random_seeds)
            clf.fit(X_train, y_train, cat_feature_idx=[1, 2, 5, 6, 8, 10, 11, 12])
            y_pred = clf.predict(X_tune)
            score = accuracy_score(y_tune, y_pred)
            results_dict[k].append(score)
        
    return pd.DataFrame(results_dict)

def get_optimal_k(tune_results: pd.DataFrame):
    return tune_results.columns[tune_results.mean().argmax()], tune_results.mean()


def cv(num_folds=10, random_seeds=list(range(101))):
    cv_results = []
    ks = []
    for test_fold in range(num_folds):
        print(f'starting fold {test_fold}')
        start = time.time()
        X_test, y_test, example_names_test = load_folds([test_fold])
        train_folds = [fold for fold in range(num_folds) if fold != test_fold]
        tune_results = get_tune_results(train_folds, random_seeds=random_seeds)
        print(f'tune accuracy matrix, fold {test_fold}: \n{tune_results}')
        k, mean_results = get_optimal_k(tune_results)
        ks.append(k)
        print(f'average accuracy for each k, fold {test_fold}: \n{mean_results}')
        print(f'optimal k for fold {test_fold}: {k}')
        clf = RandomForestClassifier(num_candidate_features=k, random_seeds=random_seeds)
        X_train, y_train, example_names_train = load_folds(train_folds)
        clf.fit(X_train, y_train, cat_feature_idx=[1, 2, 5, 6, 8, 10, 11, 12])
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        cv_results.append(acc)
        end = time.time()
        print(f'time for fold: {end - start}')

    print(ks)
    return cv_results

cv_scores = cv()
print(f'average cv score: {np.mean(cv_scores)}')

starting fold 0
tune accuracy matrix, fold 0: 
          1         2         3         4         5         7
0  0.833333  0.866667  0.833333  0.800000  0.800000  0.833333
1  0.833333  0.833333  0.866667  0.833333  0.800000  0.833333
2  0.933333  0.900000  0.900000  0.966667  0.933333  0.933333
3  0.666667  0.700000  0.600000  0.633333  0.633333  0.666667
4  0.866667  0.900000  0.800000  0.866667  0.833333  0.833333
5  0.833333  0.766667  0.833333  0.766667  0.800000  0.766667
6  0.833333  0.733333  0.800000  0.766667  0.766667  0.766667
7  0.793103  0.793103  0.793103  0.793103  0.793103  0.793103
8  0.758621  0.827586  0.793103  0.793103  0.793103  0.793103
average accuracy for each k, fold 0: 
1    0.816858
2    0.813410
3    0.802171
4    0.802171
5    0.794764
7    0.802171
dtype: float64
optimal k for fold 0: 1
time for fold: 1396.8318231105804
starting fold 1
tune accuracy matrix, fold 1: 
          1         2         3         4         5         7
0  0.833333  0.833333  0.8666

In [16]:
# redundant, but quick
def short_cv(num_folds=10, random_seeds=None, optimal_k=[1, 3, 1, 5, 2, 3, 2, 1, 2, 1]):
    num_trees_cv_map = {}
    for num_trees in trange(1, 102, 5):
        cv_results = []
        for test_fold in range(num_folds):
            X_test, y_test, example_names_test = load_folds([test_fold])
            train_folds = [fold for fold in range(num_folds) if fold != test_fold]
            clf = RandomForestClassifier(num_candidate_features=optimal_k[test_fold], random_seeds=list(range(num_trees)))
            X_train, y_train, example_names_train = load_folds(train_folds)
            clf.fit(X_train, y_train, cat_feature_idx=[1, 2, 5, 6, 8, 10, 11, 12])
            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            cv_results.append(acc)
        num_trees_cv_map[num_trees] = np.mean(cv_results)

    return num_trees_cv_map

cv_map = short_cv()

100%|██████████| 21/21 [35:09<00:00, 100.45s/it]


In [20]:
import plotly.express as px

px.line(x=list(cv_map.keys()), y=list(cv_map.values()))