# Decision Trees

 Name | F1 | F2 | Output|
 | - | - | - | -|
  Ex1 | b | b  | +|
  Ex2 | c | b |  +|
  Ex3 | b|  c  | +|
  Ex4 | a | b  | -|
  Ex5 | c | a |  -|


1. What score would the information gain calculation assign to each of the features?  Be sure to show all your work.

Solution: 

$IG(F1) = \frac{2}{5}\cdot 0 + \frac{2}{5} \cdot 1 + \frac{1}{5} \cdot 0 = 0.4$

$IG(F2) = \frac{3}{5} \cdot I(\frac{2}{3}, \frac{1}{3}) + \frac{1}{5} \cdot 0 + \frac{1}{5} \cdot 0 \approx 0.92 \cdot 0.6 = .552$

2. What would be the recursive calls be (show the specific arguments; use the examples’ names for simplicity)?

Solution:

First, split into 3 groups based on $F1$:

Group 1 - {Ex1, Ex3} (Base Case)

Group 2 - {Ex2, Ex5} (Split again based on $F2$, that takes us to base case for those two nodes)

Group 3 - {Ex4} (Base Case)



In [4]:
import numpy as np

# calculation for I(2/3, 1/3)
-(np.log2(2/3)*(2/3) + np.log2(1/3)*(1/3))

0.9182958340544896

## Decision Tree ID3/C4.5 Algorithm (Alpaydin Fig 9.3)

In [5]:
from collections import Counter
import pandas as pd

x = Counter(np.array([1, 1, 2, 3]))
x.get(5, None)

In [6]:
df = pd.DataFrame({'a': [1,1,2,1,3,2,4], 'b': [1,1,1,1,1,1,1]})
for _, group in df.groupby('a'):
    print(group.index)

Int64Index([0, 1, 3], dtype='int64')
Int64Index([2, 5], dtype='int64')
Int64Index([4], dtype='int64')
Int64Index([6], dtype='int64')


In [278]:
np.array([1,2])[1:]

array([2])

In [325]:
import pandas as pd
from typing import List

class TreeNode:
    def __init__(self, feature_idx=None):
        self.mapping = {}
        self.is_categorical = None
        self.feature_idx = feature_idx
        self.numeric_split = None
        self.prediction = None

    def predict(self, example: np.array):
        if self.prediction is not None:
            return self.prediction
        if self.is_categorical:
            try:
                return self.mapping[example[self.feature_idx]]
            except:
                print('missing value during evaluation')
                return np.random.choice(list(self.mapping.values()))
        # feature is numeric
        if example[self.feature_idx] <= self.numeric_split:
            return self.mapping['left']
        return self.mapping['right']


    def __str__(self, level=0):
        if self.mapping == {}:
            return repr(self)
        ret = "\t"*level+f'{repr(self)}\n'
        for feature, child in self.mapping.items():
            ret += ("\t"*(level + 1) + f"value {feature} ---" + child.__str__(level+1) + '\n')
        return ret

    def __repr__(self):
        return f'feature idx: {self.feature_idx} is_categorical: {self.is_categorical} numeric_split: {self.numeric_split} prediction: {self.prediction}'


class DecisionTreeClassifier:
    def __init__(self, theta_i=0.5, num_classes=None):
        self.theta_i = theta_i
        self.tree = None
        self.num_classes = num_classes
        self.cat_feature_idx = None
        self.count = 0

    def get_class_counts(self, y: np.array):
        assert len(y.shape) == 1 # y must be a 1-d array of outputs 0 ... self.num_classes
        assert self.num_classes is not None
        counts = np.zeros(self.num_classes)
        for element in y:
            counts[element] += 1
        
        return counts


    def node_entropy(self, y: np.array): # note that the maximum entropy should be -log2(1/num_classes), not necessarily 1
        counts = self.get_class_counts(y)
        norm_counts = counts / len(y)
        unsummed_entropy = norm_counts * np.log2(norm_counts)
        unsummed_entropy = np.nan_to_num(unsummed_entropy)
        return - np.sum(unsummed_entropy)


    def split_entropy(self, branches: List[List], y: np.array):
        entropy = 0
        # print(branches, y)
        for branch in branches:
            entropy += self.node_entropy(y[branch]) * len(branch) / len(y)
        return entropy


    def split_categorical_feature(self, X_feat: np.array, y: np.array):
        feat_df = pd.DataFrame({'X_feat': X_feat, 'y': y})
        group_indices = {}
        for group_name, group in feat_df.groupby('X_feat'):
            group_indices[group_name] = group.index.to_numpy()
        return group_indices


    def split_attribute(self, X: np.array, y: np.array):
        min_entropy = np.inf
        best_f = None
        is_categorical = False
        for i in range(X.shape[1]):
            if i in self.cat_feature_idx:
                feature_partition = self.split_categorical_feature(X[:, i], y)
                # print(f'feature partition: {feature_partition}')
                entropy = self.split_entropy(feature_partition.values(), y)
                if entropy < min_entropy:
                    min_entropy = entropy
                    best_f = i
                    best_mapping = feature_partition
                    is_categorical = True
                    # print(f'best f: {best_f}, entropy: {entropy}, feature_partition: {feature_partition}')
                    cutoff = None
            else:
                # print('in else')
                sort_permutation = X[:, i].argsort()
                for j in range(1, len(y)):
                    entropy = self.split_entropy([sort_permutation[:j], sort_permutation[j:]], y)
                    # print(f'non cat entropy: {entropy}')
                    if entropy < min_entropy:
                        min_entropy = entropy
                        best_mapping = {'left': sort_permutation[:j], 'right': sort_permutation[j:]}
                        best_f = i
                        is_categorical = False
                        cutoff = (X[:, i][sort_permutation[j-1]] + X[:, i][sort_permutation[j]]) / 2 # mean of the two points

                        # print(f'raw: {X[:, i]}')
                        # print(f'best mapping: {best_mapping}')
                        # print(f'cutoff: {cutoff}')
        
        return best_mapping, best_f, is_categorical, cutoff




    def generate_tree(self, X: np.array, y: np.array, current_node: TreeNode):
        # self.count += 1
        # if self.count > 10:
        #     return

        if self.node_entropy(y) <= self.theta_i:
            majority_class = np.argmax(np.bincount(y))
            current_node.prediction = majority_class
            return
        best_mapping, best_f, is_categorical, cutoff = self.split_attribute(X, y)
        for branch_name, branch_idx in best_mapping.items():
            new_node = TreeNode()
            current_node.feature_idx = best_f
            current_node.is_categorical = is_categorical
            current_node.numeric_split = cutoff

            X_branch = X[branch_idx]
            y_branch = y[branch_idx]
            current_node.mapping[branch_name] = new_node

            # if current_node.is_categorical:
            #     print(f'bestf: {best_f}')
            #     print(X, y)
            self.generate_tree(X_branch, y_branch, new_node)
        
    def fit(self, X: np.array, y: np.array, cat_feature_idx: np.array):
        self.num_classes = len(np.unique(y))
        self.cat_feature_idx = cat_feature_idx
        root = TreeNode()
        self.tree = root
        self.generate_tree(X, y, root)

    def predict(self, X_test: np.array):
        preds = []
        for row in X_test:
            current_node = self.tree
            while len(current_node.mapping) > 0:
                current_node = current_node.predict(row)
            prediction = current_node.predict(row)
            preds.append(prediction)

        return np.array(preds)

In [331]:
# test 1: confirm classifier works with mock data

X = np.array([[1,1.5], [2,2.3], [5, 5.5]])
y = np.array([1,0,1])

c = DecisionTreeClassifier()
c.fit(X, y, cat_feature_idx=[0])

assert np.array_equal(c.predict(X), y)



In [333]:
# test 2: confirm classifier works with mock data

X = np.array([[6.50, 1.0, 3], [6.9, 1.0, 3]])
y = np.array([1, 0])

c = DecisionTreeClassifier()
c.fit(X, y, cat_feature_idx=[1])



In [334]:
# print tree in ascii

print(str(c.tree))

feature idx: 0 is_categorical: False numeric_split: 6.7 prediction: None
	value left ---feature idx: None is_categorical: None numeric_split: None prediction: 1
	value right ---feature idx: None is_categorical: None numeric_split: None prediction: 0



# CV

In [335]:
from typing import List
from tqdm import trange

def accuracy_score(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return (y_true == y_pred).sum() / len(y_true)

def load_one_file(filename: str):
    data = np.load(filename, allow_pickle=True)
    X = data['x']
    y = data['y']
    example_names = data['example_names']
    return X, y, example_names

def load_folds(folds: List[int]):
    all_X = []
    all_y = []
    all_example_names = []
    for fold in folds:
        filename = f'/Users/brwang/Desktop/ml_class/hw0/data/heart_fold{fold}.npz'
        X, y, example_names = load_one_file(filename)
        all_X.append(X)
        all_y.append(y)
        all_example_names.append(example_names)

    X_folds = np.concatenate(all_X, axis=0)
    y_folds = np.concatenate(all_y)
    example_names_folds = np.concatenate(all_example_names)
        
    return X_folds, y_folds, example_names_folds

X, y, example_names = load_folds([0, 1])
X.shape, y.shape, example_names.shape

def cv(num_folds=10):
    cv_results = []
    ks = []
    for test_fold in trange(num_folds):
        X_test, y_test, example_names_test = load_folds([test_fold])
        train_folds = [fold for fold in range(num_folds) if fold != test_fold]
        clf = DecisionTreeClassifier(num_classes=2)
        X_train, y_train, example_names_train = load_folds(train_folds)
        clf.fit(X_train, y_train, cat_feature_idx=[1, 2, 5, 6, 8, 10, 11, 12])
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        cv_results.append(acc)
    return cv_results

cv_scores = cv()
print(f'average cv score: {np.mean(cv_scores)}')

 20%|██        | 2/10 [00:01<00:06,  1.31it/s]

missing value during evaluation


 30%|███       | 3/10 [00:02<00:05,  1.18it/s]

missing value during evaluation


100%|██████████| 10/10 [00:08<00:00,  1.17it/s]

average cv score: 0.7506896551724138





In [336]:
# debug using 1 fold

X_test, y_test, example_names_test = load_folds([0])
train_folds = [fold for fold in range(10) if fold != 0]
print(train_folds)
# tune_results = get_tune_results(train_folds)
# print(f'tune accuracy matrix, fold {test_fold}: \n{tune_results}')
# k, mean_results = get_optimal_k(tune_results)
# ks.append(k)
# print(f'average accuracy for each k, fold {test_fold}: \n{mean_results}')
# print(f'optimal k for fold {test_fold}: {k}')
clf = DecisionTreeClassifier(num_classes=2, theta_i=0)
X_train, y_train, example_names_train = load_folds(train_folds)
clf.fit(X_train, y_train, cat_feature_idx=[1, 2, 5, 6, 8, 10, 11, 12])
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
acc

[1, 2, 3, 4, 5, 6, 7, 8, 9]




0.7333333333333333