In [2]:
import scipy as sp
import numpy as np
import pandas as pd

%run basic_model.ipynb

# Decision Tree

In [3]:
class SplitFunctions:
    @staticmethod
    def gini_impurity(obj, observation_indexes):        
        node_y = obj.y[observation_indexes, :]

        p_0 = (node_y == 0).sum() / node_y.shape[0]
        p_1 = (node_y == 1).sum() / node_y.shape[0]

        return 1 - p_0 * p_0 - p_1 * p_1
    
    @staticmethod
    def gini_index(
        obj,
        _,
        left_split_indexes, 
        right_split_indexes
    ):
        w_left = left_split_indexes.shape[0]
        w_right = right_split_indexes.shape[0]

        W = w_left + w_right

        gini_index = w_left/W * SplitFunctions.gini_impurity(
            obj,
            left_split_indexes
        )

        gini_index += w_right/W * SplitFunctions.gini_impurity(
            obj,
            right_split_indexes
        )

        return gini_index
    
    @staticmethod
    def entropy(obj, observation_indexes):
        node_y = obj.y[observation_indexes, :]

        p_0 = (node_y == 0).sum() / node_y.shape[0]
        p_1 = (node_y == 1).sum() / node_y.shape[0]

        result = 0

        if p_0 != 0:
            result -= p_0*np.log2(p_0)

        if p_1 != 0:
            result -= p_1*np.log2(p_1)

        return result

    @staticmethod
    def information_gain(
        obj,
        observation_indexes,
        left_split_indexes,
        right_split_indexes
    ):
        entropy_before = SplitFunctions.entropy(
            obj,
            observation_indexes
        )

        entropy_after = SplitFunctions.entropy(
            obj,
            left_split_indexes
        )

        entropy_after += SplitFunctions.entropy(
            obj,
            right_split_indexes
        )

        return entropy_before - entropy_after

    @staticmethod
    def gain_ratio(
        obj,
        observation_indexes,
        left_split_indexes,
        right_split_indexes
    ):
        split_info = 0

        w_left = left_split_indexes.shape[0]
        if w_left != 0:
            split_info += w_left * np.log2(w_left)

        w_right = right_split_indexes.shape[0]
        if w_right != 0:
            split_info += w_right * np.log2(w_right)

        information_gain = SplitFunctions.information_gain(
            obj,
            observation_indexes,
            left_split_indexes,
            right_split_indexes
        )

        return information_gain / split_info

    @staticmethod
    def __mse(
        obj,
        observation_indexes
    ):
        labels = obj.y[observation_indexes, :]

        return ((labels - labels.mean())**2).mean()

    @staticmethod
    def mse_sum(
        obj,
        _,
        left_split_indexes,
        right_split_indexes
    ):
        left_mse = SplitFunctions.__mse(obj, left_split_indexes)

        right_mse = SplitFunctions.__mse(obj, right_split_indexes)

        return left_mse + right_mse

    @staticmethod
    def __mae(
        obj,
        observation_indexes
    ):
        labels = obj.y[observation_indexes, :]

        return (np.abs(labels - labels.mean())).sum()

    @staticmethod
    def mae_sum(
        obj,
        _,
        left_split_indexes,
        right_split_indexes
    ):
        left_mae = SplitFunctions.__mae(obj, left_split_indexes)

        right_mae = SplitFunctions.__mae(obj, right_split_indexes)

        return left_mae + right_mae

In [10]:
from collections import namedtuple
import functools

# CART DT (maybe)
class DecisionTree(BasicModel):
    class Node:
        def __init__(
            self,
            feature,
            value,
            observation_indexes,
            left=None,
            right=None,
            answer=None
        ):
            self.feature = feature
            self.split_value = value
            self.observation_indexes = observation_indexes
            self.left = left
            self.right = right
            self.answer = answer
            
        def is_leaf(self):
            return not self.left and not self.right
    
    criterion_name_to_calculator_method_name = {
        'gini': SplitFunctions.gini_index,
        'entropy': SplitFunctions.information_gain,
        'gain_ratio': SplitFunctions.gain_ratio,
        'mse': SplitFunctions.mse_sum,
        'mae': SplitFunctions.mae_sum,
    }
    
    criterion_name_to_cmp = {
        'gini': lambda x, y: x-y,
        'entropy': lambda x, y: y-x,
        'gain_ratio': lambda x, y: y-x,
        'mse': lambda x, y: y-x,
        'mae': lambda x, y: y-x,
    }
    
    def __init__(
        self,
        criterion='gini',
        splitter='best',
        max_depth=None,
        min_samples_split=2,
        max_features=None,
        random_state=42,
        debug=False
    ):
        super().check_value_and_set(
            'criterion',
            criterion,
            ['gini', 'entropy', 'gain_ratio', 'mse', 'mae']
        )
        
        self.__cmp_criterion_values = \
            self.criterion_name_to_cmp[criterion]
        self.__calc_criterion_value = \
            self.criterion_name_to_calculator_method_name[criterion]
        
        if criterion in ['mse', 'mae']:
            self.__task = 'regression'
        else:
            self.__task = 'classification'
        
        super().check_value_and_set(
            'splitter',
            splitter,
            ['best', 'random']
        )
        
        super().check_value_type_and_set(
            'max_depth',
            max_depth,
            (int, type(None))
        )
        if self.max_depth is None:
            self.max_depth = float('inf')
        
        super().check_value_type_and_set(
            'min_samples_split',
            min_samples_split,
            (int, float)
        )
        
        super().check_value_type_and_set(
            'max_features',
            max_features,
            (int, float, str, type(None))
        )
        
        if type(max_features) == str:
            super().check_value_and_set(
                'max_features',
                max_features,
                ['auto', 'sqrt', 'log2']
            )
        if self.max_features == 'auto':
            self.max_features = 'sqrt'
            
        super().check_value_type_and_set(
            'random_state',
            random_state,
            (np.random.RandomState, int)
        )
        if type(random_state) == int:
            self.random_state = np.random.RandomState(random_state)
            
        super().check_value_type_and_set(
            'debug',
            debug,
            bool
        )
        
        self.root = None
    
    @staticmethod
    def process_max_features(input_max_features, X):
        max_features = input_max_features
        
        if input_max_features is None:
            max_features = X.shape[1]
        
        elif type(input_max_features) == float:
            max_features = np.int(
                X.shape[1] * input_max_features
            )
        
        elif type(input_max_features) == str:
            max_features = np.int(
                getattr(np, input_max_features)(X.shape[1])
            )
            
        return max_features
    
    def fit(self, X, y):
        X = super().check_and_transform_X(X)
        y = super().check_and_transform_y(X, y)
        
        self.X, self.y = X, y
        
        self.max_features = DecisionTree.process_max_features(
            self.max_features,
            X
        )
        
        if not self.debug:
            self.__construct_tree()
        
    def predict(self, X):
        assert self.root != None, "Not fitted"
        
        X = super().check_and_transform_X(X)
        
        predictions = []
        
        for el in X:
            prediction = self.__predict_observation(el)
            predictions.append(prediction)
        
        return np.array(predictions).reshape((-1, 1))
            
    def __predict_observation(self, el):        
        node = self.root
        
        while not node.is_leaf():
            if el[node.feature] <= node.split_value:
                node = node.left
            else:
                node = node.right
        
        return node.answer
    
    def __get_split_pairs_gen(
        self,
        feature,
        observation_indexes
    ):
        node_x = self.X[observation_indexes, [feature]]
        
        observation_indexes, node_x = zip(
            *sorted(
                zip(
                    observation_indexes,
                    node_x.ravel()
                ), 
                key=lambda x: x[1]
            )
        )
        
        # to use numpy views
        observation_indexes = np.array(observation_indexes)
        
        uniques = list(dict.fromkeys(node_x).keys())
        
        last_i = 0
        id_unique = 0
        
        while id_unique < len(uniques) - 1:
            for i in range(last_i, len(node_x)):
                if node_x[i] > uniques[id_unique]:
                    last_i = i
                    break
            
            left_split_indexes = observation_indexes[:last_i]
            right_split_indexes = observation_indexes[last_i:]
            
            yield left_split_indexes, right_split_indexes, uniques[id_unique]
            
            id_unique += 1
    
    BestFeatureSplit = namedtuple(
        'BestFeatureSplit', 
        [
            'criterion_value',
            'split_value',
            'left_split_indexes',
            'right_split_indexes'
        ]
    )
    
    # output can be (None, None)
    def __find_best_split_by_feature(
        self, 
        feature, 
        observation_indexes
    ):
        split_pairs_gen = self.__get_split_pairs_gen(
            feature,
            observation_indexes
        )
        
        # find best split
        best_split = DecisionTree.BestFeatureSplit(
            None,
            None,
            None,
            None
        )
        
        for left_split_indexes, right_split_indexes, split_value in split_pairs_gen:
            criterion_value = self.__calc_criterion_value(
                self,
                observation_indexes,
                left_split_indexes,
                right_split_indexes
            )
            
            current_split = DecisionTree.BestFeatureSplit(
                criterion_value,
                split_value,
                left_split_indexes,
                right_split_indexes
            )
            
            if best_split.criterion_value is None or \
               self.__cmp_criterion_values(
                   criterion_value, 
                   best_split.criterion_value
               ) < 0:
                best_split = current_split
        
        return best_split

    def __construct_tree_helper(self, observation_indexes, depth):
        not_splitted_node = DecisionTree.Node(
            None,
            None,
            observation_indexes,
            answer=self.__get_answer(
                observation_indexes
            )
        )
        
        if depth > self.max_depth or \
           observation_indexes.shape[0] < self.min_samples_split:
            return not_splitted_node
        
        features = np.arange(self.X.shape[1])
        if self.splitter == 'random':
            self.random_state.shuffle(features)
        
        best_feature_split = None
        best_feature = None
    
        feature_counter = 0
        
        for feature in features:
            if best_feature and \
               self.splitter == 'random' and \
               feature_counter > self.max_features:
                break
            
            feature_split = self.__find_best_split_by_feature(
                feature,
                observation_indexes
            )
            
            if feature_split.criterion_value is None:
                continue
            
            if best_feature is None or \
               self.__cmp_criterion_values(
                   feature_split.criterion_value,
                   best_feature_split.criterion_value
               ) < 0:
                best_feature_split = feature_split
                best_feature = feature
            
            feature_counter += 1
        
        if best_feature is None or \
           best_feature_split.criterion_value == 0:
            return not_splitted_node
        
        node = DecisionTree.Node(
            best_feature,
            best_feature_split.split_value,
            observation_indexes
        )
        
        node.left = self.__construct_tree_helper(
            best_feature_split.left_split_indexes,
            depth + 1
        )
        
        node.right = self.__construct_tree_helper(
            best_feature_split.right_split_indexes,
            depth + 1
        )
        
        if node.is_leaf():
            node.answer = self.__get_answer(
                observation_indexes
            )
        
        return node
    
    def __get_answer(self, observation_indexes):
        labels = self.y[observation_indexes, :]
        
        if self.__task == 'classification':
            ones = labels.sum()
            threshold = labels.shape[0] / 2
            
            if ones > threshold:
                return 1
            elif ones < threshold:
                return 0
            else:
                return self.random_state.randint(0, 2)
        else:
            return np.mean(labels)
    
    def __construct_tree(self):
        observation_indexes = np.arange(self.y.shape[0])
        
        self.root = self.__construct_tree_helper(
            observation_indexes, 
            1
        )
        
        if self.root is None:
            self.root = DecisionTree.Node(
                None,
                None,
                observation_indexes
            )
            
            self.answer = self.__get_answer(
                observation_indexes
            )

# Testing

In [6]:
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
import unittest
import time

cl_X, cl_y = make_classification(100, 20)
cl_y = cl_y.reshape((100, 1))

regr_X, regr_y = make_regression(100, 20)
regr_y = regr_y.reshape((100, 1))

def time_fit_predict(
    X, 
    y,
    score_names=['ROC AUC'],
    score_funcs=[roc_auc_score],
    *args,
    **kwargs
):
    start = time.time()
    
    dt = DecisionTree(*args, **kwargs)
    dt.fit(X, y)
    
    for score_name, score_func in zip(score_names, score_funcs):
        score = score_func(cl_y, dt.predict(cl_X))

        print("{} criterion {} score: {}".format(
            kwargs['criterion'].capitalize(), 
            score_name,
            score
        ))
    print("Time: {}\n\n".format(time.time() - start))

class TestDecisionTree(unittest.TestCase):
    def test_gini_impurity(self):
        dt = DecisionTree(debug=True)
        
        dt.fit(
            np.array([[1, 2], [1, 2], [1, 2]]),
            np.array([1, 0, 1]).reshape((-1, 1))
        )
        
        self.assertEqual(
            SplitFunctions.gini_impurity(
                dt,
                np.array([0, 2])
            ),
            0
        )
        
        self.assertEqual(
            SplitFunctions.gini_impurity(
                dt,
                np.array([0, 1])
            ),
            0.5
        )
    
    def test_gini(self):
        time_fit_predict(cl_X, cl_y, criterion='gini')
        
    def test_entropy(self):
        time_fit_predict(cl_X, cl_y, criterion='entropy')
    
    def test_gain_ratio(self):
        time_fit_predict(cl_X, cl_y, criterion='gain_ratio')
        
    def test_mse(self):
        time_fit_predict(
            regr_X, 
            regr_y,
            ['MSE', 'MAE'],
            [mean_squared_error, mean_absolute_error],
            criterion='mse'
        )
    
    def test_mae(self):
        time_fit_predict(
            regr_X, 
            regr_y,
            ['MSE', 'MAE'],
            [mean_squared_error, mean_absolute_error],
            criterion='mae'
        )
    
    def test_max_features_and_random_state(self):
        print('\nDifference in next 3 results means it works')
        
        for i in range(3):
            time_fit_predict(
                cl_X,
                cl_y,
                splitter='random',
                max_features='sqrt',
                criterion='gini',
                random_state=i
            )
    
    def test_max_depth(self):
        max_depth = 5
        
        def check_depth(node, depth=1):
            if not node.is_leaf():
                return check_depth(node.left, depth+1) and \
                       check_depth(node.right, depth+1)
            
            if depth > max_depth:
                result = False
            
            return True
        
        dt = DecisionTree(max_depth=max_depth)
        dt.fit(cl_X, cl_y)
        
        self.assertEqual(
            check_depth(dt.root),
            True
        )
    
    def test_min_samples_split(self):
        min_samples_split = 10
        
        def check_samples_split(node):
            if node.is_leaf():
                return True
            
            is_satisfied = \
                node.observation_indexes.shape[0] >= min_samples_split
            
            return is_satisfied and \
                   check_samples_split(node.left) and \
                   check_samples_split(node.right)
        
        dt = DecisionTree(min_samples_split=min_samples_split)
        dt.fit(cl_X, cl_y)
        
        self.assertEqual(
            check_samples_split(dt.root),
            True
        )

In [7]:
with open('tmp', "w") as f:
    runner = unittest.TextTestRunner(f)
    obj = unittest.main(
        argv=['first-arg-is-ignored', '--verbose'], 
        testRunner=runner,
        exit=False
    )

! cat tmp
! rm -r tmp

Entropy criterion ROC AUC score: 1.0
Time: 1.4752299785614014


Gain_ratio criterion ROC AUC score: 1.0
Time: 1.598250150680542


Gini criterion ROC AUC score: 0.9799919967987195
Time: 0.20250868797302246


Mae criterion MSE score: 8424.338077550778
Mae criterion MAE score: 73.53506606450037
Time: 0.5962285995483398



Difference in next 3 results means it works
Gini criterion ROC AUC score: 0.9799919967987195
Time: 0.05823206901550293


Gini criterion ROC AUC score: 0.959983993597439
Time: 0.08164811134338379


Gini criterion ROC AUC score: 0.9215686274509804
Time: 0.07732915878295898


Mse criterion MSE score: 2399.7822855564905
Mse criterion MAE score: 39.203353697966186
Time: 1.4021096229553223


.........
----------------------------------------------------------------------
Ran 9 tests in 5.896s

OK


entropy and gain_ratio are slower due to calculation of logarithm, mse --- due to exponentiation.