In [148]:
import torch
import numpy as np

In [149]:
import pandas
#dataset taken from https://www.kaggle.com/yashsawarn/wifi-stretgth-for-rooms

USE_GINI = True

def read_dataset(csv_name = 'wifi_localization.txt'):
    """
    Reads a csv dataset
    returns it as a pytorch tensor
    """
    data_frame = pandas.read_table(csv_name, delim_whitespace=True, names=('A', 'B', 'C', 'D','E', 'F', 'G', 'ROOM'),
                       dtype={'A': np.int64, 'B': np.float64, 'C': np.float64, 'D': np.float64,'E': np.float64,'F': np.float64,'G': np.float64,'ROOM': np.float64})

    targets_torch = torch.tensor(data_frame['ROOM'].values)
    dataset_torch = torch.tensor(data_frame.values)

    return dataset_torch

# Implementación de la clasificación multi-clase con árboles de decisión

In [192]:
class Node_CART:
    def __init__(self, num_classes = 4, ref_CART = None, current_depth = 0):
        """
        Create the node attributes
        param num_classes: K number of classes to classify
        param ref_cart: reference to the tree containing the node
        param current_depth: current depth of the node in the tree
        """
        self.ref_CART = ref_CART
        self.threshold_value = 0
        self.feature_num = 0
        self.node_right = None
        self.node_left = None
        self.data_torch_partition = None
        self.gini = 0
        self.dominant_class = None
        self.accuracy_dominant_class = None
        self.num_classes = num_classes
        self.current_depth = current_depth

    def to_xml(self, current_str = ""):
        """
        Recursive function to write the node content to an xml formatted string
        param current_str : the xml content so far in the whole tree
        return the string with the node content
        """
        str_node = "<node><thresh>" + str(self.threshold_value) + "</thresh>" + "<feature>" + str(self.feature_num) + "</feature><depth>" + str(self.current_depth)+ "</depth>"
        str_node += "<gini>" + str(self.gini) + "</gini>"
        if(self.node_right != None):
            str_left = self.node_right.to_xml(current_str)
            str_node += str_left
        if(self.node_left != None):
            str_right = self.node_left.to_xml(current_str)
            str_node += str_right

        if(self.is_leaf()):
            str_node += "<dominant_class>" + str(self.dominant_class) + "</dominant_class><acc_dominant_class>"  + str(self.accuracy_dominant_class) + "</acc_dominant_class>"
        str_node += "</node>"
        return str_node

    def is_leaf(self):
        """
        Checks whether the node is a leaf
        """
        return (self.node_left == None and self.node_right == None)

    def create_with_children(self, data_torch, current_depth, list_selected_features = [], min_gini = 0.000001):
        """
        Creates a node by selecting the best feature and threshold, and if needed, creating its children
        param data_torch: dataset with the current partition to deal with in the node
        param current_depth: depth counter for the node
        param list_selected_features: list of selected features so far for the CART building process
        param min_gini: hyperparmeter selected by the user defining the minimum tolerated gini coefficient for a  node
        return the list of selected features so far
        """
        self.data_torch_partition = data_torch
        self.threshold_value, self.feature_num, self.gini = self.select_best_feature_and_thresh(data_torch, list_selected_features)
        # if not at max depth
        if current_depth < self.ref_CART.max_CART_depth and self.gini >= min_gini:
            # if we found an appropiate feature + threshold
            if self.feature_num != float('inf') and self.threshold_value != float('inf'):
                list_selected_features.append(self.feature_num)
                # left
                self.node_left = Node_CART(ref_CART=self.ref_CART, current_depth=current_depth+1)
                list_selected_features_left = self.node_left.create_with_children(data_torch[data_torch[:,self.feature_num] <= self.threshold_value], (current_depth+1), list_selected_features, min_gini)
                # right
                self.node_right = Node_CART(ref_CART=self.ref_CART, current_depth=current_depth+1)
                list_selected_features_right = self.node_right.create_with_children(data_torch[data_torch[:,self.feature_num] > self.threshold_value], (current_depth+1), list_selected_features, min_gini)
                # combinar listas
                list_selected_features = list(set(list_selected_features_left + list_selected_features_right))
        # dominant class
        labels, counts = torch.unique(data_torch[:,-1], return_counts=True)
        dominant_index = torch.argmax(counts)
        dominant_class = labels[dominant_index].item()
        self.dominant_class = dominant_class
        return list_selected_features

    def calculate_gini(self, data_partition_torch: torch.tensor, num_classes: int = 4):
        """
        Calculates the gini coefficient for a given partition with the given number of classes
        param data_partition_torch: current dataset partition as a tensor
        param num_classes: K number of classes to discriminate from
        returns the calculated gini coefficient
        """
        ROOM_COLUMN_INDEX = 7
        class_counts = torch.bincount(data_partition_torch[:, -1], minlength=num_classes + 1)
        class_counts = class_counts[1:]
        class_probabilities = class_counts.float() / data_partition_torch.shape[0]
        gini_coef = 1.0 - torch.sum(class_probabilities ** 2)
        return gini_coef

    def calculate_entropy(self, data_partition_torch, num_classes = 4):
        """
        Calculates the entropy for a given partition with the given number of classes
        param data_partition_torch: current dataset partition as a tensor
        param num_classes: K number of classes to discriminate from
        returns the calculated entropy
        """
        ROOM_COLUMN_INDEX = 7
        class_counts = torch.bincount(data_partition_torch[:, -1], minlength=num_classes + 1)
        class_counts = class_counts[1:]
        p_k = class_counts.float() / data_partition_torch.shape[0]
        p_k[p_k == 0.0] = 1.0 # Remove 0 value, since log2(0) = -inf and log2(1) = 0
        return -1*torch.sum(p_k*torch.log2(p_k))

    def evaluate_node(self, input_torch):
        """
        Evaluates an input observation within the node.
        If is not a leaf node, send it to the corresponding node
        return predicted label
        """
        feature_val_input = input_torch[self.feature_num]
        if(self.is_leaf()):
            return self.dominant_class
        else:
            if(feature_val_input < self.threshold_value):
                return self.node_left.evaluate_node(input_torch)
            else:
                return self.node_right.evaluate_node(input_torch)

    def evaluate_optimal_gini(self, right_data_partition: torch.tensor,
                              left_data_partition: torch.tensor,
                              n_data_partition: int,
                              num_classes: int,
                              error_func):
        
        return (right_data_partition.shape[0]/n_data_partition) * error_func(right_data_partition, num_classes).item() + \
                (left_data_partition.shape[0]/n_data_partition) * error_func(left_data_partition, num_classes).item()

    def select_best_feature_and_thresh(self, data_torch, list_features_selected = [], num_classes = 4):
        """
        Selects the best feature and threshold that minimizes the gini coefficient
        param data_torch: dataset partition to analyze
        param list_features_selected list of features selected so far, thus must be ignored
        param num_classes: number of K classes to discriminate from
        return min_thresh, min_feature, min_gini found for the dataset partition when
        selecting the found feature and threshold
        """

        min_thresh = float('inf')
        min_feature = 0
        min_gini = float('inf')

        total_data_torch = data_torch.shape[1]

        # -1 to ignore label
        for feature in range(total_data_torch - 1):
          if (feature not in list_features_selected):
            for threshold in data_torch[:, feature]:
                left_data = data_torch[data_torch[:,feature] <= threshold]
                right_data = data_torch[data_torch[:,feature] > threshold]
                # solution to avoid division by 0 in calculate_gini in case a partition has 0 elements
                if (left_data.shape[0] > 0 and right_data.shape[0] > 0):
                    if USE_GINI:
                        weighted_err = self.evaluate_optimal_gini(right_data, left_data, total_data_torch, num_classes, self.calculate_gini)
                    else:
                        #weighted_err = self.evaluate_optimal_gini(data_torch[:,feature], num_classes,self.calculate_entropy) # TODO: Is input data correct?
                        weighted_err = self.evaluate_optimal_gini(right_data, left_data, total_data_torch, num_classes, self.calculate_entropy)
                    if (weighted_err < min_gini):
                        min_thresh = threshold
                        min_feature = feature
                        min_gini = weighted_err
        return (min_thresh, min_feature, min_gini)


class CART:
    def __init__(self, dataset_torch, max_CART_depth, min_observations = 2):
        """
        CART has only one root node
        """
        #min observations per node
        self.min_observations = min_observations
        self.list_selected_features = []
        self.root = Node_CART(num_classes = 4, ref_CART = self, current_depth = 0)
        self.max_CART_depth = max_CART_depth
        print(self.list_selected_features)
        

    def get_root(self):
        """
        Gets tree root
        """
        return self.root

    def get_min_observations(self):
        """
        return min observations per node
        """
        return self.min_observations

    def get_max_depth(self):
        """
        Gets the selected max depth of the tree
        """
        return self.max_CART_depth

    def build_CART(self, data_torch):
        """
        Build CART from root
        """
        self.list_selected_features = self.root.create_with_children(data_torch, current_depth = 0, list_selected_features=self.list_selected_features)

    def to_xml(self, xml_file_name):
        """
        write Xml file with tree content
        """
        str_nodes = self.root.to_xml()
        file = open(xml_file_name,"w+")
        file.write(str_nodes)
        file.close()
        return str_nodes

    def evaluate_input(self, input_torch):
        """
        Evaluate a specific input in the tree and get the predicted class
        """
        return self.root.evaluate_node(input_torch)

def train_CART(dataset_torch, name_xml = "", max_CART_depth = 3, min_obs_per_leaf = 2):
    """
    Train CART model
    """
    tree = CART(dataset_torch = dataset_torch, max_CART_depth = max_CART_depth, min_observations =  min_obs_per_leaf)
    tree.build_CART(dataset_torch)
    if(not name_xml == ""):
        tree.to_xml(name_xml)
    return tree

def test_CART(tree, testset_torch):
    """
    Test a previously built CART
    """
    #ROOM_COLUMN_INDEX = 7
    ROOM_COLUMN_INDEX = -1
    class_columns = testset_torch[:, -1].int()
    num_classes = torch.bincount(class_columns)[1:].shape[0]
    n = testset_torch.shape[0]
    predicted_values = []
    c = 0
    for current_observation in testset_torch:
        real_value = current_observation[ROOM_COLUMN_INDEX].item()
        predicted_value = tree.evaluate_input(current_observation)
        predicted_values.append(predicted_value)
        c = c+1 if predicted_value == real_value else c
        #print('predicted_value=[{}], num_classes=[{}], real_value=[{}]'.format(predicted_value, num_classes, real_value))
    return c/n, predicted_values

In [151]:
# PRUEBA DE JOSEF. TODO: BORRAR
data = torch.tensor([
    [1, 0, 1, 1],
    [1, 0, 1, 1],
    [1, 1, 0, 1],
    [1, 1, 0, 1],
    [1, 1, 0, 2],
    [1, 1, 0, 2],
    [1, 1, 1, 2],
    [1, 1, 1, 2]
])
cart = CART(data, 4)
cart.build_CART(data)
print(cart.list_selected_features)
print(cart.root.node_left, cart.root.node_right)
cart.to_xml('prueba_josef.xml')

test_CART(cart, data)

[]
[1, 2]
<__main__.Node_CART object at 0x00000110F818C2D0> <__main__.Node_CART object at 0x00000110F6D93090>
dominant_class=[2], num_classes=[2], real_value=[1]
dominant_class=[2], num_classes=[2], real_value=[1]
dominant_class=[2], num_classes=[2], real_value=[1]
dominant_class=[2], num_classes=[2], real_value=[1]
dominant_class=[2], num_classes=[2], real_value=[2]
dominant_class=[2], num_classes=[2], real_value=[2]
dominant_class=[2], num_classes=[2], real_value=[2]
dominant_class=[2], num_classes=[2], real_value=[2]


0.5

## Gini Unit Test

In [152]:
import unittest

class GiniUnitTest(unittest.TestCase):

    def test_singleClassOneData(self):
      data = torch.tensor([[-64, -56,	-61,	-66,	-71,	-82,	-81,	1]])
      node = Node_CART()
      gini_result = node.calculate_gini(data, num_classes=1)
      self.assertTrue(torch.equal(gini_result, torch.tensor(0.0)))

    def test_twoClassesOneDataPerClass(self):
      data = torch.tensor([[-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	2]])
      node = Node_CART()
      gini_result = node.calculate_gini(data, num_classes=2)
      self.assertTrue(torch.equal(gini_result, torch.tensor(0.5)))

    def test_twoClassesOnlyOneClassWithData(self):
      data = torch.tensor([[-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	1]])
      node = Node_CART()
      gini_result = node.calculate_gini(data, num_classes=2)
      self.assertTrue(torch.equal(gini_result, torch.tensor(0.0)))

    def test_fourClassesOneDataPerClass(self):
      data = torch.tensor([[-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	2], [-64, -56,	-61,	-66,	-71,	-82,	-81,	3], [-64, -56,	-61,	-66,	-71,	-82,	-81,	4]])
      node = Node_CART()
      gini_result = node.calculate_gini(data, num_classes=4)
      self.assertTrue(torch.equal(gini_result, torch.tensor(0.75)))

    def test_fourClassesOnlyTwoClassesWithData(self):
      data = torch.tensor([[-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	4], [-64, -56,	-61,	-66,	-71,	-82,	-81,	4]])
      node = Node_CART()
      gini_result = node.calculate_gini(data, num_classes=4)
      self.assertTrue(torch.equal(gini_result, torch.tensor(0.5)))

    def test_fourClassesOnlyThreeClassesWithData(self):
      data = torch.tensor([[-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	3], [-64, -56,	-61,	-66,	-71,	-82,	-81,	4]])
      node = Node_CART()
      gini_result = node.calculate_gini(data, num_classes=4)
      self.assertTrue(torch.equal(gini_result, torch.tensor(0.625)))

## Entropy Unit Test

In [153]:
class EntropyUnitTest(unittest.TestCase):
    def test_twoClasses5and9(self):
      data = torch.tensor([
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2]
      ])
      node = Node_CART()
      entropy_result = node.calculate_entropy(data, num_classes=2)
      self.assertAlmostEqual(entropy_result.item(), 0.94, delta=0.01)

    def test_twoClassesEqualItems(self):
      data = torch.tensor([
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	2]
      ])
      node = Node_CART()
      entropy_result = node.calculate_entropy(data, num_classes=2)
      self.assertAlmostEqual(entropy_result.item(), 1.0, delta=0.01)

    def test_twoClassesAllItemsOneClass(self):
      data = torch.tensor([
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1],
        [-64, -56,	-61,	-66,	-71,	-82,	-81,	1]
      ])
      node = Node_CART()
      entropy_result = node.calculate_entropy(data, num_classes=2)
      self.assertAlmostEqual(entropy_result.item(), 0.0, delta=0.01)

## Feat and Thresh Selection Unit test

In [193]:
class BestFeatAndThreshUnitTest(unittest.TestCase):

    def test_optimal_gini(self):
      right_data = torch.tensor([[-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	2], [-64, -56,	-61,	-66,	-71,	-82,	-81,	3], [-64, -56,	-61,	-66,	-71,	-82,	-81,	4]])
      left_data = torch.tensor([[-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	1], [-64, -56,	-61,	-66,	-71,	-82,	-81,	3], [-64, -56,	-61,	-66,	-71,	-82,	-81,	4]])
      node = Node_CART()
      result = node.evaluate_optimal_gini(right_data, left_data, n_data_partition=8, num_classes=4, error_func=node.calculate_gini)
      self.assertEqual(result, 0.6875)

    def test_twoClassesOneDataPerClass(self):
      USE_GINI = True
      data = torch.tensor([ [-64, -56,	-61,	-66,	-71,	-82,	-81,  1],
                            [-50, -14,	-1,	-90,	-100,	-80,	-84,	2],
                            [-25, -50,	-100,	-9,	-10,	-85,	-70,	2],
                            [-64, -56,	-61,	-66,	-71,	-82,	-81,  1]])
      node = Node_CART()
      ((min_thresh, min_feature, min_gini)) = node.select_best_feature_and_thresh(data, num_classes=2)
      self.assertEqual(min_thresh, torch.tensor(-64))
      self.assertEqual(min_feature, torch.tensor(0.0))
      self.assertEqual(min_gini, torch.tensor(0.0))

    def test_twoClassesOneDataPerClass_2(self):
      USE_GINI = False
      data = torch.tensor([ [1, 1, 1,	1, 1,	1, 1,	1],
                            [1, 1, 0,	1, 1,	1, 1,	1],
                            [1, 1, 1,	1, 1,	1, 1,	2],
                            [1, 1, 1,	1, 1,	1, 1,	2]])
      node = Node_CART()
      ((min_thresh, min_feature, min_gini)) = node.select_best_feature_and_thresh(data, num_classes=2)
      self.assertEqual(min_thresh, torch.tensor(0.))
      self.assertEqual(min_feature, torch.tensor(2))
      self.assertEqual(min_gini, torch.tensor(0.34436094760894775))

## Create with Children Unit test

In [155]:
class CreateWithChildrenUnitTest(unittest.TestCase):
    def test_twoClassesTwoDataPerClass(self):
      data = torch.tensor([ [-64, -56,	-61,	-90,	-71,	-82,	-81,  1],
                            [-64, -14,	-61,	-90,	-71,	-82,	-81,  2],
                            [-64, -56,	-61,	-90,	-71,	-82,	-81,  2],
                            [-64, -56,	-60,	-90,	-71,	-82,	-81,  1]])
      cart = CART(data, 2)
      cart.build_CART(data)
      self.assertEqual(cart.list_selected_features, [1, 2])

    def test_twoClassesTwoDataPerClass_depth(self):
      data = torch.tensor([ [-64, -56,	-61,	-90,	-71,	-82,	-81,  1],
                            [-64, -14,	-61,	-90,	-71,	-82,	-81,  2],
                            [-64, -56,	-61,	-90,	-71,	-82,	-81,  2],
                            [-64, -56,	-60,	-90,	-71,	-82,	-81,  1]])
      cart = CART(data, 1)
      cart.build_CART(data)
      self.assertEqual(cart.list_selected_features, [1])


unittest.main(argv=[''], verbosity=2, exit=False)

test_optimal_gini (__main__.BestFeatAndThreshUnitTest.test_optimal_gini) ... ok
test_twoClassesOneDataPerClass (__main__.BestFeatAndThreshUnitTest.test_twoClassesOneDataPerClass) ... ok
test_twoClassesTwoDataPerClass (__main__.CreateWithChildrenUnitTest.test_twoClassesTwoDataPerClass) ... ok
test_twoClassesTwoDataPerClass_depth (__main__.CreateWithChildrenUnitTest.test_twoClassesTwoDataPerClass_depth) ... ok
test_twoClasses5and9 (__main__.EntropyUnitTest.test_twoClasses5and9) ... ok
test_twoClassesAllItemsOneClass (__main__.EntropyUnitTest.test_twoClassesAllItemsOneClass) ... ok
test_twoClassesEqualItems (__main__.EntropyUnitTest.test_twoClassesEqualItems) ... ok
test_fourClassesOneDataPerClass (__main__.GiniUnitTest.test_fourClassesOneDataPerClass) ... 

[]
[]


ok
test_fourClassesOnlyThreeClassesWithData (__main__.GiniUnitTest.test_fourClassesOnlyThreeClassesWithData) ... ok
test_fourClassesOnlyTwoClassesWithData (__main__.GiniUnitTest.test_fourClassesOnlyTwoClassesWithData) ... ok
test_singleClassOneData (__main__.GiniUnitTest.test_singleClassOneData) ... ok
test_twoClassesOneDataPerClass (__main__.GiniUnitTest.test_twoClassesOneDataPerClass) ... ok
test_twoClassesOnlyOneClassWithData (__main__.GiniUnitTest.test_twoClassesOnlyOneClassWithData) ... ok

----------------------------------------------------------------------
Ran 13 tests in 0.291s

OK


<unittest.main.TestProgram at 0x110f6cd3e10>

## test_CART Unit Test

In [156]:
class TestCARTUnitTest(unittest.TestCase):

    def test_dummyTestCART(self):
      self.assertTrue(True)

## Ejecución Unit Tests

In [194]:
unittest.main(argv=[''], verbosity=2, exit=False)

test_optimal_gini (__main__.BestFeatAndThreshUnitTest.test_optimal_gini) ... ok
test_twoClassesOneDataPerClass (__main__.BestFeatAndThreshUnitTest.test_twoClassesOneDataPerClass) ... ok
test_twoClassesOneDataPerClass_2 (__main__.BestFeatAndThreshUnitTest.test_twoClassesOneDataPerClass_2) ... FAIL
test_twoClassesTwoDataPerClass (__main__.CreateWithChildrenUnitTest.test_twoClassesTwoDataPerClass) ... ok
test_twoClassesTwoDataPerClass_depth (__main__.CreateWithChildrenUnitTest.test_twoClassesTwoDataPerClass_depth) ... ok
test_twoClasses5and9 (__main__.EntropyUnitTest.test_twoClasses5and9) ... ok
test_twoClassesAllItemsOneClass (__main__.EntropyUnitTest.test_twoClassesAllItemsOneClass) ... ok
test_twoClassesEqualItems (__main__.EntropyUnitTest.test_twoClassesEqualItems) ... ok
test_fourClassesOneDataPerClass (__main__.GiniUnitTest.test_fourClassesOneDataPerClass) ... ok
test_fourClassesOnlyThreeClassesWithData (__main__.GiniUnitTest.test_fourClassesOnlyThreeClassesWithData) ... ok
test_fou

[]
[]


<unittest.main.TestProgram at 0x110ff8f2010>

# Evaluación del CART

## 1-

In [184]:
from sklearn.metrics import f1_score
dataset_torch = read_dataset().int()
print('dataset_torch = [\n{}\n]'.format(dataset_torch))

############################### USING GINI ################################
USE_GINI = True

print('[GINI] - Test CART with max_CART_depth=[3] and min_obs_per_leaf=[2]')
tree_3 = train_CART(dataset_torch, name_xml = "CART_example.xml", max_CART_depth=3, min_obs_per_leaf=2)
acc, predicted_values = test_CART(tree_3, dataset_torch)
f1 = f1_score(dataset_torch[:,-1], predicted_values, average='weighted') 

print(f'[GINI] - Accuracy rate=[{acc}], f1_score=[{f1}]')

#########

print('[GINI] - Test CART with max_CART_depth=[4] and min_obs_per_leaf=[2]')
tree_4 = train_CART(dataset_torch, name_xml = "CART_example.xml", max_CART_depth=4, min_obs_per_leaf=2)
acc, predicted_values = test_CART(tree_4, dataset_torch)
f1 = f1_score(dataset_torch[:,-1], predicted_values, average='weighted') 

print(f'[GINI] - Accuracy rate=[{acc}], f1_score=[{f1}]')

############################### USING ENTROPY ################################
USE_GINI = False

print('[ENTROPY] - Test CART with max_CART_depth=[3] and min_obs_per_leaf=[2]')
tree_3 = train_CART(dataset_torch, name_xml = "CART_example.xml", max_CART_depth=3, min_obs_per_leaf=2)
acc, predicted_values = test_CART(tree_3, dataset_torch)
f1 = f1_score(dataset_torch[:,-1], predicted_values, average='weighted') 

print(f'[ENTROPY] - Accuracy rate=[{acc}], f1_score=[{f1}]')

#########

print('[ENTROPY] - Test CART with max_CART_depth=[4] and min_obs_per_leaf=[2]')
tree_4 = train_CART(dataset_torch, name_xml = "CART_example.xml", max_CART_depth=4, min_obs_per_leaf=2)
acc, predicted_values = test_CART(tree_4, dataset_torch)
f1 = f1_score(dataset_torch[:,-1], predicted_values, average='weighted') 

print(f'[ENTROPY] - Accuracy rate=[{acc}], f1_score=[{f1}]')

dataset_torch = [
tensor([[-64, -56, -61,  ..., -82, -81,   1],
        [-68, -57, -61,  ..., -85, -85,   1],
        [-63, -60, -60,  ..., -85, -84,   1],
        ...,
        [-62, -59, -46,  ..., -87, -88,   4],
        [-62, -58, -52,  ..., -90, -85,   4],
        [-59, -50, -45,  ..., -88, -87,   4]], dtype=torch.int32)
]
[GINI] - Test CART with max_CART_depth=[3] and min_obs_per_leaf=[2]
[]
[GINI] - Accuracy rate=[0.9155], f1_score=[0.9164107351015577]
[GINI] - Test CART with max_CART_depth=[4] and min_obs_per_leaf=[2]
[]
[GINI] - Accuracy rate=[0.7315], f1_score=[0.6530178455802124]
[ENTROPY] - Test CART with max_CART_depth=[3] and min_obs_per_leaf=[2]
[]
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYY

## -2

In [185]:
from sklearn.model_selection import train_test_split

dataset_torch = read_dataset().int()
print('dataset_torch = [\n{}\n]'.format(dataset_torch))

############################### USING GINI ################################
USE_GINI = True

print('[GINI] - Test CART with max_CART_depth=[3] and min_obs_per_leaf=[2]')
X_train, X_test, y_train, y_test = train_test_split(dataset_torch[:,:-1], dataset_torch[:,-1], test_size=0.3, train_size=0.7, random_state=42)
dataset_train = torch.column_stack((X_train, y_train))
dataset_test = torch.column_stack((X_test, y_test)) 
tree_3 = train_CART(dataset_train, name_xml = "CART_example.xml", max_CART_depth=3, min_obs_per_leaf=2)
acc, predicted_values = test_CART(tree_3, dataset_test)
f1 = f1_score(dataset_test[:,-1], predicted_values, average='weighted') 

print(f'[GINI] - Accuracy rate=[{acc}], f1_score=[{f1}]')

#########

print('[GINI] - Test CART with max_CART_depth=[4] and min_obs_per_leaf=[2]')
X_train, X_test, y_train, y_test = train_test_split(dataset_torch[:,:-1], dataset_torch[:,-1], test_size=0.3, train_size=0.7, random_state=42)
dataset_train = torch.column_stack((X_train, y_train))
dataset_test = torch.column_stack((X_test, y_test)) 
tree_4 = train_CART(dataset_train, name_xml = "CART_example.xml", max_CART_depth=4, min_obs_per_leaf=2)
acc, predicted_values = test_CART(tree_4, dataset_test)
f1 = f1_score(dataset_test[:,-1], predicted_values, average='weighted') 

print(f'[GINI] - Accuracy rate=[{acc}], f1_score=[{f1}]')

############################### USING ENTROPY ################################
USE_GINI = False

print('[ENTROPY] - Test CART with max_CART_depth=[3] and min_obs_per_leaf=[2]')
X_train, X_test, y_train, y_test = train_test_split(dataset_torch[:,:-1], dataset_torch[:,-1], test_size=0.3, train_size=0.7, random_state=42)
dataset_train = torch.column_stack((X_train, y_train))
dataset_test = torch.column_stack((X_test, y_test)) 
tree_3 = train_CART(dataset_train, name_xml = "CART_example.xml", max_CART_depth=3, min_obs_per_leaf=2)
acc, predicted_values = test_CART(tree_3, dataset_test)
f1 = f1_score(dataset_test[:,-1], predicted_values, average='weighted') 

print(f'[ENTROPY] - Accuracy rate=[{acc}], f1_score=[{f1}]')

#########

print('[ENTROPY] - Test CART with max_CART_depth=[4] and min_obs_per_leaf=[2]')
X_train, X_test, y_train, y_test = train_test_split(dataset_torch[:,:-1], dataset_torch[:,-1], test_size=0.3, train_size=0.7, random_state=42)
dataset_train = torch.column_stack((X_train, y_train))
dataset_test = torch.column_stack((X_test, y_test)) 
tree_4 = train_CART(dataset_train, name_xml = "CART_example.xml", max_CART_depth=4, min_obs_per_leaf=2)
acc, predicted_values = test_CART(tree_4, dataset_test)
f1 = f1_score(dataset_test[:,-1], predicted_values, average='weighted') 

print(f'[ENTROPY] - Accuracy rate=[{acc}], f1_score=[{f1}]')

dataset_torch = [
tensor([[-64, -56, -61,  ..., -82, -81,   1],
        [-68, -57, -61,  ..., -85, -85,   1],
        [-63, -60, -60,  ..., -85, -84,   1],
        ...,
        [-62, -59, -46,  ..., -87, -88,   4],
        [-62, -58, -52,  ..., -90, -85,   4],
        [-59, -50, -45,  ..., -88, -87,   4]], dtype=torch.int32)
]
[GINI] - Test CART with max_CART_depth=[3] and min_obs_per_leaf=[2]
[]
[GINI] - Accuracy rate=[0.9216666666666666], f1_score=[0.9217110326345751]
[GINI] - Test CART with max_CART_depth=[4] and min_obs_per_leaf=[2]
[]
[GINI] - Accuracy rate=[0.8966666666666666], f1_score=[0.898185203209469]
[ENTROPY] - Test CART with max_CART_depth=[3] and min_obs_per_leaf=[2]
[]
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYYYYYYYYYYYYYY
ENTROPYYYYYYYYYY