# Libraries

In [167]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import random

## Classes and Methods 

In [168]:

def prepare_data(dataset_1=None, dataset_2=None, dataset_3=None, dataset_4=None):
    '''merge up to four tables and split them into train / validation / test arrays

    The last column of the merged table is treated as the label, every other
    column as an attribute. 20% of the rows become the test set; the remaining
    80% are split 75/25 into training and validation data, which gives
    60% / 20% / 20% overall. Both splits use random_state=11.

    Returns X_train, X_test, X_val, Y_train, Y_test, Y_val (note the order).
    The Y arrays keep a column dimension, i.e. their shape is (n_rows, 1).
    '''
    # stack the separate tables on top of each other
    frames = [dataset_1, dataset_2, dataset_3, dataset_4]
    merged = pd.concat(frames)

    # the "-1:" slice keeps the labels two-dimensional
    labels = merged.iloc[:, -1:].values
    attributes = merged.iloc[:, :-1].values

    # first carve out the test set ...
    X_rest, X_test, Y_rest, Y_test = train_test_split(
        attributes, labels, test_size=0.2, random_state=11)
    # ... then take a quarter of the remainder for validation (0.25 * 0.8 = 0.2)
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_rest, Y_rest, test_size=0.25, random_state=11)

    return X_train, X_test, X_val, Y_train, Y_test, Y_val
    

### Class for Nodes of the Tree
This class only includes the constructor, which assigns values to the variables when a Tree_Node instance is created. Every variable defaults to `None` because not all of them are set on creation: decision nodes leave `value` unset, while leaf nodes set only `value`.

Variables:
- attribute_index
- splitting_value
- left
- right
- info_gain
- value

Methods:
- constructor

In [169]:
class Tree_Node:
    ''' one node of the decision tree: either a decision node or a leaf '''

    def __init__(self, attribute_index=None, splitting_value=None, left=None, right=None, info_gain=None, value=None):
        ''' store the given fields; anything that is not passed stays None '''

        # --- decision node fields ---
        # which attribute is compared, and the threshold it is compared against
        self.attribute_index = attribute_index
        self.splitting_value = splitting_value
        # information gain of the node
        self.info_gain = info_gain
        # the left and right side of the tree below this node
        self.left = left
        self.right = right

        # --- leaf node field ---
        # label that is predicted when the node is a leaf (majority class)
        self.value = value

### Class for the Decision Tree Classifier
Includes all variables and methods needed to build a classification tree using the ID3 algorithm and to make predictions with that tree.

Variables:
- root
- min_samples_split
- max_depth

Methods:
- constructor
- build_tree_ID3
- find_best_split
- split
- calc_information_gain
- calc_entropy
- majority_vote
- predict
- make_prediction


In [170]:
class DecisionTreeClassifier():
    ''' binary classification tree grown greedily by entropy-based information gain

    Every decision node compares one attribute with a threshold
    (attribute <= splitting_value goes left, everything else goes right).
    Every leaf predicts the majority class of the training rows that reached it.
    '''

    def __init__(self, min_samples_split, max_depth):
        ''' constructor '''

        # root node of the tree; stays None until a built tree is assigned to it
        self.root = None

        # stopping criteria
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree_ID3(self, dataset, curr_depth=0):
        ''' recursive ID3 algorithm to build the tree

        dataset is a 2-D array whose last column holds the labels, curr_depth
        is the depth of the node that is being built (0 for the root).
        Returns the Tree_Node at the top of the (sub)tree.
        '''

        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_attributes = np.shape(X)

        # split until stopping criteria are met
        # (nodes at depth 0..max_depth inclusive may still be split)
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            best_split = self.find_best_split(dataset, num_samples, num_attributes)
            # best_split is empty when no threshold separates the rows (e.g. all
            # rows have identical attributes); treat that like "no gain" instead
            # of failing on the missing key
            if best_split and best_split["info_gain"] > 0:
                left_subtree = self.build_tree_ID3(best_split["dataset_left"], curr_depth + 1)
                right_subtree = self.build_tree_ID3(best_split["dataset_right"], curr_depth + 1)
                return Tree_Node(best_split["attribute_index"], best_split["splitting_value"],
                                 left_subtree, right_subtree, best_split["info_gain"])

        # stopping criterion reached or nothing to gain: create a leaf node
        return Tree_Node(value=self.majority_vote(Y))

    def find_best_split(self, dataset, num_samples, num_attributes):
        ''' function to find the best split

        Tries every distinct value of every attribute as threshold and keeps
        the candidate with the highest information gain (the first one wins on
        ties). Returns a dictionary with the keys "attribute_index",
        "splitting_value", "dataset_left", "dataset_right" and "info_gain",
        or an empty dictionary when no candidate produces two non-empty sides.
        num_samples is accepted for interface compatibility but not used.
        '''

        best_split = {}
        max_info_gain = -float("inf")
        # the parent labels are the same for every candidate
        y = dataset[:, -1]

        for attribute_index in range(num_attributes):
            possible_splitting_values = np.unique(dataset[:, attribute_index])
            for value in possible_splitting_values:
                dataset_left, dataset_right = self.split(dataset, attribute_index, value)
                # a candidate that leaves one side empty is not a split
                if len(dataset_left) == 0 or len(dataset_right) == 0:
                    continue
                curr_info_gain = self.calc_information_gain(y, dataset_left[:, -1], dataset_right[:, -1])
                if curr_info_gain > max_info_gain:
                    best_split = {
                        "attribute_index": attribute_index,
                        "splitting_value": value,
                        "dataset_left": dataset_left,
                        "dataset_right": dataset_right,
                        "info_gain": curr_info_gain,
                    }
                    max_info_gain = curr_info_gain

        return best_split

    def split(self, dataset, attribute_index, splitting_value):
        ''' function to split the data into two datasets

        Rows with attribute <= splitting_value go left, rows with
        attribute > splitting_value go right.
        '''

        dataset = np.asarray(dataset)
        column = dataset[:, attribute_index]
        # two explicit comparisons (instead of negating the first mask) keep the
        # behaviour for values that satisfy neither, such as NaN
        return dataset[column <= splitting_value], dataset[column > splitting_value]

    def calc_information_gain(self, parent, l_child, r_child):
        ''' function to calculate the information gain

        gain = entropy(parent) - weighted average of the child entropies,
        each child weighted by its share of the parent's samples.
        '''

        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        children_entropy = weight_l * self.calc_entropy(l_child) + weight_r * self.calc_entropy(r_child)
        return self.calc_entropy(parent) - children_entropy

    def calc_entropy(self, dataset):
        ''' function to calculate the entropy (base 2) of an array of class labels '''

        _, class_counts = np.unique(dataset, return_counts=True)
        probabilities = class_counts / len(dataset)
        # only classes that occur are counted, so log2 never sees a zero
        return -np.sum(probabilities * np.log2(probabilities))

    def majority_vote(self, dataset):
        ''' function to compute value of leaf node by selecting the class that occurs most often in the leaf node

        On a tie the class that appears first in dataset wins.
        '''

        # count in one pass; the dictionary keeps first-occurrence order, and
        # max() returns the first key with the highest count
        class_counts = {}
        for label in dataset:
            class_counts[label] = class_counts.get(label, 0) + 1
        return max(class_counts, key=class_counts.get)

    def predict(self, dataset):
        ''' function to get a predictions list for new data points (empty input gives an empty list) '''

        return [self.make_prediction(data_point, self.root) for data_point in dataset]

    def make_prediction(self, data_point, tree):
        ''' function to predict a single data point by following the tree from the given node to a leaf node '''

        node = tree
        # only leaf nodes carry a value; "is None" keeps falsy labels such as 0 working
        while node.value is None:
            if data_point[node.attribute_index] <= node.splitting_value:
                node = node.left
            else:
                node = node.right
        return node.value


# Decision Tree Classification

## 0. Hyperparameters

In [171]:

# min number of samples in a node in order to allow splitting
min_samples_split=5 
# max depth of the tree
max_depth=50
# set random seed to make results reproducible
# NOTE(review): this seeds Python's built-in `random` module only; nothing in the visible
# code draws from it, and the data splits are fixed by the random_state passed to
# train_test_split -- confirm whether this seed is still needed
random.seed(11)

## 1. Preprocessing

### 1.1 Load Data

#### Hand Gesture Data

In [172]:
# read the four gesture recordings; each file holds the samples of one label
csv_0, csv_1, csv_2, csv_3 = (
    pd.read_csv(path, header=None)
    for path in ('./0.csv', './1.csv', './2.csv', './3.csv')
)

# columns 0-63 hold the measurement values of the muscle activity from the sensors,
# column 64 holds the label (rock - 0, scissors - 1, paper - 2, ok - 3)
print(*(frame.shape for frame in (csv_0, csv_1, csv_2, csv_3)))

(2910, 65) (2903, 65) (2943, 65) (2922, 65)


#### Iris Data

In [173]:
# column names assigned to the iris table; 'type' is the class label and has to stay
# last because prepare_data treats the final column as the label
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
# skiprows=1 drops the first line of the file (presumably its own header row -- verify
# against iris.csv); the names above are used instead
iris_data = pd.read_csv("iris.csv", skiprows=1, header=None, names=col_names)
# show the table dimensions as the cell output
iris_data.shape

(150, 5)

### 1.2 Prepare Data for Model

In [174]:
# merge the four gesture tables and split them into training, validation and test parts
# (mind the return order of prepare_data: X_train, X_test, X_val, Y_train, Y_test, Y_val)
X_train, X_test, X_val, Y_train, Y_test, Y_val = prepare_data(csv_0, csv_1, csv_2, csv_3)
# sanity checks: shape of every part, then container type and element type of the attributes
print(X_train.shape,Y_train.shape,X_val.shape, Y_val.shape, X_test.shape, Y_test.shape)
print(type(X_train))
print(type(X_train[1,1]))

(7006, 64) (7006, 1) (2336, 64) (2336, 1) (2336, 64) (2336, 1)
<class 'numpy.ndarray'>
<class 'numpy.float64'>


## 2. Training

### 2.1 Create Classifier 

In [175]:
# create a Decision Tree Classifier instance using the hyperparameters
# (min_samples_split and max_depth are the stopping criteria used while growing the tree)
classifier = DecisionTreeClassifier(min_samples_split, max_depth)

### 2.2 Build Tree

In [176]:
# Build the classification tree based on the training and validation data.
# The validation part is merged back into the training part here, so from this
# point on X_train / Y_train contain both.
X_train = np.concatenate((X_train, X_val), axis=0)
Y_train = np.concatenate((Y_train, Y_val), axis=0)
# build_tree_ID3 expects a single array whose last column holds the labels
train_dataset = np.concatenate((X_train, Y_train), axis=1)
# the classifier does not store the tree itself; the returned top node is assigned as its root
classifier.root = classifier.build_tree_ID3(train_dataset)
print(type(classifier.root))

## 3. Prediction

In [None]:
# predict gesture classes of the test data using the created decision tree
Y_pred = classifier.predict(X_test)
# also predict the first 1000 rows of the training data (used for the train accuracy)
Y_pred_train = classifier.predict(X_train[:1000,:]) 

## 4. Results

In [None]:
# calculate the accuracy score of the predictions on the test data
acc = accuracy_score(Y_test, Y_pred)
# the train accuracy covers only the first 1000 training rows, matching Y_pred_train
acc_train = accuracy_score(Y_train[:1000], Y_pred_train)

print("The accuracy score on the test data is: " + str(acc))
print("The accuracy score on the train data is: " + str(acc_train))

The accuracy score on the test data is: 0.7675513698630136
The accuracy score on the train data is: 0.84


## Iris Results

In [None]:
# repeat the whole pipeline on the iris data; this rebinds the X_* / Y_* variables
# that held the gesture data before
X_train, X_test, X_val, Y_train, Y_test, Y_val = prepare_data(iris_data)
classifier_iris = DecisionTreeClassifier(min_samples_split, max_depth)
# unlike the gesture run, the validation part is not merged into the training data here
iris_train_dataset = np.concatenate((X_train, Y_train), axis=1)
classifier_iris.root = classifier_iris.build_tree_ID3(iris_train_dataset)
Y_pred = classifier_iris.predict(X_test)
# the iris table has only 150 rows, so the [:1000] slices below select all training rows
Y_pred_train = classifier_iris.predict(X_train[:1000,:]) 
acc = accuracy_score(Y_test, Y_pred)
acc_train = accuracy_score(Y_train[:1000], Y_pred_train)
print("The accuracy score with the Iris dataset on the test data is: " + str(acc))
print("The accuracy score with the Iris dataset on the train data is: " + str(acc_train))

The accuracy score with the Iris dataset on the test data is: 0.9
The accuracy score with the Iris dataset on the train data is: 0.9666666666666667
