In [1]:
import numpy as np
import pandas as pd
import math
import seaborn as sns

### Node Class

Tree structure is build by Node Objects. Each node object has an attribute and a classification when trained, as well as a parent node and the value associated with the parents attribute.
The nodes are trained with train_data and also offer the printTree function.

In [2]:
class Node:
    '''
    class Node innitialises a tree structure for a non-binary tree
    it has the typical setter and getter methods and a method to remove a 
    child from the list of children
    '''
    def __init__(self, parent = None, attribute = None, classification = None, value = None, valueIsContinuous = False, target = None):
        self.children = []  
        self.parent = parent
        self.attribute = attribute
        self.classification = classification
        self.value = value
        self.valueIsContinuous = valueIsContinuous
        self.target = target

    
    def setChild(self, node):
        self.children.append(node)
        
    def setParent(self, node):
        if (self.parent is not None):
            self.parent.children.remove(self)
        self.parent = node

    def getChildren(self):
        return self.children
    
    def getParent(self):
        return self.parent
    
    def deleteChild(self, node):
        if (node in self.children):
            node.parent = None
            self.children.remove(node)
        else:
            raise TypeError("Child not in Children")
        
    def isLeaf(self):
        if len(self.children) > 0:
            return False
        else:
            return True

    def isRoot(self):
        if (self.parent is None):
            return True
        else:
            return False
    
    def setAttribute(self, attribute):
        self.attribute = attribute
    
    def getAttribute(self):
        return self.attribute
    
    def setClassification(self, classification):
        self.classification = classification
    
    def getClassification(self):
        return self.classification
    
    def setValue(self,value):
        self.value = value

    def getValue(self):
        return self.value
    

    def printTree(self, level = 0):
        tab = "    "
        if level == 0:
            print("Decision tree:")
        if self.isLeaf():
            print(tab*level, "classification: ", self.target, " = " , self.classification)
        else:
            for child in self.children:
                # printing intervals
                if child.valueIsContinuous:
                    interval = "" 
                    if child.value[0] == np.NINF:
                        interval = f"smaller then {child.value[1]}"
                    elif child.value[1] == np.PINF:
                        interval = f"bigger then {child.value[0]}"
                    else:
                        interval = f"between {child.value[0]} and {child.value[1]}"
                    print(tab*level, self.attribute, ": ", interval)
                
                # printing discrete values
                else:
                    print(tab*level, self.attribute, ": ", child.value)
                
                # traverse deeper into the tree
                child.printTree(level + 1)

### train_Data Class

The class represents a data (sub)set that is used to train the (sub)tree. It has a (root) node which it is responsible to train (choose attribute, classification, and child nodes) via the id3 function.
I can also deal with continuous variables by ...

In [3]:
class train_data:
    '''
    class train_data has all important functions for ID3 Algorithm:
    it can calculate the entropy of some data, the information gain, choose an attriute.
    within the ID3 algorithm a tree will be trained.
    the function retrain(data) retrains a trained tree with new data.
    '''
    def __init__(self, data, target, attributes, node:Node = None, recursion_depth = 0, continuous_splitting = 0.1,  max_recursion = 10):
        
        self.data = data
        if not isinstance(self.data, pd.DataFrame):
            raise TypeError("Data has to be a Pandas Dataframe")
        
        self.target = target
        if not isinstance(self.target, str):
            raise TypeError("Taget has to be of type string")
        
        self.attributes = attributes
        if not isinstance(attributes, list):
            raise TypeError("Attributes have to have structure list")
            
        for attribute in self.attributes:
            if not isinstance(attribute, str):
                raise TypeError("Attributes have to be of type string")

        self.node = node
        self.continuous_splitting = continuous_splitting
        self.recursion_depth = recursion_depth
        self.max_recursion = max_recursion
    
    ######################################
    ## methods for continuous variables ##
    ######################################
    
    def is_continuous(self, values):
        # checks is variable is a continuous variable
        # (it is continuous if it has more than 10 different values and is a numericla scalar)
        if len(values) > 10:
            if isinstance(list(values)[5], int) or isinstance(list(values)[0], float):
                return True
        return False

    
    def getBoundaries(self, tColumn, aColumn):
        # by looking at the target column and the attribute column the
        # function decides on decision boundaries in a continuous varibale, where classification changes
        # aColumn -> attribute column with the continuous values
        # tColumn -> target column with the classification
        
        # 1) sort the two columns by attribute values
        columns = pd.DataFrame(data={"a":list(aColumn), "t":list(tColumn)}).sort_values(by="a")
        columns.index = range(len(columns))

        # 2) find decision boundaries where classification changes
        leftBound = np.NINF # first interval has negative infinity as left boundary
        rightBound = None
        boundaries = []
        currentClass = columns["t"][0]
        
        for i in range(len(columns)):
            
            # when classification changes
            if(columns["t"][i] != currentClass):
                currentClass = columns["t"][i]
                
                # get the value in the middel of the values where classification changes
                beforeSwitch = columns["a"][i-1]
                afterSwitch = columns["a"][i]
                rightBound = (beforeSwitch + afterSwitch) / 2

                # safe the tupple of two boundaries 
                # represents an interval with a uniform classification
                boundaries.append((leftBound, rightBound))
                leftBound = rightBound
        
        # last interval has negative infinity as right boundary
        boundaries.append((leftBound, np.PINF))
        
        # if the getBoundaries function returns more then 10 intervals
        # set intervals indipendent of classification
        if len(boundaries) > 10:
            return self.setBoundaries(aColumn)
        
        return boundaries
    
    def setBoundaries(self, aColumn):
        # if the getBoundaries function returns more then 10 intervals
        # sets 10 eaqually sized intervals indipendent of classification
        
        # calculate size of intervals
        maximum = np.max(aColumn)
        minimum = np.min(aColumn)
        stepsize = (maximum - minimum)/ 10
        boundaries = []
        
        # make a tupel for each interval
        leftBound = np.NINF
        rightBound = minimum + stepsize
        for i in range(9):
            boundaries.append((leftBound, rightBound))
            leftBound = rightBound
            rightBound = leftBound + stepsize
        boundaries.append((leftBound, np.PINF))
        
        return boundaries
        
    
    def replaceContinuous(self, boundaries, aColumn):
        # replaces the continuous values of an attribute by the
        # tuples that represent an interval
        
        newAColumn = []
        for value in aColumn:
            # find the interval that includes the value
            foundInterval = False
            for l, r in boundaries:
                if value >= l and value < r:
                    newAColumn.append((l, r))
                    foundInterval = True
                    break
            if foundInterval == False:
                raise TypeError("could not find and interval for ", value)
        
        return pd.Series(newAColumn)
    
    
    #######################################
    ## methods for choosing an attribute ##
    #######################################
    
    
    def entropy(self, targetColumn):
        # calculates entropy 
        #targetColumn = self.data.loc[:, self.target]

        values = set(targetColumn)
        entropySum = 0
        for value in values:
            p = list(targetColumn).count(value) / len(targetColumn)
            entropySum = entropySum + (- p * np.log(p))

        return entropySum
    

    def informationGain(self, attributeColumn, values):
        # calculates the informationGain
        gainSum = 0
        for value in values:
            mask = lambda aColumn, value :(row == value for row in aColumn) 
            subsetData = self.data.iloc[mask(attributeColumn, value),:]
            subsetTargetColumn = subsetData[self.target]
            # claculate entropy and normalize by size of subsets
            gainSum = gainSum + (len(subsetData)/ len(self.data)) * self.entropy(subsetTargetColumn)

        # substract summed and weighted entropy of subsets from entropy of whole set
        infoGain = self.entropy(self.data.loc[:, self.target]) - gainSum

        return infoGain

    def gainRatio(self, attributeColumn, values):
        # calculating the Gain Ratio instead of the InforamtionGain
        # to prefer attributes with few values
        
        infoGain = self.informationGain(attributeColumn, values)

        splitInfo = 0.0
        for value in values:
            
            subset = attributeColumn[attributeColumn == value]
            # proportion of subset size and whole set size
            s = len(subset) / len(attributeColumn)
            if s != 0.0:
                splitInfo = splitInfo + ((- s) * np.log(s))
        
        # to avoid dividing by zero
        if splitInfo == 0:
            splitInfo = infoGain
            if infoGain == 0:
                return 0
            
        return infoGain / splitInfo
        

    def chooseAttribute(self):
        # chooses an attribute that maximises GainRatio
        
        maxGain= 0
        maxAttribute = ""

        # calculate Gain Ratio for each attribute
        for attribute in self.attributes:
            
            attributeColumn = self.data[attribute]
            values = set(attributeColumn)
            gain = 0
            
            # replace the values in attributeColumn with continuous values by Intervals
            if self.is_continuous(values):
                targetColumn = self.data[self.target]
                boundaries = self.getBoundaries(targetColumn, attributeColumn)
                attributeColumn = self.replaceContinuous(boundaries, attributeColumn)
                values = set(attributeColumn)            
                
            # calculate gainRatio
            gain = self.gainRatio(attributeColumn, values)

            # store attribute with highest information gain
            if gain >= maxGain:
                maxGain = gain
                maxAttribute = attribute

        # choose attribute with highest Information Gain
        return maxAttribute

    ############################################
    ## methods for building the decision tree ##
    ############################################
    
    def classify(self):
        # returns the most commen classification of the dataset
        
        targetColumn = self.data.loc[:, self.target]
        values = set(targetColumn)
        maxClass = 0  # highest number of values
        classification = "" # classification of most common value
        for value in values:
            # check if calssification value is more common then other classification values
            if list(targetColumn).count(value) > maxClass:
                maxClass = list(targetColumn).count(value)
                classification = value

        return classification
    
    def sortIntervals(self, unsortedV):
        sortedV = []
        leftBound = np.NINF
        for value in unsortedV:
            if value[0] == leftBound:
                leftBound = value[1]
                sortedV.append(value)
        return sortedV

    def id3(self):
        # base cases:
        # 1) all instances have same target value -> leaf node with target value
        if (self.data[self.target].nunique() == 1):
            self.node.setClassification(self.data[self.target].iloc[0])
            #print("basecase1")-----------------------------------------------------------------
            return 
        # 2) out of discriptive features -> leaf node with majority of target values
        if (not self.attributes):
            self.node.setClassification(self.classify())
            #print("basecase2")-----------------------------------------------------------------
            return
        # 3) no instances left in dataset -> take majority of parent node
        if (self.data is None):
            parent = self.node.getParent()
            self.node.setClassification(parent.getClassification())
            #print("basecase3")-----------------------------------------------------------------
            return
        # 4) maximal recursion depth:
        if self.recursion_depth >= self.max_recursion:
            self.node.setClassification(self.classify())
            #print("basecase4")-----------------------------------------------------------------
            return


        # recursive case:
        # choose attribute with highest explainatory power
        #print("in recursion")-----------------------------------------------------------------
        #print("attributs: ", self.attributes)-----------------------------------------------------------------
        attribute = self.chooseAttribute()
        self.node.setAttribute(attribute)
        self.node.setClassification(self.classify())

        # split data according to attribute
        attributeColumn = self.data.loc[:, attribute]
        values = set(attributeColumn)
        new_attributes = self.attributes
        new_attributes.remove(attribute)
        
        recursion_depth = self.recursion_depth + 1

        # chosen attribute is a continuous variable:
        valueIsContinuous=False
        if self.is_continuous(values):
            #print("continuous")-----------------------------------------------------------------
            
            targetColumn = self.data[self.target]
            boundaries = self.getBoundaries(targetColumn, attributeColumn)
            attributeColumn = self.replaceContinuous(boundaries, attributeColumn)
            values = set(attributeColumn)
            values = self.sortIntervals(values)
            valueIsContinuous = True
        
        # create leaf node for each attribute value
        for value in values:
            # get the subset determined by the attribute value
            mask = lambda aColumn, value :(row == value for row in aColumn) 
            subsetData = self.data.iloc[mask(attributeColumn, value),:]
            # create a node in the tree
            childNode = Node(parent=self.node, value=value, valueIsContinuous=valueIsContinuous, target=self.target)
            self.node.setChild(childNode)
            # train the node with the data subset
            subset = train_data(data=subsetData, 
                                target=self.target, 
                                attributes=new_attributes, 
                                node=childNode, 
                                recursion_depth=recursion_depth, 
                                max_recursion = self.max_recursion)
            subset.id3() # recursive call on all partitions
            

    def retrain(self, data):
        self.data = data
        self.id3()
    

### test_data Class

to classify datapoints and test accuracy of tree

In [4]:
class test_data:
    
    def __init__(self, testData, target, node:Node):
        self.testData = testData
        self.target = target
        self.rootNode = node

        # check whether node is trained:
        if node.getAttribute() is None:
            raise TypeError("node has to be part of a trained Decisiontree")

    
    def classify(self, datapoint, node):
        
        # get leaf node classification (basecase)
        if node.isLeaf() == True:
            return node.getClassification()

        # traverse down the tree with the decision nodes (recursive case)
        else:
            attribute = node.getAttribute()
            dataValue = datapoint.loc[attribute]
            for child in node.getChildren():
                cValue = child.getValue()
                # for interval values
                if child.valueIsContinuous:
                    if dataValue >= cValue[0] and dataValue < cValue[1]:
                        return self.classify(datapoint, child)
                # for discrete values
                elif cValue is dataValue:
                    return self.classify(datapoint, child)
        
        # if there are no children with the right value at decision node, get current classification (base case)
        return node.getClassification()
        
    def classifySet(self):
        classes = []
        #print("testData: ", self.testData)
        for i in range(len(self.testData)):
            datapoint = self.testData.iloc[i]
            #print("datapoint: ", datapoint)
            classes.append(self.classify(datapoint, self.rootNode))
        return classes
    
    def accuracy(self):
        classes = self.classifySet()
        targets = self.testData[self.target]
        
        errors = []
        for target, classification in zip(targets, classes):
            #print("target: ", target , " class: ", classification)
            if target == classification:
                errors.append(True)
            else:
                errors.append(False)
                
        return np.mean(errors)



### prepare_data function

In [5]:
def prepare_data(data:pd.DataFrame, tratio = 0.1):
    
    # remove any Nans from Dataframe
    data = data.dropna(how='any')
    
    # shuffle data
    data = data.sample(frac=1).reset_index(drop=True)
    
    # variables to return
    testData = []
    trainingData = []

    # check whether tratio is smaller than 1
    if tratio >= 1:
        raise TypeError("tratio has to be smaller than 1")
    
    # get length of dataframe
    dataLength = data.shape[0]
    # get chunck size:
    chunkSize = int(dataLength * tratio) 
    # get number of chunks
    nr_chunks = int(1/tratio)

    # append data set to existing dataset
    doubleData = data.append(data)

    # itterate through doubleData, assigh a certain chunk to testing and training
    for chunk in range(nr_chunks-1):
        testData.append(doubleData.iloc[chunk: chunkSize*(chunk+1), :])
        trainingData.append(doubleData.iloc[chunkSize*(chunk+1): chunkSize*(nr_chunks+chunk), :])
    
    return [testData, trainingData]

### training with pokemon data

In [6]:
# 1. load data
data = pd.read_csv("data/pokemon_no_duplicates.csv")

# 2. prepare data
data = prepare_data(data)

# 3. choose the target value
target = "Generation"

# 4. train a tree for each chunk of the training set
for trainingSet in data[0]:
    
    attributes = list(trainingSet.columns)
    attributes.remove(target)
    
    rootNode = Node()
    decisionTree = train_data(data=trainingSet, target=target, attributes=attributes, node=rootNode, max_recursion = 5)
    decisionTree.id3()

    rootNode.printTree()

  doubleData = data.append(data)


Decision tree:
 # :  smaller then 154.5
     classification:  Generation  =  1
 # :  between 154.5 and 242.0
     classification:  Generation  =  2
Decision tree:
 # :  smaller then 152.0
     classification:  Generation  =  1
 # :  between 152.0 and 243.5
     classification:  Generation  =  2
Decision tree:
 # :  smaller then 155.0
     classification:  Generation  =  1
Decision tree:
 # :  smaller then 156.5
     classification:  Generation  =  1
Decision tree:
 # :  smaller then 156.5
     classification:  Generation  =  1
Decision tree:
 # :  smaller then 156.0
     classification:  Generation  =  1
 # :  between 156.0 and 253.0
     classification:  Generation  =  2
Decision tree:
 # :  smaller then 156.0
     classification:  Generation  =  1
 # :  between 156.0 and 253.5
     classification:  Generation  =  2
Decision tree:
 # :  smaller then 156.0
     classification:  Generation  =  1
 # :  between 156.0 and 253.5
     classification:  Generation  =  2
Decision tree:
 # :  sm

### training with seaborn toy datasets

In [7]:
# check out available dataset form seaborn

# datasets from seaborn:
# ['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes',
#'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'geyser',
#'iris', 'mpg', 'penguins', 'planets', 'taxis', 'tips', 'titanic']

data = sns.load_dataset("titanic")
data = data.dropna(how = "any")
print(data.columns)
print(data)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')
     survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
1           1       1  female  38.0      1      0  71.2833        C  First   
3           1       1  female  35.0      1      0  53.1000        S  First   
6           0       1    male  54.0      0      0  51.8625        S  First   
10          1       3  female   4.0      1      1  16.7000        S  Third   
11          1       1  female  58.0      0      0  26.5500        S  First   
..        ...     ...     ...   ...    ...    ...      ...      ...    ...   
871         1       1  female  47.0      1      1  52.5542        S  First   
872         0       1    male  33.0      0      0   5.0000        S  First   
879         1       1  female  56.0      0      1  83.1583        C  First   
887         1       1  female  

In [8]:
data = sns.load_dataset("penguins")

# 2. prepare data
data = prepare_data(data)

# 3. choose the target value
target = "species"

# 4. train a tree for each chunk of the training set
decisionTrees = []
for trainingSet in data[0]:
    
    attributes = list(trainingSet.columns)
    attributes.remove(target)
    
    rootNode = Node()
    decisionTree = train_data(data=trainingSet, target=target, attributes=attributes, node=rootNode, max_recursion = 10)
    decisionTree.id3()
    decisionTrees.append(rootNode)
    rootNode.printTree()

for testingSet, tree in zip(data[1], decisionTrees):
    
    testData = test_data(testingSet, target, tree)
    print(testData.accuracy())
    
    

Decision tree:
 flipper_length_mm :  smaller then 195.0
     classification:  species  =  Adelie
Decision tree:
 island :  Dream
     bill_length_mm :  smaller then 40.3
         classification:  species  =  Adelie
 island :  Biscoe
     flipper_length_mm :  smaller then 200.0
         classification:  species  =  Adelie
     flipper_length_mm :  bigger then 200.0
         classification:  species  =  Gentoo
 island :  Torgersen
     classification:  species  =  Adelie
Decision tree:
 island :  Dream
     bill_length_mm :  smaller then 40.55
         classification:  species  =  Adelie
     bill_length_mm :  between 40.55 and 41.0
         classification:  species  =  Chinstrap
 island :  Biscoe
     bill_depth_mm :  smaller then 16.9
         classification:  species  =  Gentoo
     bill_depth_mm :  bigger then 16.9
         classification:  species  =  Adelie
 island :  Torgersen
     classification:  species  =  Adelie


  doubleData = data.append(data)


Decision tree:
 island :  Dream
     bill_length_mm :  smaller then 40.55
         classification:  species  =  Adelie
     bill_length_mm :  between 40.55 and 41.0
         classification:  species  =  Chinstrap
 island :  Biscoe
     flipper_length_mm :  smaller then 203.0
         classification:  species  =  Adelie
 island :  Torgersen
     classification:  species  =  Adelie
Decision tree:
 island :  Dream
     bill_length_mm :  smaller then 40.9
         classification:  species  =  Adelie
     bill_length_mm :  between 40.9 and 41.0
         sex :  Female
             classification:  species  =  Chinstrap
         sex :  Male
             classification:  species  =  Adelie
 island :  Biscoe
     flipper_length_mm :  smaller then 203.0
         classification:  species  =  Adelie
 island :  Torgersen
     classification:  species  =  Adelie
Decision tree:
 island :  Dream
     bill_length_mm :  smaller then 40.9
         classification:  species  =  Adelie
     bill_length_mm :

In [9]:

# 1. load data
data = sns.load_dataset("titanic")
data = data.drop("alive", axis=1)

accuraciesAll = []
for ratio in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
    # 2. prepare data
    dataPrepared = prepare_data(data = data, tratio=ratio)

    # 3. choose the target value
    target = "survived"

    accuracies = []
    decisionTrees = []
    for trainingSet in dataPrepared[0]:

        attributes = list(trainingSet.columns)
        attributes.remove(target)

        rootNode = Node()
        decisionTree = train_data(data=trainingSet, target=target, attributes=attributes, node=rootNode, max_recursion = 10)
        decisionTree.id3()
        decisionTrees.append(rootNode)
        #rootNode.printTree()

    for testingSet, tree in zip(dataPrepared[1], decisionTrees):

        testData = test_data(testingSet, target, tree)
        accuracies.append(testData.accuracy())
        
    accuraciesAll.append(np.mean(accuracies))
print(accuraciesAll) 

  doubleData = data.append(data)
  doubleData = data.append(data)
  doubleData = data.append(data)
  doubleData = data.append(data)
  doubleData = data.append(data)
  doubleData = data.append(data)


[0.6746691289627578, 0.676954732510288, 0.673611111111111, 0.6574074074074074, 0.6666666666666666, 0.7032967032967034]
