In [1]:
import numpy as np
import pandas as pd

### Node Class

Tree structure is build by Node Objects. Each node object has an attribute and a classification when trained, as well as a parent node and the value associated with the parents attribute and a classification.
The nodes are trained by tran_data and also offer the printTree function.

In [2]:
class Node:
    '''
    class Node innitialises a tree structure for a non-binary tree
    it has the typical setter and getter methods and a method to remove a 
    child from the list of children
    '''
    def __init__(self, parent = None, attribute = None, classification = None, value = None, valueIsContinuous = False, target = None):
        self.children = []  
        self.parent = parent
        self.attribute = attribute
        self.classification = classification
        self.value = value
        self.valueIsContinuous = valueIsContinuous
        self.target = target
        

    def setChild(self, node):
        self.children.append(node)
        
    def setParent(self, node):
        if (self.parent is not None):
            self.parent.children.remove(self)
        self.parent = node

    def getChildren(self):
        return self.children
    
    def getParent(self):
        return self.parent
    
    def deleteChild(self, node):
        if (node in self.children):
            node.parent = None
            self.children.remove(node)
        else:
            raise TypeError("Child not in Children")
        
    def isLeaf(self):
        if len(self.children) > 0:
            return False
        else:
            return True

    def isRoot(self):
        if (self.parent is None):
            return True
        else:
            return False
    
    def setAttribute(self, attribute):
        self.attribute = attribute
    
    def getAttribute(self):
        return self.attribute
    
    def setClassification(self, classification):
        self.classification = classification
    
    def getClassification(self):
        return self.classification
    
    def setValue(self,value):
        self.value = value

    def getValue(self):
        return self.value
    

    def printTree(self, level = 0):
        tab = "    "
        if self.isLeaf():
            print(tab*level, "classification: ", self.target, " = " , self.classification)
        else:
            for child in self.children:
                
                # printing intervals
                if child.valueIsContinuous:
                    interval = ""
                    if child.value[0] == np.NINF:
                        interval = f"smaller then {child.value[1]}"
                    elif child.value[1] == np.PINF:
                        interval = f"bigger then {child.value[0]}"
                    else:
                        interval = f"between {child.value[0]} and {child.value[1]}"
                    print(tab*level, self.attribute, ": ", interval)
                
                # printing discrete values
                else:
                    print(tab*level, self.attribute, ": ", child.value)
                
                # traverse deeper into the tree
                child.printTree(level + 1)

### train_Data Class

The class represents a data (sub)set that is used to train the (sub)tree. It has a (root) node which it is responsible to train (choose attribute, classification, and child nodes) via the id3 function.
I can also deal with continuous variables by ...

In [3]:
class train_data:
    '''
    class train_data has all important functions for ID3 Algorithm:
    it can calculate the entropy of some data, the information gain, choose an attriute.
    within the ID3 algorithm a tree will be trained.
    the function retrain(data) retrains a trained tree with new data.
    '''
    def __init__(self, data, target, attributes, node:Node = None, recursion_depth = None, continuous_splitting = 0.1,  max_recursion = 10):
        
        self.data = data
        if not isinstance(self.data, pd.DataFrame):
            raise TypeError("Data has to be a Pandas Dataframe")
        
        self.target = target
        if not isinstance(self.target, str):
            raise TypeError("Taget has to be of type string")
        
        self.attributes = attributes
        if not isinstance(attributes, list):
            raise TypeError("Attributes have to have structure list")
            
        for attribute in self.attributes:
            if not isinstance(attribute, str):
                raise TypeError("Attributes have to be of type string")

        self.node = node
        self.continuous_splitting = continuous_splitting
        self.recursion_depth = recursion_depth
        self.max_recursion = max_recursion
    
    ######################################
    ## methods for continuous variables ##
    ######################################
    
    def is_continuous(self, values):
        # checks is variable is a continuous variable
        # (it is continuous if it has more than 10 different values and is a numericla scalar)
        if len(values) > 10:
            if isinstance(list(values)[5], int) or isinstance(list(values)[5], float):
                return True
        return False

    
    def getBoundaries(self, tColumn, aColumn):
        # by looking at the target column and the attribute column the
        # function decides on deciosion boundaries in a continuous varibale, where classification changes
        # aColumn -> attribute column with the continuous values
        # tColumn -> target column with the classification
        
        # 1) sort the two columns
        columns = pd.DataFrame(data={"a":list(aColumn), "t":list(tColumn)}).sort_values(by="a")
        columns.index = range(len(columns))

        # 2) find decision boundaries where classification changes
        leftBound = np.NINF
        rightBound = None
        boundaries = []
        currentClass = columns["t"][0]
        for i in range(len(columns)):
            # when classification changes
            if(columns["t"][i] != currentClass):
                currentClass = columns["t"][i]
                
                # get the value in the middel
                beforeSwitch = columns["a"][i-1]
                afterSwitch = columns["a"][i]
                rightBound = (beforeSwitch + afterSwitch) / 2

                # safe the tupple of two boundaries with a uniform classification
                boundaries.append((leftBound, rightBound))
                leftBound = rightBound
        
        # last tupple that does not get triggerd by a switch of classification
        boundaries.append((leftBound, np.PINF))
        
        # if the getBoundaries function returns more then 10 intervals
        # set intervals indipendent of classification
        if len(boundaries) > 10:
            return self.setBoundaries(aColumn)
        
        return boundaries
    
    def setBoundaries(self, aColumn):
        # if the getBoundaries function returns more then 10 intervals
        # sets intervals indipendent of classification
        
        # calculate size of intervals
        maximum = np.max(aColumn)
        minimum = np.min(aColumn)
        stepsize = (maximum - minimum)/ 10
        boundaries = []
        
        # make a tupel for each interval
        leftBound = np.NINF
        rightBound = minimum + stepsize
        for i in range(9):
            boundaries.append((leftBound, rightBound))
            leftBound = rightBound
            rightBound = leftBound + stepsize
        boundaries.append((leftBound, np.PINF))
        
        return boundaries
        
    
    def replaceContinuous(self, boundaries, aColumn):
        # replaces the continuous values of an attribute by the
        # tuples that represent an interval
        
        newAColumn = []
        for value in aColumn:
            for l, r in boundaries:
                if value >= l and value < r:
                    newAColumn.append((l, r))
        return newAColumn
    
    
    #######################################
    ## methods for choosing an attribute ##
    #######################################
    
    
    def entropy(self):
        # calculates entropy 
        targetColumn = self.data.loc[:, self.target]

        values = set(targetColumn)
        entropySum = 0
        for value in values:
            p = list(targetColumn).count(value) / len(targetColumn)
            entropySum = entropySum + (- p * np.log(p))

        return entropySum
    

    def informationGain(self, attributeColumn, values):
        # calculates the informationGain
        gainSum = 0
        
        for value in values:
            
            subsetData = self.data[attributeColumn == value]
            subset = train_data(subsetData, self.target, self.attributes)
            # claculate entropy and normalize by size of subsets
            gainSum = gainSum + (subsetData.shape[0] / self.data.shape[0]) * subset.entropy()

        # substract summed and weighted entropy of subsets from entropy of whole set    
        infoGain = self.entropy() - gainSum

        return infoGain

    def gainRatio(self, attributeColumn, values):
        # calculating the Gain Ratio instead of the InforamtionGain
        # to prefer attributes with few values
        
        infoGain = self.informationGain(attributeColumn, values)

        splitInfo = 0
        for value in values:
            subset = attributeColumn[attributeColumn == value]
            # proportion of subset size and whole set size
            s = len(subset) / len(attributeColumn)
            print("size proportion:", s)
            print("for value ", value ," in ", attributeColumn)
            splitInfo = splitInfo + (- s * np.log(s))
            
        
        if splitInfo == 0:
            splitInfo = infoGain
            
        return infoGain / splitInfo
        

    def chooseAttribute(self):
        # chooses an attribute that maximises GainRatio
        
        maxGain= 0
        maxAttribute = ""

        # calculate Gain Ratio for each attribute
        for attribute in self.attributes:
            
            print("--------------------------------------------------------------------------")
            attributeColumn = self.data.loc[:, attribute]
            values = set(attributeColumn)
            gain = 0

            # replace the values in attributeColumn with continuous values by Intervals
            print(f"{attribute} is continuous and has {len(values)} values: {self.is_continuous(values)}")
            if self.is_continuous(values):
                targetColumn = self.data[self.target]
                boundaries = self.getBoundaries(targetColumn, attributeColumn)
                attributeColumn = self.replaceContinuous(boundaries, attributeColumn)
                values = set(attributeColumn)
                print("")
                print(f"replaced values of {attribute} by intervals: \n{values}")
                print("")                
                
            # calculate gainRatio
            gain = self.gainRatio(attributeColumn, values)

            # store attribute with highest information gain
            if gain >= maxGain:
                maxGain = gain
                maxAttribute = attribute

        # choose attribute with highest Information Gain
        return maxAttribute

    ############################################
    ## methods for building the decision tree ##
    ############################################
    
    def classify(self):
        # returns the most commen classification of the dataset
        
        targetColumn = self.data.loc[:, self.target]
        values = set(targetColumn)
        maxClass = 0  # highest number of values
        classification = "" # classification of most common value
        for value in values:
            # check if calssification value is more common then other classification values
            if list(targetColumn).count(value) > maxClass:
                maxClass = list(targetColumn).count(value)
                classification = value

        return classification

    def id3(self):
        # base cases:
        # 1) all instances have same target value -> leaf node with target value
        if (self.data[self.target].nunique() == 1):
            self.node.setClassification(self.data[self.target].iloc[0])
            print("basecase1")
            return 
        # 2) out of discriptive features -> leaf node with majority of target values
        if (not self.attributes):
            self.node.setClassification(self.classify())
            print("basecase2")
            return
        # 3) no instances left in dataset -> take majority of parent node
        if (self.data is None):
            parent = self.node.getParent()
            self.node.setClassification(parent.getClassification())
            print("basecase3")
            return
        # 4) maximal recursion depth:
        if self.recursion_depth == self.max_recursion:
            self.node.setClassification(self.classify())
            print("basecase4")
            return


        # recursive case:
        # choose attribute with highest explainatory power
        print("in recursion")
        print("attributs: ", self.attributes)
        attribute = self.chooseAttribute()
        self.node.setAttribute(attribute)
        self.node.setClassification(self.classify())

        # split data according to attribute
        attributeColumn = self.data.loc[:, attribute]
        values = set(attributeColumn)
        new_attributes = self.attributes
        new_attributes.remove(attribute)
        recursion_depth = self.recursion_depth + 1

        valueIsContinuous=False
        
        # chosen attribute is a continuous variable:
        if self.is_continuous(values):
            print("continuous")
            
            targetColumn = self.data[self.target]
            boundaries = self.getBoundaries(targetColumn, attributeColumn)
            attributeColumn = self.replaceContinuous(boundaries, attributeColumn)
            values = set(attributeColumn)
            valueIsContinuous=True
        
        # create leaf node for each attribute value
        for value in values:
            subsetData = self.data[attributeColumn == value]
            childNode = Node(parent=self.node, value=value, valueIsContinuous=valueIsContinuous, target=self.target)
            self.node.setChild(childNode)
            subset = train_data(data=subsetData, target=self.target, attributes=new_attributes, node=childNode, recursion_depth=recursion_depth)

            # recursive call on all partitions
            subset.id3()
            

    def retrain(self, data):
        self.data = data
        self.id3()
    

### test_data Class

to classify datapoints and test accuracy of tree

In [4]:
class test_data:
    
    def __init__(self, testData, target, node:Node):
        self.testData = testData
        self.target = target
        self.node = node

        # check whether node is trained:
        if node.getAttribute() is None:
            raise TypeError("node has to be part of a trained Decisiontree")


    def calcError(self, datapoint):

        # compare leaf node classification and datapoint classification (basecase)
        if self.node.isLeaf() == False:
            return self.node.getClassification() == datapoint[self.target]

        # traverse down the tree with the decision nodes (recursive case)
        else:
            attribute = self.node.getAttribute()
            dataValue = datapoint[attribute]
            for child in self.node.children:
                # for interval values
                if child.valueIsContinuous:
                    if dataValue >= child.getValue()[0] and dataValue < child.getValue()[1]:
                        return test_data(datapoint, self.target, child).calcError()
                # for discrete values
                if child.getValue() is dataValue:
                    return test_data(datapoint, self.target, child).calcError()
        
        # if there are no children with the right value at decision node, use current classification (base case)
        return self.node.getClassification() == self.target

    
    def calcClassification(self, datapoint):
        
        # get leaf node classification (basecase)
        if self.node.isLeaf() == False:
            return self.node.getClassification()

        # traverse down the tree with the decision nodes (recursive case)
        else:
            attribute = self.node.getAttribute()
            dataValue = datapoint[attribute]
            for child in self.node.children:
                # for interval values
                if child.valueIsContinuous:
                    if dataValue >= child.getValue()[0] and dataValue < child.getValue()[1]:
                        return test_data(datapoint, self.target, child).calcClassification()
                # for discrete values
                if child.getValue() is dataValue:
                    return test_data(datapoint, self.target, child).calcClassification()
        
        # if there are no children with the right value at decision node, get current classification (base case)
        return self.node.getClassification()
        
    def classify(self):
        classificationArray = []
        for i in range(self.testData.shape[0]):
            datapoint = self.testData.loc[i]
            classificationArray.append(self.calcClassification(datapoint))
        
        return classificationArray
    
    def accuracy(self):
        errorArray = []
        for i in range(self.testData.shape[0]):
            datapoint = self.testData.loc[i]
            errorArray.append(self.calcError(datapoint))
        
        return np.mean(errorArray)



### Main

In [5]:
d= {"gender": ["f", "f", "f", "f", "f", "m", "m", "m", "m", "m"],
                   "vegan": [True, True, True, False, False, True, False, False, False, False],
                   "coxi": [True, True, True, False, True, True, True, False, False, False],
                   "green": [True, True, True, False, False, True, False, True, False, True]}

data = pd.DataFrame(data = d)

target = "vegan"
attributes = list(data.columns)
attributes.remove(target)

rootNode = Node()

decisionTree = train_data(data=data, target=target, attributes=attributes, node=rootNode, recursion_depth=5)
decisionTree.id3()

rootNode.printTree()

in recursion
attributs:  ['gender', 'coxi', 'green']
--------------------------------------------------------------------------
gender is continuous and has 2 values: False
size proportion: 0.5
for value  f  in  0    f
1    f
2    f
3    f
4    f
5    m
6    m
7    m
8    m
9    m
Name: gender, dtype: object
size proportion: 0.5
for value  m  in  0    f
1    f
2    f
3    f
4    f
5    m
6    m
7    m
8    m
9    m
Name: gender, dtype: object
--------------------------------------------------------------------------
coxi is continuous and has 2 values: False
size proportion: 0.4
for value  False  in  0     True
1     True
2     True
3    False
4     True
5     True
6     True
7    False
8    False
9    False
Name: coxi, dtype: bool
size proportion: 0.6
for value  True  in  0     True
1     True
2     True
3    False
4     True
5     True
6     True
7    False
8    False
9    False
Name: coxi, dtype: bool
--------------------------------------------------------------------------
green i

In [6]:
datapoints = {"gender": ["f", "m", "m"],
                "vegan": [True, False, False],
                "coxi": [False, False, False],
                "green": [False, False, True]}

datapoints = pd.DataFrame(data = datapoints)

print("accuracy: ", test_data(datapoints, "vegan", rootNode).accuracy() )
print("classifications: ", test_data(datapoints, "vegan", rootNode).classify())

accuracy:  0.6666666666666666
classifications:  [False, False, False]


In [7]:
# 1. load data
data = pd.read_csv("data/pokemon_no_duplicates.csv")

# 2. prepare data
#data = prepare_data(data)

# choose the target value
data = data.drop(columns=["Name", "#"])
target = "Generation"
attributes = list(data.columns)
attributes.remove(target)
# 3. split_data with id3
rootNode = Node()
decisionTree = train_data(data=data, target=target, attributes=attributes, node=rootNode, recursion_depth=0)
decisionTree.id3()

rootNode.printTree()

in recursion
attributs:  ['Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Legendary']
--------------------------------------------------------------------------
Type 1 is continuous and has 18 values: False
size proportion: 0.0319001386962552
for value  Ghost  in  0        Grass
1        Grass
2        Grass
3         Fire
4         Fire
        ...   
716       Dark
717     Dragon
718       Rock
719    Psychic
720       Fire
Name: Type 1, Length: 721, dtype: object
size proportion: 0.09153952843273232
for value  Grass  in  0        Grass
1        Grass
2        Grass
3         Fire
4         Fire
        ...   
716       Dark
717     Dragon
718       Rock
719    Psychic
720       Fire
Name: Type 1, Length: 721, dtype: object
size proportion: 0.056865464632454926
for value  Rock  in  0        Grass
1        Grass
2        Grass
3         Fire
4         Fire
        ...   
716       Dark
717     Dragon
718       Rock
719    Psychic
720       Fire


  splitInfo = splitInfo + (- s * np.log(s))
  splitInfo = splitInfo + (- s * np.log(s))


KeyError: False

In [None]:

#datapoints = data.loc(5) #error

#print("accuracy: ", test_data(datapoints, "Generation", rootNode).accuracy() )
#print("classifications: ", test_data(datapoints, "Generation", rootNode).classify())

In [None]:
def getBoundaries(tColumn, aColumn):
        # by looking at the target column and the attribute column the
        # function decides on deciosion boundaries in a continuous varibale, where classification changes
        
        # 1) sort the two columns
        columns = pd.DataFrame(data={"a":list(aColumn), "t":list(tColumn)}).sort_values(by="a")
        columns.index = range(len(columns))

        # 2) find decision boundaries where classification changes
        leftBound = np.NINF
        rightBound = None
        boundaries = []
        currentClass = columns["t"][0]
        for i in range(len(columns)):
            # when classification changes
            if(columns["t"][i] != currentClass):
                currentClass = columns["t"][i]
                
                # get the value in the middel
                beforeSwitch = columns["a"][i-1]
                afterSwitch = columns["a"][i]
                rightBound = (beforeSwitch + afterSwitch) / 2

                # safe the tupple of two boundaries with a uniform classification
                boundaries.append((leftBound, rightBound))
                leftBound = rightBound
        
        # last tupple that does not get triggerd by a switch of classification
        boundaries.append((leftBound, np.PINF))
        
        return columns, boundaries


aColumn = pd.Series([1, 12, 4, 5.5, 6, 10, 15])
tColumn = pd.Series(["a", "c", "b", "a", "a", "c", "c"])
columns, boundaries = getBoundaries(tColumn, aColumn)
print("column of classification sorted by attribute: \n", columns)
print("boundary tupples: ", boundaries)

def replaceContinuous(boundaries, aColumn):
        # replaces the continuous values of an attribute by the
        # tuples that represent an interval
        
        newAColumn = []
        for value in aColumn:
            for l, r in boundaries:
                if value > l and value < r:
                    newAColumn.append((l, r))
        return newAColumn
            
        '''for i in range(len(boundaries)):
            # get left and right boundary of interval
            l = boundaries[i][0]
            r = boundaries[i][1]
            print(f"{l}, {r}")
            # replac any value of column that fits the interval by the interval
            for value in newAColumn:
                
                    print(f"{value} is in interval {boundaries[i]}")
                    value = boundaries[i]
                    print("new value: ", value)
            print(newAColumn)
        return pd.Series(newAColumn)'''
print(aColumn)
print(replaceContinuous(boundaries, aColumn))

In [None]:
def setBoundaries(aColumn):
        
        maximum = np.max(aColumn)
        minimum = np.min(aColumn)
        stepsize = (maximum - minimum)/ 10
        boundaries = []
        
        leftBound = np.NINF
        rightBound = minimum + stepsize
        for i in range(9):
            boundaries.append((leftBound, rightBound))
            leftBound = rightBound
            rightBound = leftBound + stepsize
        boundaries.append((leftBound, np.PINF))
        
        return boundaries
setBoundaries([3,9])