In [1]:
import numpy as np
import pandas as pd

### Node Class

Tree structure is build by Node Objects. Each node object has an attribute and a classification when trained, as well as a parent node and the value associated with the parents attribute and a classification.
The nodes are trained by tran_data and also offer the printTree function.

In [2]:
class Node:
    '''
    class Node innitialises a tree structure for a non-binary tree
    it has the typical setter and getter methods and a method to remove a 
    child from the list of children
    '''
    def __init__(self, parent = None, attribute = None, classification = None, value = None, target = None):
        self.children = []  
        self.parent = parent
        self.attribute = attribute
        self.classification = classification
        self.value = value
        self.target = target
        

    def setChild(self, node):
        self.children.append(node)
        
    def setParent(self, node):
        if (self.parent is not None):
            self.parent.children.remove(self)
        self.parent = node

    def getChildren(self):
        return self.children
    
    def getParent(self):
        return self.parent
    
    def deleteChild(self, node):
        if (node in self.children):
            node.parent = None
            self.children.remove(node)
        else:
            raise TypeError("Child not in Children")
        
    def isLeaf(self):
        if len(self.children) > 0:
            return False
        else:
            return True

    def isRoot(self):
        if (self.parent is None):
            return True
        else:
            return False
    
    def setAttribute(self, attribute):
        self.attribute = attribute
    
    def getAttribute(self):
        return self.attribute
    
    def setClassification(self, classification):
        self.classification = classification
    
    def getClassification(self):
        return self.classification
    
    def setValue(self,value):
        self.value = value

    def getValue(self):
        return self.value
    

    def printTree(self, level = 0):
        tab = "    "
        if self.isLeaf():
            print(tab*level, "classification: ", self.target, " = " , self.classification)
        else:
            for child in self.children:
                print(tab*level, self.attribute, ": ", child.value)
                child.printTree(level + 1)

### train_Data Class

The class represents a data (sub)set that is used to train the (sub)tree. It has a (root) node which it is responsible to train (choose attribute, classification, and child nodes) via the id3 function.
I can also deal with continuous variables by ...

In [3]:
class train_data:
    '''
    class train_data has all important functions for ID3 Algorithm:
    it can calculate the entropy of some data, the information gain, choose an attriute.
    within the ID3 algorithm a tree will be trained.
    the function retrain(data) retrains a trained tree with new data.
    '''
    def __init__(self, data, target, attributes, node:Node = None, recursion_depth = None, continuous_splitting = 0.1,  max_recursion = 10):
        
        self.data = data
        if not isinstance(self.data, pd.DataFrame):
            raise TypeError("Data has to be a Pandas Dataframe")
        
        self.target = target
        if not isinstance(self.target, str):
            raise TypeError("Taget has to be of type string")
        
        self.attributes = attributes
        if not isinstance(attributes, list):
            raise TypeError("Attributes have to have structure list")
            
        for attribute in self.attributes:
            if not isinstance(attribute, str):
                raise TypeError("Attributes have to be of type string")

        self.node = node
        self.continuous_splitting = continuous_splitting
        self.recursion_depth = recursion_depth
        self.max_recursion = max_recursion
    
    def is_continuous(self, values):
        # checks is variable is a continuous variable
        if len(values) > 10:
            for value in values:
                if value is not int or float:
                    return False
            return True
        else:
            return False

    def attribute_continuous(self, attributeColumn, values):
        
        # sort the set
        values = list(sorted(values))
        split_lenght = int(len(values) * self.continuous_splitting)
        maxGain = 0
        maxValue = 0
        for split in range(split_lenght):
            value = values[split]

            subsetData1 = self.data[attributeColumn <= value]
            subsetData2 = self.data[attributeColumn > value]
            subset1 = train_data(subsetData1, self.target, self.attributes)
            subset2 = train_data(subsetData2, self.target, self.attributes)

            gainSum =  (subsetData1.shape[0] / self.data.shape[0]) * subset1.entropy() + (subsetData2.shape[0] / self.data.shape[0]) * subset2.entropy()
            infoGain = self.data.entropy() - gainSum

            if infoGain > maxGain:
                maxGain = infoGain
                maxValue = value
        
        return [maxGain, maxValue]

    def entropy(self):
        targetColumn = self.data.loc[:, self.target]

        values = set(targetColumn)
        entropySum = 0
        for value in values:
            p = list(targetColumn).count(value) / len(targetColumn)
            entropySum = entropySum + (- p * np.log(p))

        return entropySum
    

    def informationGain(self, attributeColumn, values):
        gainSum = 0
        
        for value in values:
            
            subsetData = self.data[attributeColumn == value]
            subset = train_data(subsetData, self.target, self.attributes)
            # claculate entropy and normalize by size of subsets
            gainSum = gainSum + (subsetData.shape[0] / self.data.shape[0]) * subset.entropy()

        # substract summed and weighted entropy of subsets from entropy of whole set    
        infoGain = self.entropy() - gainSum

        return infoGain


    def chooseAttribute(self):
        maxGain= 0
        maxAttribute = ""

        # calculate Information Gain for each attribute
        for attribute in self.attributes:
            attributeColumn = self.data.loc[:, attribute]
            values = set(attributeColumn)
            gain = 0

            # calculate Information gain for this attribute
            if self.is_continuous(values):
                gain = self.attribute_continuous(attributeColumn, values)[0] # calculates the split information ???
            else:
                gain = self.informationGain(attributeColumn, values)

            # store attribute with highest information gain
            if gain >= maxGain:
                maxGain = gain
                maxAttribute = attribute

        # choose attribute with highest Information Gain
        return maxAttribute

    def classify(self):
        # returns the most commen classification of the dataset
        
        targetColumn = self.data.loc[:, self.target]
        values = set(targetColumn)
        maxClass = 0  # highest number of values
        classification = "" # classification of most common value
        for value in values:
            # check if calssification value is more common then other classification values
            if list(targetColumn).count(value) > maxClass:
                maxClass = list(targetColumn).count(value)
                classification = value

        return classification

    def id3(self):
        # base cases:
        # 1) all instances have same target value -> leaf node with target value
        if (self.data[self.target].nunique() == 1):
            self.node.setClassification(self.data[self.target].iloc[0])
            print("basecase1")
            return 
        # 2) out of discriptive features -> leaf node with majority of target values
        if (not self.attributes):
            self.node.setClassification(self.classify())
            print("basecase2")
            return
        # 3) no instances left in dataset -> take majority of parent node
        if (self.data is None):
            parent = self.node.getParent()
            self.node.setClassification(parent.getClassification())
            print("basecase3")
            return
        # 4) maximal recursion depth:
        if self.recursion_depth == self.max_recursion:
            self.node.setClassification(self.classify())
            print("basecase4")
            return


        # recursive case:
        # choose attribute with highest explainatory power
        print("in recursion")
        print("attributs: ", self.attributes)
        attribute = self.chooseAttribute()
        self.node.setAttribute(attribute)
        self.node.setClassification(self.classify())

        # split data according to attribute
        attributeColumn = self.data.loc[:, attribute]
        values = set(attributeColumn)
        new_attributes = self.attributes
        new_attributes.remove(attribute)
        recursion_depth = self.recursion_depth + 1

        # chosen attribute is a continuous variable:
        if self.is_continuous(values):
            print("continuous")
            value = self.attribute_continuous(attributeColumn, values)[1]

            subsetData1 = self.data[attributeColumn <= value]
            subsetData2 = self.data[attributeColumn > value]
            childNode1 = Node(parent=self.node, value=f"<= {value}")
            childNode2 = Node(parent=self.node, value=f"> {value}")
            self.node.setChild(childNode1)
            self.node.setChild(childNode2)
            
            subset1 = train_data(data=subsetData1, target=self.target, attributes=new_attributes, node=childNode1, recursion_depth=recursion_depth)
            subset2 = train_data(data=subsetData2, target=self.target, attributes=new_attributes, node=childNode2, recursion_depth=recursion_depth)
            # recursive call on all partitions
            subset1.id3()
            subset2.id3()
        
        # chosen attribute is a categorical variable:
        else:
            print("not continuous")
            for value in values:
                subsetData = self.data[attributeColumn == value]
                childNode = Node(parent=self.node, value=value, target=self.target)
                self.node.setChild(childNode)
                subset = train_data(data=subsetData, target=self.target, attributes=new_attributes, node=childNode, recursion_depth=recursion_depth)
    
                # recursive call on all partitions
                subset.id3()

    def retrain(self, data):
        self.data = data
        self.id3()
    

### test_data Class

to classify datapoints and test accuracy of tree

In [10]:
class test_data:
    
    def __init__(self, testData, target, node:Node):
        self.testData = testData
        self.target = target
        self.node = node

        # check whether node is trained:
        if node.getAttribute() is None:
            raise TypeError("node has to be part of a trained Decisiontree")


    def calcError(self, datapoint):

        # compare leaf node classification and datapoint classification (basecase)
        if self.node.isLeaf() == False:
            return self.node.getClassification() == datapoint[self.target]

        # traverse down the tree with the decision nodes (recursive case)
        else:
            attribute = self.node.getAttribute()
            dataValue = datapoint[attribute]
            for child in self.node.children:
                if child.getValue() is dataValue:
                    return test_data(datapoint, self.target, child).calcError()
        
        # if there are no children with the right value at decision node, use current classification (base case)
        return self.node.getClassification() == self.target

    
    def calcClassification(self, datapoint):
        
        # get leaf node classification (basecase)
        if self.node.isLeaf() == False:
            return self.node.getClassification()

        # traverse down the tree with the decision nodes (recursive case)
        else:
            attribute = self.node.getAttribute()
            dataValue = datapoint[attribute]
            for child in self.node.children:
                if child.getValue() is dataValue:
                    return test_data(datapoint, self.target, child).calcClassification()
        
        # if there are no children with the right value at decision node, get current classification (base case)
        return self.node.getClassification()
        
    def classify(self):
        classificationArray = []
        for i in range(self.testData.shape[0]):
            datapoint = self.testData.loc[i]
            classificationArray.append(self.calcClassification(datapoint))
        
        return classificationArray
    
    def accuracy(self):
        errorArray = []
        for i in range(self.testData.shape[0]):
            datapoint = self.testData.loc[i]
            errorArray.append(self.calcError(datapoint))
        
        return np.mean(errorArray)



### Main

In [11]:
d= {"gender": ["f", "f", "f", "f", "f", "m", "m", "m", "m", "m"],
                   "vegan": [True, True, True, False, False, True, False, False, False, False],
                   "coxi": [True, True, True, False, True, True, True, False, False, False],
                   "green": [True, True, True, False, False, True, False, True, False, True]}

data = pd.DataFrame(data = d)

target = "vegan"
attributes = list(data.columns)
attributes.remove(target)

rootNode = Node()

decisionTree = train_data(data=data, target=target, attributes=attributes, node=rootNode, recursion_depth=5)
decisionTree.id3()

rootNode.printTree()

in recursion
attributs:  ['gender', 'coxi', 'green']
not continuous
basecase1
in recursion
attributs:  ['gender', 'coxi']
not continuous
basecase1
basecase1
 green :  False
     classification:  vegan  =  False
 green :  True
     coxi :  False
         classification:  vegan  =  False
     coxi :  True
         classification:  vegan  =  True


In [12]:
datapoints = {"gender": ["f", "m", "m"],
                "vegan": [True, False, False],
                "coxi": [False, False, False],
                "green": [False, False, True]}

datapoints = pd.DataFrame(data = datapoints)

print("accuracy: ", test_data(datapoints, "vegan", rootNode).accuracy() )
print("classifications: ", test_data(datapoints, "vegan", rootNode).classify())

accuracy:  0.6666666666666666
classifications:  [False, False, False]


In [13]:
# 1. load data
data = pd.read_csv("data/pokemon_no_duplicates.csv")

# 2. prepare data
#data = prepare_data(data)

# choose the target value
data = data.drop(columns=["Name", "#"])
target = "Generation"
attributes = list(data.columns)
attributes.remove(target)
# 3. split_data with id3
rootNode = Node()
decisionTree = train_data(data=data, target=target, attributes=attributes, node=rootNode, recursion_depth=0)
decisionTree.id3()

rootNode.printTree()

in recursion
attributs:  ['Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Legendary']
not continuous
in recursion
attributs:  ['Type 1', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Legendary']
not continuous
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
in recursion
attributs:  ['Type 1', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Legendary']
not continuous
basecase1
basecase1
basecase1
in recursion
attributs:  ['Type 1', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Legendary']
not continuous
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
basecase1
in recursion
attributs:  ['Type 1', 'Total', 'HP', 'Defense', 'Sp. Atk', 'Legendary']
not continuous
basecase1
basecase1
basecase1
basecase1
basecase1
in recursion
attributs:  ['Type 1', 'Total',

In [18]:

datapoints = data.loc(5)

print("accuracy: ", test_data(datapoints, "Generation", rootNode).accuracy() )
print("classifications: ", test_data(datapoints, "Generation", rootNode).classify())

ValueError: No axis named 5 for object type DataFrame