In [1]:
import numpy as np
import numbers

In [2]:
#to find total no.of classes in the dataset
#dictionary will have class label and count of each label
def countClass(data):
    dictLabelCount = {}
    for row in data:
        label = row[-1]
        if label not in dictLabelCount:
            dictLabelCount[label] = 0
        dictLabelCount[label] += 1
    return dictLabelCount

In [3]:
#to compute the gini index for dataset 
def gini(data):
    counts = countClass(data)
    giniVal = 1
    for label in counts:
        prob = counts[label] / float(len(data))
        giniVal -= prob ** 2
    return giniVal

In [4]:
#dividing data based on attribute and attribute value
def partition(data, col, val):
    right = []
    left = []
    for row in data:
        if isinstance(val, numbers.Number):
            val = float(val)
            col= int(col)
            if row[col] >= val:
                right.append(row)
            else:
                left.append(row)
        else:
            if row[col] == val:
                right.append(row)
            else:
                left.append(row)
    return left, right

In [82]:
#computing gain using gini index
def bestGainSplit(data):
    bestGain , bestCol, bestVal = 0, 0, 0
    if len(data) == 0:
        return bestGain, bestCol, bestVal

    curGINI = gini(data)
    features = len(data[0]) - 1
        
    for col in range(features):
        values = set([row[col] for row in data])
        for val in values:
            leftClass, rightClass = partition(data, col, val)
            if isinstance(val, numbers.Number):
                val = float(val)
                col = int(col)
            if len(leftClass) == 0 or len(rightClass) == 0:
                continue
            total = (len(rightClass) + len(leftClass))
            probRight = float(len(rightClass) / total)
            probleft = float(len(leftClass) / total)
            #information gain calculation
            gain = curGINI - (probleft * gini(leftClass)) - (probRight * gini(rightClass))
                
            if gain >= bestGain:
                bestGain = gain
                #print("bestgain---",bestGain)
                bestCol = col
                #print("bestcol----",bestCol)
                bestVal = val
                #print("bestval----",bestVal)
                
    return bestGain, bestCol, bestVal

In [83]:
#how a prticular node looks like
class TreeNode(object):
    def __init__(self,col,val, left, right, rightClass, leftClass):
        self.col = col
        self.val = val
        self.left = left
        self.right = right
        self.rightClass=rightClass
        self.leftClass=leftClass

In [84]:
#determining class of node after a perfect split
def updateNode(data,col, val):
    rightClass=0
    leftClass=0
    for row in data:
        if int(row[-1])==1:
            rightClass +=1
        else:
            leftClass +=1
    return TreeNode(col, val, None, None,rightClass, leftClass)

In [85]:
#building decision tree
def buildDecisionTree(data):
    gain, col, val = bestGainSplit(data)
    if gain == 0:
        return updateNode(data,None,None)
    leftClass, rightClass = partition(data, col, val)
    if isinstance(val, numbers.Number):
        val = float(val)
        col = int(col)
    node = TreeNode(col,val, None, None,-1,-1)
    node.left = buildDecisionTree(leftClass)
    node.right = buildDecisionTree(rightClass)
    return node

In [86]:
def printTree(root, tab):
    tab+="      "
    if (root.left == None) and (root.right == None):
        if root.rightClass >= root.leftClass:
            print(tab + "   ->(Class 1)" )
        else:
            print(tab + "   ->(Class 0)" )
        return
    print(tab  + "SPLIT" + ": " + str(root.val))
    print( tab + '   ->left:')
    printTree(root.left, tab )
    print( tab + '   ->Right:')
    printTree(root.right, tab )

In [87]:
data=[]
input_file = "project3_dataset4.txt"
data = np.genfromtxt(input_file, dtype=None)
data = np.array(data)

print(data)

[(b'sunny', b'hot', b'high', b'weak', 0)
 (b'sunny', b'hot', b'high', b'strong', 0)
 (b'overcast', b'hot', b'high', b'weak', 1)
 (b'rain', b'mild', b'high', b'weak', 1)
 (b'rain', b'cool', b'normal', b'weak', 1)
 (b'rain', b'cool', b'normal', b'strong', 0)
 (b'overcast', b'cool', b'normal', b'strong', 1)
 (b'sunny', b'mild', b'high', b'weak', 0)
 (b'sunny', b'cool', b'normal', b'weak', 1)
 (b'rain', b'mild', b'normal', b'weak', 1)
 (b'sunny', b'mild', b'normal', b'strong', 1)
 (b'overcast', b'mild', b'high', b'strong', 1)
 (b'overcast', b'hot', b'normal', b'weak', 1)
 (b'rain', b'mild', b'high', b'strong', 0)]


  This is separate from the ipykernel package so we can avoid doing imports until


In [88]:
root = None
root = buildDecisionTree(data)
printTree(root, "")

      SPLIT: b'overcast'
         ->left:
            SPLIT: b'normal'
               ->left:
                  SPLIT: b'rain'
                     ->left:
                           ->(Class 0)
                     ->Right:
                        SPLIT: b'strong'
                           ->left:
                                 ->(Class 1)
                           ->Right:
                                 ->(Class 0)
               ->Right:
                  SPLIT: b'strong'
                     ->left:
                           ->(Class 1)
                     ->Right:
                        SPLIT: b'mild'
                           ->left:
                                 ->(Class 0)
                           ->Right:
                                 ->(Class 1)
         ->Right:
               ->(Class 1)
