In [1]:
import numpy as np
import pandas as pd
from math import log2

In [2]:
#Importing data
data = pd.read_csv("../data/agaricus-lepiota.csv")
data.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
#Separating our training and testing samples by a 30-70 proportion
trainingSample = data.sample(frac=0.70)
display(trainingSample)
trainingIndexes = trainingSample.index.values
testingSample = data.drop(trainingIndexes)
display(testingSample)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
7244,e,b,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,y,v,l
2136,e,x,y,g,t,n,f,c,b,w,...,s,g,w,p,w,o,p,n,v,d
3036,e,f,y,e,t,n,f,c,b,p,...,s,p,p,p,w,o,p,n,v,d
2761,e,x,f,g,t,n,f,c,b,u,...,s,p,w,p,w,o,p,k,y,d
6692,p,f,s,n,f,y,f,c,n,b,...,k,p,w,p,w,o,e,w,v,d
4565,p,f,f,y,f,f,f,c,b,h,...,k,n,n,p,w,o,l,h,v,g
272,e,x,y,w,t,l,f,c,b,k,...,s,w,w,p,w,o,p,n,s,g
5399,p,f,y,y,f,f,f,c,b,p,...,k,p,p,p,w,o,l,h,y,p
44,e,x,s,y,t,a,f,c,b,w,...,s,w,w,p,w,o,p,k,n,m
4128,p,f,y,g,f,f,f,c,b,p,...,k,b,p,p,w,o,l,h,y,p


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
5,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g
9,e,b,s,y,t,a,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m
10,e,x,y,y,t,l,f,c,b,g,...,s,w,w,p,w,o,p,n,n,g
13,p,x,y,w,t,p,f,c,n,k,...,s,w,w,p,w,o,p,n,v,u
14,e,x,f,n,f,n,f,w,b,n,...,f,w,w,p,w,o,e,k,a,g
18,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,n,s,u
23,e,b,y,w,t,a,f,c,b,w,...,s,w,w,p,w,o,p,n,n,m
24,e,b,s,w,t,l,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m
25,p,f,s,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,n,v,g


In [4]:
#Defining information-entropy function for a given data
def entropy(x):
    result = 0
    unique = list(set(x))
    for i in unique:
        p = x.count(i)/len(x)
        result += -(p)*log2(p)
    return result

In [5]:
#Defining information-fain function for a given attribute
def gain(classification, x):
    zipped = list(zip(classification, x))
    result = entropy(classification)
    unique = list(set(x))
    for i in unique:
        sublist = [y for y in zipped if y[1]==i]
        p = x.count(i)/len(x)
        result += -(p)*entropy(sublist)
    return result

In [6]:
#Defining auxiliar function
def splitInfo(classification, x):
    result = 0
    unique = list(set(x))
    for i in unique:
        p = x.count(i)/len(x)
        result += -(p)*log2(p)
    return result

In [7]:
#Defining information-gain ratio function for a given possible attribute
#This will be our paramater to see the best possible attribute for deciding the split-nodes
def gainRatio(classification, x):
    if (gain(classification, x) == 0) or (splitInfo(classification, x) == 0):
        return 0
    else:
        return gain(classification, x) / splitInfo(classification, x)

In [8]:
#Making our Tree data structure
class Tree:
    def __init__(self, df):
        self.dfFull = df
        self.results = list(df['class'])
        self.df = df.drop(columns=['class'])
        self.best = self.bestAttr()
        self.total = ""
        if self.best == "":
            self.nodes = {}
            self.total = str(list(set(list(self.dfFull['class'])))[0])
            return
        bestCol = list(df[self.best])
        bestColValues = list(set(bestCol))
        self.nodes = dict.fromkeys(bestColValues)
        self.buildNodes()
            
    def buildNodes(self):
         for i in self.nodes:
            qr = (str(self.best) + " == " + str(i))
            dfSplit = self.dfFull.loc[self.dfFull[self.best] == i]
            self.nodes[i] = Tree(dfSplit)
        
    def bestAttr(self):
        maxGain = 0
        whoIsMaxGain = ""
        for i in self.df.columns:
            gain = gainRatio(self.results, list(self.df[i]))
            if gain > maxGain:
                maxGain = gain
                whoIsMaxGain = i
        return whoIsMaxGain
    
    def getClassification(self, x):
        if self.best != "":
            return self.nodes[x[self.best]].getClassification(x)
        else:
            return self.total
    
    def __str__(self, level=0):
        ret = "\t"*level+(self.best)+"\n"
        for child in self.nodes:
            ret += "\t"*level+child + ": " + self.total + self.nodes[child].total + self.nodes[child].__str__(level+1)
        return ret

In [9]:
decisionTree = Tree(trainingSample)

print("FOR DEVELOPER TEST ONLY, FORMATTING IS NOT WORKING CORRECTLY")
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
print(decisionTree)
print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
print("FOR DEVELOPER TEST ONLY, FORMATTING IS NOT WORKING CORRECTLY")

FOR DEVELOPER TEST ONLY, FORMATTING IS NOT WORKING CORRECTLY
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
odor
f: p	
l: e	
y: p	
a: e	
m: p	
s: p	
c: p	
n: 	spore-print-color
	r: p		
	o: e		
	y: e		
	k: e		
	b: e		
	n: e		
	w: 		veil-color
		y: p			
		w: 			gill-size
			n: 				gill-spacing
				c: p					
				w: 					bruises?
					f: e						
					t: p						
			b: e				
	h: e		
p: p	

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
FOR DEVELOPER TEST ONLY, FORMATTING IS NOT WORKING CORRECTLY


In [10]:
#Making our predictions list with predicted classifications and the real ones
predictions = []
for i in testingSample.index.values:
    predictions.append(tuple([testingSample.loc[i]['class'], decisionTree.getClassification(testingSample.loc[i])]))
print(predictions)

[('p', 'p'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('p', 'p'), ('e', 'e'), ('p', 'p'), ('e', 'e'), ('e', 'e'), ('p', 'p'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('p', 'p'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('p', 'p'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('p', 'p'), ('e', 'e'), ('e', 'e'), ('p', 'p'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('p', 'p'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('p', 'p'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e', 'e'), ('e

In [11]:
#Auxiliar function to see if prediction is correct
def hit(x):
    if x[0] == x[1]:
        return True
    else:
        return False

In [12]:
#Checking our performance based on the predictions we got right
hits = list(filter(hit, predictions))

numPredictions = len(predictions)
numHits = len(hits)

performance = numHits/numPredictions

In [13]:
#Printing results :)
print("Correctly Classified Rows: " + str(performance*100) + "%")
print("Incorrectly Classified Rows: " + str(100-(performance*100)) + "%")

Correctly Classified Rows: 100.0%
Incorrectly Classified Rows: 0.0%
