# Function Definitions

In [128]:
def split(node, attribute, value):
    """returns two nodes based on a given split"""
    data = node["tree"]
    yes = data[data[attribute] < value]
    no = data[data[attribute] >= value]
    return yes,no

In [181]:
def GINIsplit(node, attribute, value):
    """returns the Gini of the split if applied on the given attribute"""
    [yes,no]=split(node, attribute, value)
    gini = (GINI(yes,node["classifier"])*len(yes)+GINI(no, node["classifier"])*len(no))/len(node)
    del yes
    del no
    return gini

In [216]:
def bestCandidate(tree):
    tests={}
    atts = tree['attributes']
    for att in atts:
        vals = getVals(tree["tree"],att)
        tests[att]=vals
    
    ginis={}
    for test in tests:
        vals = tests[test][1:] #skip the first value since it is the infimum
        
        for val in vals:
            ginis[(test,val)]=GINIsplit(tree, test, val)
    best=max(ginis, key=ginis.get)
    return best

In [217]:
def getVals(df, attribute):
    """returns the DISTINCT values of a specified attribute from a dataset"""
    vals = []
    for val in list(df[attribute]):
        if val not in vals:
            vals += [val]
    vals.sort()
    return vals

In [218]:
def GINI(data, classifier):
    """returns the Gini of the given node, based on the Class"""
    classVals=getVals(data,classifier)
    cl=data[classifier]
    sqSum = 0
    for val in classVals:
        sqSum += (len(cl[cl==val])/len(df))**2
    GINI= 1-sqSum
    return GINI

In [219]:
def createNode(D, A, nodeType, level, classifier, parent):
    node = {}
    node["type"] = nodeType
    node["level"] = level
    node["tree"] = D
    node["attributes"] = A
    node["classifier"] = classifier
    
    gini=float(GINI(D,classifier))
    node["gini"]=gini

In [220]:
def BuildDecisionTree(D, A, minNum, alpha,d, classifier):
    node = {}
    node["type"]= "root"
    node["level"]=0
    node["tree"]=D
    node["attributes"]=A
    node["classifier"] = classifier
    gini=float(GINI(D,classifier))
    node["gini"]=gini
    return node

# Main

In [232]:
import numpy as np
import pandas as pd

df=pd.read_csv('data.csv')
A=list(df.columns[:-1])
classifier=df.columns[-1]
D = df
minNum = 5
alpha = 0.5
default = 0

tree= BuildDecisionTree(D, A, minNum, alpha,default, classifier)
decision= [attribute, value] = bestCandidate(tree)
left,right = split(tree, attribute, value)


('Pclass', 3)

# TESTS

In [171]:
l=[0,3,1]
l.sort()
l


[0, 1, 3]

In [30]:
df = df.append({'A': i}, ignore_index=True)

In [89]:
classVals=getVals(df,classifier)
cl=df[classifier]
sqSum = 0
for val in classVals:
    sqSum += (len(cl[cl==val])/len(df))**2
GINI= 1-sqSum
GINI

1.0

In [110]:
GINI(node)

0.5

In [62]:
cl=df[classifier]
cl[cl==0]

0     0
4     0
5     0
6     0
7     0
12    0
13    0
14    0
16    0
18    0
Name: Survived, dtype: int64

In [66]:
len(cl[cl==val])

10

In [38]:
ginis={"ahh":2,"ohhh":4,"iiiih":1}
min(ginis, key=ginis.get)

'iiiih'

In [71]:
d=tree['tree']


In [125]:
transd=d.T

for row in transd:
    if transd[row]['Sex'] < 1:
        a += [transd[row]]
b= d[d['Pclass']<2]
bb = d[d['Pclass']>=2]
bb

Unnamed: 0,Sex,Pclass,Embarked,Survived
0,1,3,2,0
2,0,3,2,1
4,1,3,2,0
5,1,3,1,0
7,1,3,2,0
8,0,3,2,1
9,0,2,0,1
10,0,3,2,1
12,1,3,2,0
13,1,3,2,0


In [167]:
b.T.count().count()

4