# Function Definitions

In [175]:
def getVals(df, attribute):
    """returns the DISTINCT values of a specified attribute from a dataset"""
    vals = []
    for val in list(df[attribute]):
        if val not in vals:
            vals += [val]
    vals.sort()
    return vals

In [176]:
def GINI(data, classifier):
    """returns the Gini of the given node, based on the Class"""
    classVals=getVals(data,classifier)
    cl=data[classifier]
    sqSum = 0
    for val in classVals:
        sqSum += (len(cl[cl==val])/len(data))**2
    GINI= 1-sqSum
    return GINI

In [172]:
def split(node, attribute, value):
    """returns two nodes based on a given split (attribute < value), input is node dict"""
    data = node["tree"]
    yes = data[data[attribute] < value]
    no = data[data[attribute] >= value]
    return yes,no

In [173]:
def GINIsplit(node, attribute, value):
    """returns the Gini of the split if applied on the given attribute"""
    [yes,no]=split(node, attribute, value)
    gini = (GINI(yes,node["classifier"])*len(yes)+GINI(no, node["classifier"])*len(no))/len(node["tree"])
    del yes
    del no
    return gini

In [174]:
def bestCandidate(tree):
    tests={}
    atts = tree['attributes']
    for att in atts:
        vals = getVals(tree["tree"],att)
        tests[att]=vals
    
    ginis={}
    for test in tests:
        vals = tests[test][1:] #skip the first value since it is the infimum
        
        for val in vals:
            ginis[(test,val)]=GINIsplit(tree, test, val)
    best=min(ginis, key=ginis.get)
    return best

In [192]:
def createNode(D, A, nodeType, level, classifier, minNum=5,alpha=0.5, default = 0):
    node = {}
    node["tree"] = D
    node["attributes"] = A
    node["type"] = nodeType
    node["level"] = level
    node["classifier"] = classifier
    node["min"]=minNum
    node["gini"]=float(GINI(D,classifier))
    return node

In [224]:
def continueTree(root):
    A = list(root["attributes"])
    if(len(root["tree"])<=root["min"] or root["gini"]==0 or root["type"]=="leaf"):
        root["type"]="leaf"
        return
    if(len(root["attributes"])==1):
        decision= [attribute, value] = bestCandidate(root)
        A.remove(attribute)
        left,right = split(root, attribute, value)
        root["left"]  = createNode(left, A, "leaf", root["level"]+1, root["classifier"])
        root["right"] = createNode(right, A, "leaf", root["level"]+1, root["classifier"])
        
    else:
        decision= [attribute, value] = bestCandidate(root)
        A.remove(attribute)
        left,right = split(root, attribute, value)
        root["left"]  = createNode(left , A, "intermediate", root["level"]+1, root["classifier"])
        root["right"] = createNode(right, A, "intermediate", root["level"]+1, root["classifier"])
        continueTree(root["left"])
        continueTree(root["right"])

In [225]:
def BuildDecisionTree(path):
    df=pd.read_csv(path)
    df=df[:20]
    A=list(df.columns[:-1])
    classifier=df.columns[-1]
    D = df
    default = 0
    tree= createNode(D, A, "root", 0, classifier)
    continueTree(tree)
    return tree

In [None]:
def printDecisionTree(tree):
    

# Main

In [226]:
import numpy as np
import pandas as pd

tree = BuildDecisionTree('data.csv')
printDecisionTree(tree)

In [227]:
tree

{'tree':     Sex  Pclass  Embarked  Survived
 0     1       3         2         0
 1     0       1         0         1
 2     0       3         2         1
 3     0       1         2         1
 4     1       3         2         0
 5     1       3         1         0
 6     1       1         2         0
 7     1       3         2         0
 8     0       3         2         1
 9     0       2         0         1
 10    0       3         2         1
 11    0       1         2         1
 12    1       3         2         0
 13    1       3         2         0
 14    0       3         2         0
 15    0       2         2         1
 16    1       3         1         0
 17    1       2         2         1
 18    0       3         2         0
 19    0       3         0         1,
 'attributes': ['Sex', 'Pclass', 'Embarked'],
 'type': 'root',
 'level': 0,
 'classifier': 'Survived',
 'min': 5,
 'gini': 0.5,
 'left': {'tree':     Sex  Pclass  Embarked  Survived
  1     0       1         0     

In [205]:
bestCandidate(root)

('Sex', 1)

In [158]:
split(root,'Sex',1)
[yes,no]=split(tree, 'Sex', 1)
(GINI(yes,tree["classifier"])*len(yes)+GINI(no, tree["classifier"])*len(no))/len(root["tree"])

0.2525252525252525

In [84]:
tree['right']

{'tree':     Sex  Pclass  Embarked  Survived
 0     1       3         2         0
 2     0       3         2         1
 3     0       1         2         1
 4     1       3         2         0
 6     1       1         2         0
 7     1       3         2         0
 8     0       3         2         1
 10    0       3         2         1
 11    0       1         2         1
 12    1       3         2         0
 13    1       3         2         0
 14    0       3         2         0
 15    0       2         2         1
 17    1       2         2         1
 18    0       3         2         0,
 'attributes': ['Sex', 'Pclass'],
 'type': 'intermediate',
 'level': 1,
 'classifier': 'Survived',
 'others': [5, 0.5, 0],
 'gini': 0.49777777777777776}

In [85]:
tree['gini']

0.5

# TESTS

In [69]:
def bestCandidate1(tree):
    tests={}
    atts = tree['attributes']
    for att in atts:
        vals = getVals(tree["tree"],att)
        tests[att]=vals
    print(tests)
    ginis={}
    for test in tests:
        vals = tests[test][1:] #skip the first value since it is the infimum
        
        for val in vals:
            ginis[(test,val)]=GINIsplit(tree, test, val)
    best=max(ginis, key=ginis.get)
    return best

In [62]:
b=a={"pip":[1,2]}
bb=aa = {"man":a, "men":[1,2]}
cc=list(bb)

In [64]:
a["pap"]=[1]
bb
cc

['man', 'men']

In [39]:
a = [1]
a.remove(1)
a is None
a

[]

In [37]:
l=[0,3,1]
l.remove(3)
l

[0, 1]

In [30]:
df = df.append({'A': i}, ignore_index=True)

In [89]:
classVals=getVals(df,classifier)
cl=df[classifier]
sqSum = 0
for val in classVals:
    sqSum += (len(cl[cl==val])/len(df))**2
GINI= 1-sqSum
GINI

1.0

In [110]:
GINI(node)

0.5

In [62]:
cl=df[classifier]
cl[cl==0]

0     0
4     0
5     0
6     0
7     0
12    0
13    0
14    0
16    0
18    0
Name: Survived, dtype: int64

In [66]:
len(cl[cl==val])

10

In [38]:
ginis={"ahh":2,"ohhh":4,"iiiih":1}
min(ginis, key=ginis.get)

'iiiih'

In [71]:
d=tree['tree']


In [125]:
transd=d.T

for row in transd:
    if transd[row]['Sex'] < 1:
        a += [transd[row]]
b= d[d['Pclass']<2]
bb = d[d['Pclass']>=2]
bb

Unnamed: 0,Sex,Pclass,Embarked,Survived
0,1,3,2,0
2,0,3,2,1
4,1,3,2,0
5,1,3,1,0
7,1,3,2,0
8,0,3,2,1
9,0,2,0,1
10,0,3,2,1
12,1,3,2,0
13,1,3,2,0


In [167]:
b.T.count().count()

4