# Function Definitions

In [1]:
def split(node, attribute, value):
    """returns two nodes based on a given split (attribute < value), input is node dict"""
    data = node["tree"]
    yes = data[data[attribute] < value]
    no = data[data[attribute] >= value]
    return yes,no

In [2]:
def getVals(df, attribute):
    """returns the DISTINCT values of a specified attribute from a dataset"""
    vals = []
    for val in list(df[attribute]):
        if val not in vals:
            vals += [val]
    vals.sort()
    return vals

In [3]:
def GINI(data, classifier):
    """returns the Gini of the given node, based on the Class"""
    classVals=getVals(data,classifier)
    cl=data[classifier]
    sqSum = 0
    
    if(len(classVals)<2): #recently added
        return 0
    
    print(classVals,"gini")
    for val in classVals:
        sqSum += (len(cl[cl==val])/len(data))**2
    GINI= 1-sqSum
    return GINI

In [4]:
def GINIsplit(node, attribute, value):
    """returns the Gini of the split if applied on the given attribute"""
    [yes,no]=split(node, attribute, value)
    if(len(yes)==0 or len(no)==0):            #recently added
        print("tell me", yes, no,"STOOOOOOOOOOOP HEREEEEE in GiniSplit")
    gini = (GINI(yes,node["classifier"])*len(yes)+GINI(no, node["classifier"])*len(no))/len(node["tree"])
    del yes
    del no
    return gini

In [5]:
def bestCandidate(tree):
    tests={}
    atts = tree['attributes']
    for att in atts:
        vals = getVals(tree["tree"],att)
        if (len(vals)>1):                 #added this recently
            tests[att]=vals
    print(tests,'tests to do...')
    
    
    ginis={}
    for test in tests:
        vals = tests[test][1:] #skip the first value since it is the infimum
        for val in vals:
            ginis[(test,val)]=GINIsplit(tree, test, val)     
    
    best=min(ginis, key=ginis.get)
    return best

In [6]:
def createNode(D, A, nodeType, level, classifier, minNum=5,alpha=0.5, default = 1):
    node = {}
    node["tree"] = D
    node["attributes"] = A
    node["type"] = nodeType
    node["level"] = level
    node["classifier"] = classifier
    node["min"]=minNum
    node["default"]=default
    node["gini"]=float(GINI(D,classifier))
    return node

In [7]:
def continueTreeOld(root):
    A = list(root["attributes"])
    if(len(root["tree"])<=root["min"] or root["gini"]==0 or root["type"]=="leaf"):
        root["type"]="Leaf"
        return
    if(len(root["attributes"])==1):
        root["feature"]= [attribute, value] = bestCandidate(root)
        A.remove(attribute)
        left,right = split(root, attribute, value)
        root["left"]  = createNode(left, A, "Leaf", root["level"]+1, root["classifier"])
        root["right"] = createNode(right, A, "Leaf", root["level"]+1, root["classifier"])
        
    else:
        root["feature"]= [attribute, value] = bestCandidate(root)
        A.remove(attribute)
        left,right = split(root, attribute, value)
        root["left"]  = createNode(left , A, "Intermediate", root["level"]+1, root["classifier"])
        root["right"] = createNode(right, A, "Intermediate", root["level"]+1, root["classifier"])
        continueTree(root["left"])
        continueTree(root["right"])

In [8]:
def continueTree(root):
    A = list(root["attributes"])
    if(len(root["tree"])<=root["min"] or root["gini"]==0 or root["type"]=="leaf"):
        root["type"]="Leaf"
        return
    print(root["tree"][root["classifier"]])
    #if(all(element == root["tree"][root["classifier"]][0] for element in root["tree"][root["classifier"]])):
    #    root["type"]="Leaf"
    #    return
    root["feature"]= [attribute, value] = bestCandidate(root) #don't call for best candidate if there are no new best candidates
    #A.remove(attribute)
    print("decision:",attribute)
    left,right = split(root, attribute, value)
    print('done the split')
    root["left"]  = createNode(left , A, "Intermediate", root["level"]+1, root["classifier"])
    print('got my left')
    root["right"] = createNode(right, A, "Intermediate", root["level"]+1, root["classifier"])
    print('got my right')
    continueTree(root["left"])
    continueTree(root["right"])

In [9]:
def BuildDecisionTree(path):
    df=pd.read_csv(path)
    df=df[:20]
    A=list(df.columns[:-1])
    classifier=df.columns[-1]
    D = df
    default = 0
    tree= createNode(D, A, "Root", 0, classifier)
    continueTree(tree)
    return tree

In [10]:
def printNode(node):
    nType = node["type"]
    nLevel= node["level"]
    nGini =node["gini"]
    print(nType)
    print("Level",nLevel)
    if nType in ("Root","Intermediate"):
        feature = node["feature"]
        print("Feature",feature[0],end=" ")
        for i in range(min(getVals(node["tree"],feature[0])),feature[1]):
            if (i != feature[1] - 1):
                print(i, end = " ")
            else:
                print(i, end = "")
        print()
    else:
        test = sum(node["tree"][node["classifier"]])/len(node["tree"]) > 0.5
        cl = 1 if test else 0
        if (len(node["tree"])<node["min"]): cl = node["default"]
        print("Class",cl)
    if(nGini!=0):
        print("Gini","{:.4}".format(nGini))
    else:
        print("Gini",0)

In [11]:
def getNextLevel(tree, result):
    if 'left' in tree.keys():
        result[tree['level']+1]= result[tree['level']+1] + [tree['left']]
        result[tree['level']+1]= result[tree['level']+1] + [tree['right']]
        getNextLevel(tree['left'],result)
        getNextLevel(tree['right'],result)

In [12]:
def getTreeOrdered(tree):
    nodesByLevel=[[]]*len(tree["tree"].columns)*5 #recently edited!!!
    nodesByLevel[0] = [tree]
    getNextLevel(tree, nodesByLevel)
    return nodesByLevel

In [13]:
def printDecisionTree(tree):
    printNode(tree)
    print()
    nodesByLevel= getTreeOrdered(tree)
    for level in nodesByLevel[1:]:
        i=0
        for node in level:
            i+=1
            printNode(node)
            if (i != len(level)):
                print('*********')
        print()

In [14]:
def generalizationError(tree, alpha):
    nodesByLevel= getTreeOrdered(tree)
    complexity=0
    mistakes=0
    for level in nodesByLevel[1:]:
        for node in level:
            if(node["type"]=="Leaf"):
                complexity+=1
                vals=node["tree"].T
                for row in node["tree"].T:
                    test = sum(node["tree"][node["classifier"]])/len(node["tree"]) > 0.5
                    cl = 1 if test else 0
                    if (len(node["tree"])<node["min"]): cl = node["default"]
                    if(vals[row][node["classifier"]]!=cl):
                        mistakes+=1
    error = mistakes + alpha * complexity
    print("Generalization Error =",error/len(tree["tree"]))
    return error, mistakes, complexity

In [15]:
def pruneTree(tree, minNum, alpha):
    prunedTree=dict(tree)
    
    for level in nodesByLevel[1:]:
        for node in level:
            if(node["type"]=="Leaf"):
                disappear()
    return prunedTree

# Main

In [16]:
import numpy as np
import pandas as pd

### TODO:  
###      - add class attribute once you know it's a Leaf... if nb(attributes)<minNum ==> default class (Survived)
###      - add class attribute once all data in df have the same Class c
###      - make sure if yu divide genError by len(df)
tree = BuildDecisionTree('data.csv')
printDecisionTree(tree)
error,mistakes,complexity = generalizationError(tree, alpha=0.5)

[0, 1] gini
0     0
1     1
2     1
3     1
4     0
5     0
6     0
7     0
8     1
9     1
10    1
11    1
12    0
13    0
14    0
15    1
16    0
17    1
18    0
19    1
Name: Survived, dtype: int64
{'Sex': [0, 1], 'Pclass': [1, 2, 3], 'Embarked': [0, 1, 2]} tests to do...
[0, 1] gini
[0, 1] gini
[0, 1] gini
[0, 1] gini
[0, 1] gini
[0, 1] gini
[0, 1] gini
[0, 1] gini
[0, 1] gini
decision: Sex
done the split
[0, 1] gini
got my left
[0, 1] gini
got my right
1     1
2     1
3     1
8     1
9     1
10    1
11    1
14    0
15    1
18    0
19    1
Name: Survived, dtype: int64
{'Pclass': [1, 2, 3], 'Embarked': [0, 2]} tests to do...
[0, 1] gini
[0, 1] gini
[0, 1] gini
decision: Pclass
done the split
got my left
[0, 1] gini
got my right
2     1
8     1
10    1
14    0
18    0
19    1
Name: Survived, dtype: int64
{'Embarked': [0, 2]} tests to do...
[0, 1] gini
decision: Embarked
done the split
got my left
[0, 1] gini
got my right
0     0
4     0
5     0
6     0
7     0
12    0
13    0
16    0

In [173]:
mistakes,complexity,len(tree["tree"])

(3, 5, 20)

In [174]:
tree
printDecisionTree(tree)

Root
Level 0
Feature Sex 0
Gini 0.5

Intermediate
Level 1
Feature Pclass 1 2
Gini 0.2975
*********
Intermediate
Level 1
Feature Pclass 1 2
Gini 0.1975

Leaf
Level 2
Class 1
Gini 0
*********
Intermediate
Level 2
Feature Embarked 0 1
Gini 0.4444
*********
Leaf
Level 2
Class 1
Gini 0.5
*********
Leaf
Level 2
Class 0
Gini 0

Leaf
Level 3
Class 1
Gini 0
*********
Leaf
Level 3
Class 1
Gini 0.48



# Tests

In [140]:
a={"hi":1}
b=dict(a)
b["he"]=1
a
b

{'hi': 1, 'he': 1}

In [138]:
nodesByLevel=[[0]]*len(tree["tree"].columns)
nodesByLevel[2] = nodesByLevel[2]+[1] 
nodesByLevel

for row in list(tree["tree"].T):
    tree["tree"][tree["classifier"]]
    print('1')

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [116]:
for r in tree["tree"].T:
    print(r)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


In [59]:
a=[[]]*5
a[3]=[2]
a[3]+=[1]
len(a[3])

2

In [12]:
tree

{'tree':     Sex  Pclass  Embarked  Survived
 0     1       3         2         0
 1     0       1         0         1
 2     0       3         2         1
 3     0       1         2         1
 4     1       3         2         0
 5     1       3         1         0
 6     1       1         2         0
 7     1       3         2         0
 8     0       3         2         1
 9     0       2         0         1
 10    0       3         2         1
 11    0       1         2         1
 12    1       3         2         0
 13    1       3         2         0
 14    0       3         2         0
 15    0       2         2         1
 16    1       3         1         0
 17    1       2         2         1
 18    0       3         2         0
 19    0       3         0         1,
 'attributes': ['Sex', 'Pclass', 'Embarked'],
 'type': 'Root',
 'level': 0,
 'classifier': 'Survived',
 'min': 5,
 'gini': 0.5,
 'feature': ('Sex', 1),
 'left': {'tree':     Sex  Pclass  Embarked  Survived
  1     

In [402]:
len(tree["tree"].columns)

4

In [403]:
bestCandidate(root)

('Sex', 1)

In [158]:
split(root,'Sex',1)
[yes,no]=split(tree, 'Sex', 1)
(GINI(yes,tree["classifier"])*len(yes)+GINI(no, tree["classifier"])*len(no))/len(root["tree"])

0.2525252525252525

In [84]:
tree['right']

{'tree':     Sex  Pclass  Embarked  Survived
 0     1       3         2         0
 2     0       3         2         1
 3     0       1         2         1
 4     1       3         2         0
 6     1       1         2         0
 7     1       3         2         0
 8     0       3         2         1
 10    0       3         2         1
 11    0       1         2         1
 12    1       3         2         0
 13    1       3         2         0
 14    0       3         2         0
 15    0       2         2         1
 17    1       2         2         1
 18    0       3         2         0,
 'attributes': ['Sex', 'Pclass'],
 'type': 'intermediate',
 'level': 1,
 'classifier': 'Survived',
 'others': [5, 0.5, 0],
 'gini': 0.49777777777777776}

In [85]:
tree['gini']

0.5

In [387]:
_ih[-500:]

['',
 'def split(node, attribute, value):\n    """returns two nodes based on a given split"""\n    data = node["tree"]\n    yes = data[data[attribute] < value]\n    no = data[data[attribute] >= value]\n    return yes,no',
 'def GINIsplit(node, attribute, value):\n    """returns the Gini of the split if applied on the given attribute"""\n    [yes,no]=split(node, attribute, value)\n    gini = (GINI(yes,node["classifier"])*len(yes)+GINI(no, node["classifier"])*len(no))/len(node)\n    del yes\n    del no\n    return gini',
 'def bestCandidate(tree):\n    tests={}\n    atts = tree[\'attributes\']\n    for att in atts:\n        vals = getVals(tree["tree"],att)\n        tests[att]=vals\n    \n    ginis={}\n    for test in tests:\n        vals = tests[test][1:] #skip the first value since it is the infimum\n        \n        for val in vals:\n            ginis[(test,val)]=GINIsplit(tree, test, val)\n    best=max(ginis, key=ginis.get)\n    return best',
 'def getVals(df, attribute):\n    """ret

# TESTS

In [69]:
def bestCandidate1(tree):
    tests={}
    atts = tree['attributes']
    for att in atts:
        vals = getVals(tree["tree"],att)
        tests[att]=vals
    print(tests)
    ginis={}
    for test in tests:
        vals = tests[test][1:] #skip the first value since it is the infimum
        
        for val in vals:
            ginis[(test,val)]=GINIsplit(tree, test, val)
    best=max(ginis, key=ginis.get)
    return best

In [62]:
b=a={"pip":[1,2]}
bb=aa = {"man":a, "men":[1,2]}
cc=list(bb)

In [64]:
a["pap"]=[1]
bb
cc

['man', 'men']

In [39]:
a = [1]
a.remove(1)
a is None
a

[]

In [37]:
l=[0,3,1]
l.remove(3)
l

[0, 1]

In [30]:
df = df.append({'A': i}, ignore_index=True)

In [89]:
classVals=getVals(df,classifier)
cl=df[classifier]
sqSum = 0
for val in classVals:
    sqSum += (len(cl[cl==val])/len(df))**2
GINI= 1-sqSum
GINI

1.0

In [110]:
GINI(node)

0.5

In [62]:
cl=df[classifier]
cl[cl==0]

0     0
4     0
5     0
6     0
7     0
12    0
13    0
14    0
16    0
18    0
Name: Survived, dtype: int64

In [66]:
len(cl[cl==val])

10

In [38]:
ginis={"ahh":2,"ohhh":4,"iiiih":1}
min(ginis, key=ginis.get)

'iiiih'

In [71]:
d=tree['tree']


In [125]:
transd=d.T

for row in transd:
    if transd[row]['Sex'] < 1:
        a += [transd[row]]
b= d[d['Pclass']<2]
bb = d[d['Pclass']>=2]
bb

Unnamed: 0,Sex,Pclass,Embarked,Survived
0,1,3,2,0
2,0,3,2,1
4,1,3,2,0
5,1,3,1,0
7,1,3,2,0
8,0,3,2,1
9,0,2,0,1
10,0,3,2,1
12,1,3,2,0
13,1,3,2,0


In [167]:
b.T.count().count()

4