In [1]:
import pandas as pd

In [2]:
#Decision trees also provide the foundation for more advanced ensemble methods such as
#bagging, random forests and gradient boosting

# Calculate the Gini index for a split dataset
def gini_index(groups, class_values):
    gini = 0.0
    for class_value in class_values:
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            proportion = [row[-1] for row in group].count(class_value) / float(size)
            gini += (proportion * (1.0 - proportion))
    return gini

In [3]:
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

In [4]:
# Select the best split point for a dataset
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            #print( ' X%d < %.3f Gini=%.3f ' % ((index+1), row[index], gini))
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return { ' index ' :b_index, ' value ' :b_value, ' groups ' :b_groups}

In [5]:
# Create a terminal node value
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

In [6]:
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    left, right = node[ ' groups ' ]
    del(node[ ' groups ' ])
    # check for a no split
    if not left or not right:
        node[ ' left ' ] = node[ ' right ' ] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node[ ' left ' ], node[ ' right ' ] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node[ ' left ' ] = to_terminal(left)
    else:
        node[ ' left ' ] = get_split(left)
        split(node[ ' left ' ], max_depth, min_size, depth+1)
    # process right child
    if len(right) <= min_size:
        node[ ' right ' ] = to_terminal(right)
    else:
        node[ ' right ' ] = get_split(right)
        split(node[ ' right ' ], max_depth, min_size, depth+1)

In [7]:
# Make a prediction with a decision tree
def predict(node, row):
    if row[node[ ' index ' ]] < node[ ' value ' ]:
        if isinstance(node[ ' left ' ], dict):
            return predict(node[ ' left ' ], row)
        else:
            return node[ ' left ' ]
    else:
        if isinstance(node[ ' right ' ], dict):
            return predict(node[ ' right ' ], row)
        else:
            return node[ ' right ' ]

In [8]:
# Build a decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(dataset)
    print(root)
    split(root, max_depth, min_size, 1)
    return root

In [9]:
# Print a decision tree
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print( ' %s[Var0%d == %s] ' % ((depth* ' ' , (node[ ' index ' ]+1), node[ ' value ' ])))
        print_tree(node[ ' left ' ], depth+1)
        print_tree(node[ ' right ' ], depth+1)
    else:
        print( ' %s[%s] ' % ((depth* ' ' , node)))

In [10]:
data = pd.read_csv('Sample Dataset.csv')
dataset= data.iloc[:, :4].values
print(dataset)
tree = build_tree(dataset,2, 1)
print_tree(tree)
#stump = { ' index ' : 2, ' right ' : 1, ' value ' : 'Round', ' left ' : 2}
#for row in dataset:
#    prediction = predict(stump, row)
#    print( ' Expected=%d, Got=%d ' % (row[-1], prediction))

[['White' 'Short' 'Round' 1]
 ['White' 'Short' 'Round' 1]
 ['Black' 'Short' 'Round' 1]
 ['White' 'Long' 'Round' 1]
 ['Black' 'Medium' 'Round' 1]
 ['White' 'Medium' 'Rectangular' 2]
 ['Pink' 'Long' 'Rectangular' 2]
 ['Pink' 'Short' 'Round' 2]
 ['White' 'Long' 'Rectangular' 2]
 ['White' 'Long' 'Rectangular' 2]]
{' index ': 2, ' value ': 'Round', ' groups ': ([array(['White', 'Medium', 'Rectangular', 2], dtype=object), array(['Pink', 'Long', 'Rectangular', 2], dtype=object), array(['White', 'Long', 'Rectangular', 2], dtype=object), array(['White', 'Long', 'Rectangular', 2], dtype=object)], [array(['White', 'Short', 'Round', 1], dtype=object), array(['White', 'Short', 'Round', 1], dtype=object), array(['Black', 'Short', 'Round', 1], dtype=object), array(['White', 'Long', 'Round', 1], dtype=object), array(['Black', 'Medium', 'Round', 1], dtype=object), array(['Pink', 'Short', 'Round', 2], dtype=object)])}
 [Var03 == Round] 
  [Var01 == White] 
   [2] 
   [2] 
  [Var01 == Black] 
   [1] 
   