Code to accompany Machine Learning Recipes #8. We'll write a Decision Tree Classifier, in pure Python. Below each of the methods, I've written a little demo to help explain what it does.

In [1]:
# For Python 2 / 3 compatability
from __future__ import print_function
import pandas as pd

In [2]:
# Toy dataset.
# Format: each row is an example.
# The last column is the label.
# The first two columns are features.
# Feel free to play with it by adding more features & examples.
# Interesting note: I've written this so the 2nd and 5th examples
# have the same features, but different labels - so we can see how the
# tree handles this case.

training_data = pd.DataFrame([
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
])
data = pd.read_csv('diabetes.csv')
data = data.loc[:50,:]
data.shape

(51, 9)

In [3]:
# Column labels.
# These are used only to print the tree.
columnNames = list(data)
header = columnNames

In [4]:
def unique_vals(rows, col):
    """Find the unique values for a column in a dataset."""
    
    return set([rows.loc[row,col] for row in rows])

In [5]:
#######
# Demo:
print(training_data)
unique_vals(training_data, 0)

# unique_vals(training_data, 1)
#######

        0  1      2
0   Green  3  Apple
1  Yellow  3  Apple
2     Red  1  Grape
3     Red  1  Grape
4  Yellow  3  Lemon


{'Green', 'Red', 'Yellow'}

In [44]:
def class_counts(rows):
    if isinstance(rows,list):
        rows = pd.DataFrame(rows)
    """Counts the number of each type of example in a dataset."""
    counts = {}  # a dictionary of label -> count.
    for r in range(rows.shape[0]):
        row = rows.iloc[r].tolist()
        # in our dataset format, the label is always the last column
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

In [7]:
#######
# Demo:
class_counts(training_data)
#######

{'Apple': 2, 'Grape': 2, 'Lemon': 1}

In [8]:
def is_numeric(value):
    """Test if a value is numeric."""
    return isinstance(value, int) or isinstance(value, float)

In [9]:
#######
# Demo:
is_numeric(7)
# is_numeric("Red")
#######

True

In [33]:
class Question:
    """A Question is used to partition a dataset.

    This class just records a 'column number' (e.g., 0 for Color) and a
    'column value' (e.g., Green). The 'match' method is used to compare
    the feature value in an example to the feature value stored in the
    question. See the demo below.
    """

    def __init__(self, column, value):
        #value is what is being tested against
        self.column = column
        self.value = value

    def match(self, example):
        #[AS] example should be a row
        
        # Compare the feature value in an example to the
        # feature value in this question.
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        # This is just a helper method to print
        # the question in a readable format.
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

In [11]:
#######
# Demo:
# Let's write a question for a numeric attribute
Question(1, 3)

Is Glucose >= 3?

In [12]:
# How about one for a categorical attribute
q = Question(0, 'Green')
q

Is Pregnancies == Green?

In [13]:
# Let's pick an example from the training set...
example = training_data[0]
# ... and see if it matches the question
q.match(example) # this will be true, since the first example is Green.

#asking if pregnancies row 0 is greater than or equal to 5
r= Question(0,5)
example2 = data.iloc[0]
print(data.iloc[0][0])
r.match(example2)
#######

6.0


True

In [42]:
def partition(rows, question):
    if isinstance(rows,list):
        rows = pd.DataFrame(rows)
    """Partitions a dataset.

    For each row in the dataset, check if it matches the question. If
    so, add it to 'true rows', otherwise, add it to 'false rows'.
    """
    true_rows, false_rows = [], []
    for r in range(rows.shape[0]):
        row = rows.iloc[r].tolist()
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows



In [15]:
#######
# Demo:
# Let's partition the training data based on whether rows are Red.
#true_rows, false_rows = partition(training_data, Question(0, 'Red'))
# This will contain all the 'Red' rows.
#true_rows

##[AS]
#print(data)
true_rows, false_rows = partition(data,Question(0,5))
true_rows

[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0],
 [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0],
 [5.0, 116.0, 74.0, 0.0, 0.0, 25.6, 0.201, 30.0, 0.0],
 [10.0, 115.0, 0.0, 0.0, 0.0, 35.3, 0.134, 29.0, 0.0],
 [8.0, 125.0, 96.0, 0.0, 0.0, 0.0, 0.23199999999999998, 54.0, 1.0],
 [10.0, 168.0, 74.0, 0.0, 0.0, 38.0, 0.537, 34.0, 1.0],
 [10.0, 139.0, 80.0, 0.0, 0.0, 27.1, 1.4409999999999998, 57.0, 0.0],
 [5.0, 166.0, 72.0, 19.0, 175.0, 25.8, 0.5870000000000001, 51.0, 1.0],
 [7.0, 100.0, 0.0, 0.0, 0.0, 30.0, 0.484, 32.0, 1.0],
 [7.0, 107.0, 74.0, 0.0, 0.0, 29.6, 0.254, 31.0, 1.0],
 [8.0, 99.0, 84.0, 0.0, 0.0, 35.4, 0.38799999999999996, 50.0, 0.0],
 [7.0, 196.0, 90.0, 0.0, 0.0, 39.8, 0.451, 41.0, 1.0],
 [9.0, 119.0, 80.0, 35.0, 0.0, 29.0, 0.263, 29.0, 1.0],
 [11.0, 143.0, 94.0, 33.0, 146.0, 36.6, 0.254, 51.0, 1.0],
 [10.0, 125.0, 70.0, 26.0, 115.0, 31.1, 0.205, 41.0, 1.0],
 [7.0, 147.0, 76.0, 0.0, 0.0, 39.4, 0.257, 43.0, 1.0],
 [13.0, 145.0, 82.0, 19.0, 110.0, 22.2, 0.245, 57.0, 0

In [16]:
# This will contain everything else.
false_rows
#######

[[1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.35100000000000003, 31.0, 0.0],
 [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.16699999999999998, 21.0, 0.0],
 [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.2880000000000003, 33.0, 1.0],
 [3.0, 78.0, 50.0, 32.0, 88.0, 31.0, 0.248, 26.0, 1.0],
 [2.0, 197.0, 70.0, 45.0, 543.0, 30.5, 0.158, 53.0, 1.0],
 [4.0, 110.0, 92.0, 0.0, 0.0, 37.6, 0.191, 30.0, 0.0],
 [1.0, 189.0, 60.0, 23.0, 846.0, 30.1, 0.39799999999999996, 59.0, 1.0],
 [0.0, 118.0, 84.0, 47.0, 230.0, 45.8, 0.551, 31.0, 1.0],
 [1.0, 103.0, 30.0, 38.0, 83.0, 43.3, 0.183, 33.0, 0.0],
 [1.0, 115.0, 70.0, 30.0, 96.0, 34.6, 0.529, 32.0, 1.0],
 [3.0, 126.0, 88.0, 41.0, 235.0, 39.3, 0.7040000000000001, 27.0, 0.0],
 [1.0, 97.0, 66.0, 15.0, 140.0, 23.2, 0.48700000000000004, 22.0, 0.0],
 [3.0, 158.0, 76.0, 36.0, 245.0, 31.6, 0.851, 28.0, 1.0],
 [3.0, 88.0, 58.0, 11.0, 54.0, 24.8, 0.267, 22.0, 0.0],
 [4.0, 103.0, 60.0, 33.0, 192.0, 24.0, 0.966, 33.0, 0.0],
 [2.0, 90.0, 68.0, 42.0, 0.0, 38.2, 0.503, 27.0, 1.0],
 [4.0, 1

In [17]:
def gini(rows):
    """Calculate the Gini Impurity for a list of rows.

    There are a few different ways to do this, I thought this one was
    the most concise. See:
    https://en.wikipedia.org/wiki/Decision_tree_learning#Gini_impurity
    """
    if isinstance(rows,list):
        rows = pd.DataFrame(rows)
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl**2
    print("\nGini score is:" + str(impurity) + '\n')
    return impurity

In [18]:
#######
# Demo:
# Let's look at some example to understand how Gini Impurity works.
#
# First, we'll look at a dataset with no mixing.
no_mixing = pd.DataFrame([['Apple'],
              ['Apple']])
# this will return 0
gini(no_mixing)


Gini score is:0.0



0.0

In [19]:
# Now, we'll look at dataset with a 50:50 apples:oranges ratio
some_mixing = pd.DataFrame([['Apple'],
               ['Orange']])
# this will return 0.5 - meaning, there's a 50% chance of misclassifying
# a random example we draw from the dataset.
gini(some_mixing)


Gini score is:0.5



0.5

In [20]:
# Now, we'll look at a dataset with many different labels
lots_of_mixing = pd.DataFrame([['Apple'],
                  ['Orange'],
                  ['Grape'],
                  ['Grapefruit'],
                  ['Blueberry']])
# This will return 0.8
gini(lots_of_mixing)
#######


Gini score is:0.7999999999999998



0.7999999999999998

In [21]:
def info_gain(left, right, current_uncertainty):
    """Information Gain.

    The uncertainty of the starting node, minus the weighted impurity of
    two child nodes.
    """
    p = float(len(left)) / (len(left) + len(right))
    return current_uncertainty - p * gini(left) - (1 - p) * gini(right)

In [22]:
#######
# Demo:
# Calculate the uncertainy of our training data.
current_uncertainty = gini(data)
current_uncertainty


Gini score is:0.4998077662437525



0.4998077662437525

In [23]:
# How much information do we gain by partioning on 'Pregnancies'?
true_rows, false_rows = partition(data, Question(0, 7))
info_gain(true_rows, false_rows, current_uncertainty)


Gini score is:0.4914933837429112


Gini score is:0.489795918367347



0.009246324275660855

In [24]:
# What about if we partioned on 'Red' instead?
true_rows, false_rows = partition(data, Question(6, .7))
info_gain(true_rows, false_rows, current_uncertainty)


Gini score is:0.46875


Gini score is:0.4975662520281234



0.006761710612197491

In [25]:
# It looks like we learned more using 'Red' (0.37), than 'Green' (0.14).
# Why? Look at the different splits that result, and see which one
# looks more 'unmixed' to you.
true_rows, false_rows = partition(data, Question(0,7))

# Here, the true_rows contain only 'Grapes'.
true_rows

[[8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0],
 [10.0, 115.0, 0.0, 0.0, 0.0, 35.3, 0.134, 29.0, 0.0],
 [8.0, 125.0, 96.0, 0.0, 0.0, 0.0, 0.23199999999999998, 54.0, 1.0],
 [10.0, 168.0, 74.0, 0.0, 0.0, 38.0, 0.537, 34.0, 1.0],
 [10.0, 139.0, 80.0, 0.0, 0.0, 27.1, 1.4409999999999998, 57.0, 0.0],
 [7.0, 100.0, 0.0, 0.0, 0.0, 30.0, 0.484, 32.0, 1.0],
 [7.0, 107.0, 74.0, 0.0, 0.0, 29.6, 0.254, 31.0, 1.0],
 [8.0, 99.0, 84.0, 0.0, 0.0, 35.4, 0.38799999999999996, 50.0, 0.0],
 [7.0, 196.0, 90.0, 0.0, 0.0, 39.8, 0.451, 41.0, 1.0],
 [9.0, 119.0, 80.0, 35.0, 0.0, 29.0, 0.263, 29.0, 1.0],
 [11.0, 143.0, 94.0, 33.0, 146.0, 36.6, 0.254, 51.0, 1.0],
 [10.0, 125.0, 70.0, 26.0, 115.0, 31.1, 0.205, 41.0, 1.0],
 [7.0, 147.0, 76.0, 0.0, 0.0, 39.4, 0.257, 43.0, 1.0],
 [13.0, 145.0, 82.0, 19.0, 110.0, 22.2, 0.245, 57.0, 0.0],
 [10.0, 122.0, 78.0, 31.0, 0.0, 27.6, 0.512, 45.0, 0.0],
 [11.0, 138.0, 76.0, 0.0, 0.0, 33.2, 0.42, 35.0, 0.0],
 [9.0, 102.0, 76.0, 37.0, 0.0, 32.9, 0.665, 46.0, 1.0],
 [7.0, 13

In [26]:
# And the false rows contain two types of fruit. Not too bad.
false_rows

[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0],
 [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.35100000000000003, 31.0, 0.0],
 [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.16699999999999998, 21.0, 0.0],
 [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.2880000000000003, 33.0, 1.0],
 [5.0, 116.0, 74.0, 0.0, 0.0, 25.6, 0.201, 30.0, 0.0],
 [3.0, 78.0, 50.0, 32.0, 88.0, 31.0, 0.248, 26.0, 1.0],
 [2.0, 197.0, 70.0, 45.0, 543.0, 30.5, 0.158, 53.0, 1.0],
 [4.0, 110.0, 92.0, 0.0, 0.0, 37.6, 0.191, 30.0, 0.0],
 [1.0, 189.0, 60.0, 23.0, 846.0, 30.1, 0.39799999999999996, 59.0, 1.0],
 [5.0, 166.0, 72.0, 19.0, 175.0, 25.8, 0.5870000000000001, 51.0, 1.0],
 [0.0, 118.0, 84.0, 47.0, 230.0, 45.8, 0.551, 31.0, 1.0],
 [1.0, 103.0, 30.0, 38.0, 83.0, 43.3, 0.183, 33.0, 0.0],
 [1.0, 115.0, 70.0, 30.0, 96.0, 34.6, 0.529, 32.0, 1.0],
 [3.0, 126.0, 88.0, 41.0, 235.0, 39.3, 0.7040000000000001, 27.0, 0.0],
 [1.0, 97.0, 66.0, 15.0, 140.0, 23.2, 0.48700000000000004, 22.0, 0.0],
 [5.0, 117.0, 92.0, 0.0, 0.0, 34.1, 0.337, 38.0, 0.0]

In [27]:
# On the other hand, partitioning by Green doesn't help so much.
true_rows, false_rows = partition(data, Question(6,.7))

# We've isolated one apple in the true rows.
true_rows

[[0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.2880000000000003, 33.0, 1.0],
 [10.0, 139.0, 80.0, 0.0, 0.0, 27.1, 1.4409999999999998, 57.0, 0.0],
 [3.0, 126.0, 88.0, 41.0, 235.0, 39.3, 0.7040000000000001, 27.0, 0.0],
 [3.0, 158.0, 76.0, 36.0, 245.0, 31.6, 0.851, 28.0, 1.0],
 [4.0, 103.0, 60.0, 33.0, 192.0, 24.0, 0.966, 33.0, 0.0],
 [4.0, 111.0, 72.0, 47.0, 207.0, 37.1, 1.39, 56.0, 1.0],
 [9.0, 171.0, 110.0, 24.0, 240.0, 45.4, 0.721, 54.0, 1.0],
 [0.0, 180.0, 66.0, 39.0, 0.0, 42.0, 1.893, 25.0, 1.0]]

In [28]:
# But, the false-rows are badly mixed up.
false_rows
#######

[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0],
 [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.35100000000000003, 31.0, 0.0],
 [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0],
 [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.16699999999999998, 21.0, 0.0],
 [5.0, 116.0, 74.0, 0.0, 0.0, 25.6, 0.201, 30.0, 0.0],
 [3.0, 78.0, 50.0, 32.0, 88.0, 31.0, 0.248, 26.0, 1.0],
 [10.0, 115.0, 0.0, 0.0, 0.0, 35.3, 0.134, 29.0, 0.0],
 [2.0, 197.0, 70.0, 45.0, 543.0, 30.5, 0.158, 53.0, 1.0],
 [8.0, 125.0, 96.0, 0.0, 0.0, 0.0, 0.23199999999999998, 54.0, 1.0],
 [4.0, 110.0, 92.0, 0.0, 0.0, 37.6, 0.191, 30.0, 0.0],
 [10.0, 168.0, 74.0, 0.0, 0.0, 38.0, 0.537, 34.0, 1.0],
 [1.0, 189.0, 60.0, 23.0, 846.0, 30.1, 0.39799999999999996, 59.0, 1.0],
 [5.0, 166.0, 72.0, 19.0, 175.0, 25.8, 0.5870000000000001, 51.0, 1.0],
 [7.0, 100.0, 0.0, 0.0, 0.0, 30.0, 0.484, 32.0, 1.0],
 [0.0, 118.0, 84.0, 47.0, 230.0, 45.8, 0.551, 31.0, 1.0],
 [7.0, 107.0, 74.0, 0.0, 0.0, 29.6, 0.254, 31.0, 1.0],
 [1.0, 103.0, 30.0, 38.0, 83.0, 43.3,

In [40]:
def find_best_split(rows):
    """Find the best question to ask by iterating over every feature / value
    and calculating the information gain."""
    if isinstance(rows,list):
        rows = pd.DataFrame(rows)
    best_gain = 0  # keep track of the best information gain
    best_question = None  # keep train of the feature / value that produced it
    current_uncertainty = gini(rows)
    n_features = len(rows.columns)-1  #len(rows[0]) - 1  # number of columns

    for col in range(n_features):  # for each feature
        print([rows.loc[0,rows.columns[col]]])
        values = set(rows.loc[:,rows.columns[col]])  # unique values in the column

        for val in values:  # for each value

            question = Question(col, val)

            # try splitting the dataset
            true_rows, false_rows = partition(rows, question)

            # Skip this split if it doesn't divide the
            # dataset.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            # Calculate the information gain from this split
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            # You actually can use '>' instead of '>=' here
            # but I wanted the tree to look a certain way for our
            # toy dataset.
            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [34]:
#######
# Demo:
# Find the best question to ask first for our toy dataset.
best_gain, best_question = find_best_split(data)
best_question
# FYI: is color == Red is just as good. See the note in the code above
# where I used '>='.
#######


Gini score is:0.4998077662437525

[6]

Gini score is:0.49652777777777785


Gini score is:0.0


Gini score is:0.5


Gini score is:0.49586776859504145


Gini score is:0.49963476990504024


Gini score is:0.5


Gini score is:0.5


Gini score is:0.4986149584487535


Gini score is:0.49940546967895355


Gini score is:0.49586776859504145


Gini score is:0.4927999999999999


Gini score is:0.48816568047337283


Gini score is:0.4914933837429112


Gini score is:0.489795918367347


Gini score is:0.48979591836734704


Gini score is:0.4967129291453616


Gini score is:0.4958677685950414


Gini score is:0.49875


Gini score is:0.46875


Gini score is:0.49972958355868036


Gini score is:0.4444444444444444


Gini score is:0.5


Gini score is:0.0


Gini score is:0.5

[148]

Gini score is:0.4549999999999999


Gini score is:0.47450572320499484


Gini score is:0.4321329639889197


Gini score is:0.46875


Gini score is:0.4444444444444445


Gini score is:0.4775022956841138


Gini score is:0.4152249134948096





Gini score is:0.49323958896700915


Gini score is:0.48


Gini score is:0.48699271592091564


Gini score is:0.4914933837429112


Gini score is:0.48979591836734704


Gini score is:0.48979591836734704


Gini score is:0.4911111111111111


Gini score is:0.0


Gini score is:0.49812578092461474


Gini score is:0.31999999999999984


Gini score is:0.49621928166351614


Gini score is:0.3911111111111112


Gini score is:0.4753086419753086


Gini score is:0.4296875


Gini score is:0.48


Gini score is:0.40816326530612246


Gini score is:0.4821037253469685


Gini score is:0.39669421487603307


Gini score is:0.48875


Gini score is:0.42000000000000004


Gini score is:0.4925639500297441


Gini score is:0.4444444444444445


Gini score is:0.4977777777777777


Gini score is:0.48611111111111094


Gini score is:0.4828532235939643


Gini score is:0.47337278106508873


Gini score is:0.46080000000000004


Gini score is:0.48


Gini score is:0.47337278106508873


Gini score is:0.49756625202812343


Gini score

Is BMI >= 29.0?

In [35]:
class Leaf:
    """A Leaf node classifies data.

    This holds a dictionary of class (e.g., "Apple") -> number of times
    it appears in the rows from the training data that reach this leaf.
    """

    def __init__(self, rows):
        self.predictions = class_counts(rows)

In [36]:
class Decision_Node:
    """A Decision Node asks a question.

    This holds a reference to the question, and to the two child nodes.
    """

    def __init__(self,
                 question,
                 true_branch,
                 false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [37]:
def build_tree(rows):
    
    """Builds the tree.

    Rules of recursion: 1) Believe that it works. 2) Start by checking
    for the base case (no further information gain). 3) Prepare for
    giant stack traces.
    """

    # Try partitioing the dataset on each of the unique attribute,
    # calculate the information gain,
    # and return the question that produces the highest gain.
    gain, question = find_best_split(rows)

    # Base case: no further info gain
    # Since we can ask no further questions,
    # we'll return a leaf.
    if gain == 0:
        return Leaf(rows)

    # If we reach here, we have found a useful feature / value
    # to partition on.
    true_rows, false_rows = partition(rows, question)

    # Recursively build the true branch.
    true_branch = build_tree(true_rows)

    # Recursively build the false branch.
    false_branch = build_tree(false_rows)

    # Return a Question node.
    # This records the best feature / value to ask at this point,
    # as well as the branches to follow
    # dependingo on the answer.
    return Decision_Node(question, true_branch, false_branch)

In [38]:
def print_tree(node, spacing=""):
    """World's most elegant tree printing function."""

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        print (spacing + "Predict", node.predictions)
        return

    # Print the question at this node
    print (spacing + str(node.question))

    # Call this function recursively on the true branch
    print (spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    # Call this function recursively on the false branch
    print (spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")

In [47]:
my_tree = build_tree(data)


Gini score is:0.4998077662437525

[6]

Gini score is:0.49652777777777785


Gini score is:0.0


Gini score is:0.5


Gini score is:0.49586776859504145


Gini score is:0.49963476990504024


Gini score is:0.5


Gini score is:0.5


Gini score is:0.4986149584487535


Gini score is:0.49940546967895355


Gini score is:0.49586776859504145


Gini score is:0.4927999999999999


Gini score is:0.48816568047337283


Gini score is:0.4914933837429112


Gini score is:0.489795918367347


Gini score is:0.48979591836734704


Gini score is:0.4967129291453616


Gini score is:0.4958677685950414


Gini score is:0.49875


Gini score is:0.46875


Gini score is:0.49972958355868036


Gini score is:0.4444444444444444


Gini score is:0.5


Gini score is:0.0


Gini score is:0.5

[148]

Gini score is:0.4549999999999999


Gini score is:0.47450572320499484


Gini score is:0.4321329639889197


Gini score is:0.46875


Gini score is:0.4444444444444445


Gini score is:0.4775022956841138


Gini score is:0.4152249134948096




Gini score is:0.451171875


Gini score is:0.3324099722991689


Gini score is:0.4834710743801651


Gini score is:0.4851367419738407


Gini score is:0.4653739612188366


Gini score is:0.482421875


Gini score is:0.47530864197530853


Gini score is:0.4885215794306703


Gini score is:0.4591836734693877


Gini score is:0.4234404536862004


Gini score is:0.42603550295857984


Gini score is:0.48753462603878117


Gini score is:0.375


Gini score is:0.48389217619986846


Gini score is:0.4444444444444445


Gini score is:0.4954648526077097


Gini score is:0.46639231824417005


Gini score is:0.4444444444444445


Gini score is:0.45674740484429066


Gini score is:0.4844290657439447


Gini score is:0.40816326530612246


Gini score is:0.4958677685950414


Gini score is:0.375


Gini score is:0.49796287913082843


Gini score is:0.4444444444444445


Gini score is:0.4991319444444444


Gini score is:0.0


Gini score is:0.49920000000000003


Gini score is:0.375


Gini score is:0.49323958896700915


Gini sc



Gini score is:0.21875

[0.627]

Gini score is:0.345679012345679


Gini score is:0.46875


Gini score is:0.40816326530612246


Gini score is:0.48


Gini score is:0.0


Gini score is:0.451171875


Gini score is:0.42000000000000004


Gini score is:0.4444444444444445


Gini score is:0.42603550295857984


Gini score is:0.4549999999999999


Gini score is:0.38781163434903054


Gini score is:0.48979591836734704


Gini score is:0.375


Gini score is:0.4844290657439447


Gini score is:0.39669421487603307


Gini score is:0.4628099173553719


Gini score is:0.41700960219478733


Gini score is:0.5


Gini score is:0.40816326530612246


Gini score is:0.4653739612188366


Gini score is:0.42000000000000004


Gini score is:0.47337278106508873


Gini score is:0.4444444444444445


Gini score is:0.4444444444444445


Gini score is:0.4370447450572321


Gini score is:0.5


Gini score is:0.4012345679012346


Gini score is:0.48


Gini score is:0.40816326530612246


Gini score is:0.4526627218934911


Gini score



Gini score is:0.24489795918367355


Gini score is:0.0

[72.0]

Gini score is:0.1420118343195265


Gini score is:0.0


Gini score is:0.1652892561983472


Gini score is:0.0


Gini score is:0.17999999999999994


Gini score is:0.0


Gini score is:0.21875


Gini score is:0.0


Gini score is:0.2777777777777777


Gini score is:0.0


Gini score is:0.375


Gini score is:0.0


Gini score is:0.13265306122448978


Gini score is:0.0


Gini score is:0.4444444444444445


Gini score is:0.0


Gini score is:0.5


Gini score is:0.0


Gini score is:0.0


Gini score is:0.13265306122448978

[35.0]

Gini score is:0.13265306122448978


Gini score is:0.0


Gini score is:0.15277777777777787


Gini score is:0.0


Gini score is:0.1652892561983472


Gini score is:0.0


Gini score is:0.21875


Gini score is:0.0


Gini score is:0.24489795918367355


Gini score is:0.0


Gini score is:0.2777777777777777


Gini score is:0.0


Gini score is:0.31999999999999984


Gini score is:0.0


Gini score is:0.0


Gini score is:0.


Gini score is:0.375


Gini score is:0.47337278106508873


Gini score is:0.48979591836734704


Gini score is:0.4444444444444445


Gini score is:0.49704142011834324


Gini score is:0.5


Gini score is:0.40816326530612246


Gini score is:0.42000000000000004


Gini score is:0.5


Gini score is:0.48


Gini score is:0.2777777777777777


Gini score is:0.39669421487603307


Gini score is:0.48


Gini score is:0.40816326530612246


Gini score is:0.49586776859504145


Gini score is:0.4444444444444445


Gini score is:0.4444444444444445


Gini score is:0.48979591836734704


Gini score is:0.0


Gini score is:0.48

[0.0]

Gini score is:0.49704142011834324


Gini score is:0.5


Gini score is:0.5


Gini score is:0.48


Gini score is:0.49586776859504145


Gini score is:0.5


Gini score is:0.4444444444444445


Gini score is:0.46875


Gini score is:0.46875


Gini score is:0.49382716049382713


Gini score is:0.5


Gini score is:0.4444444444444445


Gini score is:0.0


Gini score is:0.4921875


Gini score 

[0.0]

Gini score is:0.0


Gini score is:0.0

[0.0]
[35.3]

Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0

[0.134]

Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0

[29.0]

Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.0


Gini score is:0.375

[7.0]

Gini score is:0.0


Gini score is:0.4444444444444445


Gini score is:0.0


Gini score is:0.0

[100.0]

Gini score is:0.0


Gini score is:0.0


Gini score is:0.4444444444444445





Gini score is:0.31999999999999984


Gini score is:0.0


Gini score is:0.2777777777777777


Gini score is:0.12444444444444439


Gini score is:0.0


Gini score is:0.0


Gini score is:0.15277777777777787


Gini score is:0.0


Gini score is:0.375


Gini score is:0.0


Gini score is:0.1420118343195265


Gini score is:0.0


Gini score is:0.13265306122448978


Gini score is:0.0


Gini score is:0.21875


Gini score is:0.0


Gini score is:0.1652892561983472


Gini score is:0.0


Gini score is:0.24489795918367355

[31.0]

Gini score is:0.24489795918367352


Gini score is:0.0


Gini score is:0.2777777777777777


Gini score is:0.0


Gini score is:0.31999999999999984


Gini score is:0.0


Gini score is:0.375


Gini score is:0.0


Gini score is:0.4444444444444444


Gini score is:0.0


Gini score is:0.12444444444444439


Gini score is:0.0


Gini score is:0.1652892561983472


Gini score is:0.0


Gini score is:0.0


Gini score is:0.13265306122448978


Gini score is:0.17999999999999994


Gini score is

In [48]:
print_tree(my_tree)

Is BMI >= 29.0?
--> True:
  Is SkinThickness >= 30.0?
  --> True:
    Is BloodPressure >= 40.0?
    --> True:
      Is BloodPressure >= 88.0?
      --> True:
        Is Age >= 51.0?
        --> True:
          Predict {1.0: 1}
        --> False:
          Predict {0.0: 1}
      --> False:
        Predict {1.0: 13}
    --> False:
      Predict {0.0: 1}
  --> False:
    Is Glucose >= 147.0?
    --> True:
      Is Age >= 34.0?
      --> True:
        Predict {1.0: 5}
      --> False:
        Predict {0.0: 1}
    --> False:
      Is BMI >= 33.2?
      --> True:
        Predict {0.0: 7}
      --> False:
        Is Age >= 31.0?
        --> True:
          Predict {1.0: 3}
        --> False:
          Predict {0.0: 1}
--> False:
  Is Glucose >= 166.0?
  --> True:
    Predict {1.0: 2}
  --> False:
    Is BloodPressure >= 96.0?
    --> True:
      Predict {1.0: 1}
    --> False:
      Predict {0.0: 15}


In [49]:
def classify(row, node):
    """See the 'rules of recursion' above."""

    # Base case: we've reached a leaf
    if isinstance(node, Leaf):
        return node.predictions

    # Decide whether to follow the true-branch or the false-branch.
    # Compare the feature / value stored in the node,
    # to the example we're considering.
    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

In [53]:
#######
# Demo:
# The tree predicts the 1st row of our
# training data is an apple with confidence 1.
classify(data.iloc[0], my_tree)
#######

{1.0: 13}

In [54]:
def print_leaf(counts):
    """A nicer way to print the predictions at a leaf."""
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
    return probs

In [55]:
#######
# Demo:
# Printing that a bit nicer
print_leaf(classify(data.iloc[0], my_tree))
#######

{1.0: '100%'}

In [56]:
#######
# Demo:
# On the second example, the confidence is lower
print_leaf(classify(data.iloc[1], my_tree))
#######

{0.0: '100%'}

In [62]:
# Evaluate
testing_data = pd.read_csv('diabetes.csv')
testing_data = testing_data.iloc[-50:,:]
testing_data.shape

(50, 9)

In [64]:
for t in range(len(testing_data)):
    row = testing_data.iloc[t]
    print ("Actual: %s. Predicted: %s" %
           (row[-1], print_leaf(classify(row, my_tree))))

Actual: 0.0. Predicted: {1.0: '100%'}
Actual: 1.0. Predicted: {0.0: '100%'}
Actual: 0.0. Predicted: {1.0: '100%'}
Actual: 0.0. Predicted: {1.0: '100%'}
Actual: 1.0. Predicted: {1.0: '100%'}
Actual: 0.0. Predicted: {1.0: '100%'}
Actual: 0.0. Predicted: {1.0: '100%'}
Actual: 0.0. Predicted: {1.0: '100%'}
Actual: 0.0. Predicted: {0.0: '100%'}
Actual: 0.0. Predicted: {0.0: '100%'}
Actual: 0.0. Predicted: {1.0: '100%'}
Actual: 0.0. Predicted: {0.0: '100%'}
Actual: 1.0. Predicted: {0.0: '100%'}
Actual: 1.0. Predicted: {0.0: '100%'}
Actual: 1.0. Predicted: {0.0: '100%'}
Actual: 0.0. Predicted: {0.0: '100%'}
Actual: 0.0. Predicted: {0.0: '100%'}
Actual: 0.0. Predicted: {1.0: '100%'}
Actual: 0.0. Predicted: {0.0: '100%'}
Actual: 0.0. Predicted: {1.0: '100%'}
Actual: 0.0. Predicted: {0.0: '100%'}
Actual: 1.0. Predicted: {0.0: '100%'}
Actual: 1.0. Predicted: {1.0: '100%'}
Actual: 0.0. Predicted: {0.0: '100%'}
Actual: 0.0. Predicted: {0.0: '100%'}
Actual: 1.0. Predicted: {1.0: '100%'}
Actual: 0.0.