In [0]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
# Load the Iris dataset bundled with scikit-learn.
iris = load_iris()

In [0]:
# Feature matrix and class labels taken from the loaded dataset.
x, y = iris.data, iris.target

In [0]:
# Append the labels as a final column so every row carries its own class.
data = np.column_stack((x, y))

In [0]:
# Name the four feature columns, add the label column, and wrap the
# combined array in a DataFrame.
cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
header = [*cols, 'species']
iris_df = pd.DataFrame(data, columns=header)

In [80]:
# Preview the first rows of the assembled DataFrame.
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [0]:
# Map the numeric class codes to readable species names.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on the `iris_df.species` accessor: that chained
# in-place form does not reliably update the parent frame under pandas
# Copy-on-Write. The dict form leaves already-mapped values untouched, so
# re-running the cell is harmless.
# Note the spelling of the first species: Iris "setosa".
iris_df['species'] = iris_df['species'].replace({
    0.0: 'iris-setosa',
    1.0: 'iris-versicolor',
    2.0: 'iris-virginica',
})

In [83]:
# Check the dimensions of the frame as (rows, columns).
iris_df.shape

(150, 5)

In [0]:
class Question:
    """A threshold test applied to one column of a row.

    A row answers the question positively when the value stored at index
    ``column`` is greater than or equal to ``value``.
    """

    def __init__(self, column, value):
        # Index of the feature to inspect and the threshold to compare against.
        self.column, self.value = column, value

    def match(self, data):
        """Return the result of ``data[column] >= value`` for the given row."""
        return data[self.column] >= self.value

    def __repr__(self):
        """Render the question as text; the column name comes from the module-level ``header`` list."""
        column_name = header[self.column]
        return "Is %s %s %s?" % (column_name, ">=", str(self.value))

In [84]:
# Build a sample question on column 0 with threshold 6 and evaluate it
# against one row of the feature matrix.
q = Question(0, 6)
q.match(x[4])

False

In [0]:
def count_values(rows):
    """Tally how often each label occurs, where a row's label is its last element.

    Returns a dict mapping label -> number of rows carrying it, keyed in
    order of first appearance.
    """
    tally = {}
    for record in rows:
        key = record[-1]
        tally[key] = tally.get(key, 0) + 1
    return tally

In [86]:
# Count how many rows of the full dataset carry each label (last column).
count_values(data)

{0.0: 50, 1.0: 50, 2.0: 50}

In [0]:
def partition(rows, question):
    """Split rows into those that satisfy ``question`` and those that do not.

    Returns a ``(matching, non_matching)`` pair of lists; each keeps the
    original row order.
    """
    matched, unmatched = [], []
    for record in rows:
        # Route the row to whichever bucket the question selects.
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched

In [88]:
# Split the whole dataset on the question for column 0 with threshold 6 and
# display the rows that pass it.
print(Question(column=0, value=6))
t_r, f_r = partition(rows=data, question=Question(column=0, value=6))
# f_r holds the rows that fail the test; only the passing rows are shown.
t_r


Is sepal_length >= 6?


[array([7. , 3.2, 4.7, 1.4, 1. ]),
 array([6.4, 3.2, 4.5, 1.5, 1. ]),
 array([6.9, 3.1, 4.9, 1.5, 1. ]),
 array([6.5, 2.8, 4.6, 1.5, 1. ]),
 array([6.3, 3.3, 4.7, 1.6, 1. ]),
 array([6.6, 2.9, 4.6, 1.3, 1. ]),
 array([6. , 2.2, 4. , 1. , 1. ]),
 array([6.1, 2.9, 4.7, 1.4, 1. ]),
 array([6.7, 3.1, 4.4, 1.4, 1. ]),
 array([6.2, 2.2, 4.5, 1.5, 1. ]),
 array([6.1, 2.8, 4. , 1.3, 1. ]),
 array([6.3, 2.5, 4.9, 1.5, 1. ]),
 array([6.1, 2.8, 4.7, 1.2, 1. ]),
 array([6.4, 2.9, 4.3, 1.3, 1. ]),
 array([6.6, 3. , 4.4, 1.4, 1. ]),
 array([6.8, 2.8, 4.8, 1.4, 1. ]),
 array([6.7, 3. , 5. , 1.7, 1. ]),
 array([6. , 2.9, 4.5, 1.5, 1. ]),
 array([6. , 2.7, 5.1, 1.6, 1. ]),
 array([6. , 3.4, 4.5, 1.6, 1. ]),
 array([6.7, 3.1, 4.7, 1.5, 1. ]),
 array([6.3, 2.3, 4.4, 1.3, 1. ]),
 array([6.1, 3. , 4.6, 1.4, 1. ]),
 array([6.2, 2.9, 4.3, 1.3, 1. ]),
 array([6.3, 3.3, 6. , 2.5, 2. ]),
 array([7.1, 3. , 5.9, 2.1, 2. ]),
 array([6.3, 2.9, 5.6, 1.8, 2. ]),
 array([6.5, 3. , 5.8, 2.2, 2. ]),
 array([7.6, 3. , 6.

In [0]:
def entropy(rows):
    """Base-2 entropy of the label distribution in ``rows``.

    Labels are tallied with ``count_values``; each label contributes
    ``-p * log2(p)`` where ``p`` is its share of the rows. An empty input
    yields 0 because there is nothing to sum.
    """
    from math import log

    label_counts = count_values(rows)
    total = float(len(rows))
    result = 0
    for occurrences in label_counts.values():
        p = occurrences / total
        # log(p) / log(2) is the base-2 logarithm of p.
        result -= p * (log(p) / log(2))
    return result
    

In [92]:
# Entropy of the label distribution over the full dataset.
entropy(data)

1.584962500721156

In [0]:
def info_gain_gini(current,left,right):
    """Impurity reduction obtained by splitting a node into ``left`` and ``right``.

    Args:
        current: impurity of the node before the split.
        left: rows sent to one side of the split.
        right: rows sent to the other side of the split.

    Returns:
        ``current`` minus the size-weighted ``gini`` impurity of the two sides.
    """
    # Weight of the left side = its share of all rows. The parentheses are
    # required: without them the division binds first and p becomes
    # 1 + len(right) instead of a fraction in [0, 1].
    p = float(len(left)) / (len(left) + len(right))
    # NOTE(review): `gini` is not defined in the cells visible here; it must
    # exist in the notebook namespace before this function is called.
    return current - p * gini(left) - (1 - p) * gini(right)

In [0]:
def info_gain_entropy(current,left,right):
    """Entropy reduction obtained by splitting a node into ``left`` and ``right``.

    Args:
        current: entropy of the node before the split.
        left: rows sent to one side of the split.
        right: rows sent to the other side of the split.

    Returns:
        ``current`` minus the size-weighted ``entropy`` of the two sides.
    """
    # Weight of the left side = its share of all rows. The parentheses are
    # required: without them the division binds first and p becomes
    # 1 + len(right) instead of a fraction in [0, 1].
    p = float(len(left)) / (len(left) + len(right))
    return current - p * entropy(left) - (1 - p) * entropy(right)

In [0]:
def best_split(rows):
    """Search every (column, threshold) pair for the split with the largest gain.

    Each distinct value found in a feature column is tried as a ``>=``
    threshold. Splits that leave one side empty are ignored. Gain is measured
    with ``info_gain_gini`` against the impurity of ``rows`` itself.

    Returns:
        ``(best_gain, best_question)``; ``best_question`` stays ``None`` when
        no candidate split was accepted.
    """
    top_gain, top_question = 0, None
    # NOTE(review): `gini` is not defined in the cells visible here.
    baseline = gini(rows)
    # Every column except the last is treated as a feature.
    n_features = len(rows[0]) - 1

    for col in range(n_features):
        thresholds = {row[col] for row in rows}
        for threshold in thresholds:
            candidate = Question(col, threshold)
            matched, unmatched = partition(rows, candidate)
            # Only score splits that actually separate the rows.
            if len(matched) != 0 and len(unmatched) != 0:
                gain = info_gain_gini(baseline, matched, unmatched)
                # Ties go to the most recently examined candidate.
                if gain >= top_gain:
                    top_gain, top_question = gain, candidate
    return top_gain, top_question

In [0]:
class DecisionNode:
    """Internal tree node: a question plus the subtree for each answer."""

    def __init__(self, question, true_branch, false_branch):
        # The test applied at this node.
        self.question = question
        # Subtrees for rows that match / do not match the question.
        self.true_branch, self.false_branch = true_branch, false_branch

In [0]:
class Leaf:
    """Terminal tree node holding the label counts of the rows that reached it."""

    def __init__(self, rows):
        # Mapping of label -> number of rows, as produced by count_values.
        label_counts = count_values(rows)
        self.predictions = label_counts

In [0]:
def build_tree(rows):
    """Recursively grow a decision tree from ``rows``.

    The best split is looked up with ``best_split``. When it reports a gain
    of zero the rows become a ``Leaf``; otherwise the rows are partitioned by
    the chosen question and a subtree is built for each side (matching side
    first).
    """
    gain, question = best_split(rows)
    if gain != 0:
        matched, unmatched = partition(rows, question)
        return DecisionNode(question, build_tree(matched), build_tree(unmatched))
    # No split improves on the current node, so stop here.
    return Leaf(rows)