# Decision Tree

## Example

##### Sample Dataset

![](img/SampleData.png)

-------------------------
##### Summary of the Data

![](img/summary.png)

# IRIS Flower Data 

![](img/iris.jpg)

----------------------------------
![](img/iris1.png)

## Import Modules

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks", color_codes=True, font_scale=1.5)
color = sns.color_palette()
sns.set_style('darkgrid')
import pylab 
%matplotlib inline

import warnings

# Silence warnings globally so that cell outputs stay uncluttered.
warnings.simplefilter("ignore", category=FutureWarning)
warnings.filterwarnings("ignore")


def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; used as a stand-in for ``warnings.warn``."""
    return None


# Replace warnings.warn itself so that direct calls are dropped as well
# (added to hide noisy warnings from sklearn and seaborn).
warnings.warn = ignore_warn

import random
from pprint import pprint

from sklearn.metrics import log_loss, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from scipy import stats

import statsmodels.api as sm
from statsmodels.formula.api import ols

### Reading Data

In [None]:
# Load the CSV, discard the "Id" column and expose the "species" column
# under the name "label" used by the rest of the notebook.
df = (
    pd.read_csv("data/Iris.csv")
    .drop("Id", axis=1)
    .rename(columns={"species": "label"})
)
df.head()

In [None]:
# Print pandas' structural summary of the loaded frame (DataFrame.info).
df.info()

In [None]:
# Number of rows per distinct value of the "label" column.
df.label.value_counts()

In [None]:
# Positional split of the frame: columns 0-3 go to X and column 4 goes to y.
# NOTE(review): assumes the four features come first and "label" is the
# fifth column - confirm against the CSV layout.
X, y = df.iloc[:, :4], df.iloc[:, 4]

### Visualization


##### Sepal vs Label

In [None]:
# Scatter of sepal length against sepal width; `hue` colours the points by
# the "label" column and fit_reg=False suppresses the regression line.
# `height` is the current name of the former `size` keyword, which seaborn
# renamed in 0.9 and no longer accepts.
sns.lmplot(data=df, x="sepal_length", y="sepal_width", hue="label",
           fit_reg=False, height=6, aspect=1.5);

##### Petal vs Label

In [None]:
# Scatter of petal width against petal length, coloured by the "label" column.
# `height` is the current name of the former `size` keyword, which seaborn
# renamed in 0.9 and no longer accepts.
sns.lmplot(data=df, x='petal_width', y='petal_length', hue='label',
           fit_reg=False, height=6, aspect=1.5);

#### Logistic Regression Decision Boundary
![](img/logdb.png)

-------------------------------------
#### Decision Tree Decision Boundary
![](img/db.png)

--------------------
![](img/db1.png)

----------------------
![](img/dtree.png)

#### So how is it done?

![](img/algorithm.png)

### Split based on Low Entropy
![](img/entropy1.png)

------------------------------
![](img/entropy2.png)

------------------------------

In [None]:
# Six evenly spaced probabilities from 0 to 1. linspace states the number of
# points explicitly, whereas arange with a fractional step derives its length
# from a floating-point division of the end point.
p = np.linspace(0, 1, 6)

# -log2(p) for every probability. p = 0 intentionally maps to +inf, so only
# that divide-by-zero warning is silenced, and only for this computation.
with np.errstate(divide="ignore"):
    l = -np.log2(p)

In [None]:
# Display the probability values defined above.
p

In [None]:
# Display the -log2(p) values computed above.
l

In [None]:
# Plot -log2(p) against p to show how uncertainty changes with probability.
plt.scatter(p, l);
plt.xlabel('Probability')
plt.ylabel('Uncertainty');  # label spelling corrected (was "Uncertainity")

In [None]:
# Row count per "label" value, shown again as a reference for the manual
# entropy calculations that follow.
df.label.value_counts()

In [None]:
# Petal scatter coloured by "label", with a vertical line at
# petal_width = 0.8 marking the first candidate split.
# `height` is the current name of the former `size` keyword, which seaborn
# renamed in 0.9 and no longer accepts.
sns.lmplot(data=df, x='petal_width', y='petal_length', hue='label',
           fit_reg=False, height=6, aspect=1.5);
plt.vlines(x=0.8, ymin=1, ymax=7);

In [None]:
# Entropy of a group whose class shares are 50/50, 0/50 and 0/50
# (presumably the points left of the petal_width = 0.8 line - TODO confirm).
# The small constant keeps log2 away from log2(0); those terms are multiplied
# by a zero share, so they contribute nothing to the total.
data_below = sum([
    50/50 * -np.log2(1),
    0/50 * -np.log2(0/50 + 0.00001),
    0/50 * -np.log2(0/50 + 0.00001),
])
data_below

In [None]:
# Entropy of a group whose class shares are 0/100, 50/100 and 50/100
# (presumably the points right of the petal_width = 0.8 line - TODO confirm).
# The small constant only guards the zero-share term against log2(0).
data_above = sum([
    0/100 * -np.log2(0/100 + 0.0001),
    50/100 * -np.log2(50/100),
    50/100 * -np.log2(50/100),
])
data_above

In [None]:
# Petal scatter coloured by "label", with a vertical line at
# petal_width = 1.05 marking the second candidate split.
# `height` is the current name of the former `size` keyword, which seaborn
# renamed in 0.9 and no longer accepts.
sns.lmplot(data=df, x='petal_width', y='petal_length', hue='label',
           fit_reg=False, height=6, aspect=1.5);
plt.vlines(x=1.05, ymin=1, ymax=7);

In [None]:
# Entropy of a group whose class shares are 50/55, 5/55 and 0/55
# (presumably the points left of the petal_width = 1.05 line - TODO confirm).
# The small constant only guards the zero-share term against log2(0).
data_below = sum([
    50/55 * -np.log2(50/55),
    5/55 * -np.log2(5/55),
    0/55 * -np.log2(0/55 + 0.00001),
])
data_below

In [None]:
# Entropy of a group whose class shares are 0/95, 45/95 and 50/95
# (presumably the points right of the petal_width = 1.05 line - TODO confirm).
# Stored as data_above so it does not overwrite data_below, which holds the
# entropy of the 55-row side of this split.
# The small constant only guards the zero-share term against log2(0).
data_above = 0/95*-np.log2(0/95+0.000001) + 45/95 * -np.log2(45/95) +  50/95 * -np.log2(50/95)
data_above

In [None]:
# Overall entropy of each candidate split: every side's entropy is weighted
# by its share of the 150 rows. The entropy figures are hard-coded and appear
# to be the rounded results of the cells above - TODO confirm.
total_rows = 150

# Overall entropy - Split 1: sides of 50 and 100 rows, entropies 0 and 1.
OE_S1 = 50 / total_rows * 0 + 100 / total_rows * 1
print(OE_S1)

# Overall entropy - Split 2: sides of 55 and 95 rows, entropies 0.44 and 0.99.
OE_S2 = 55 / total_rows * 0.44 + 95 / total_rows * .99
print(OE_S2)

#### Decision Tree Terminology
![](img/term1.png)

------------------------
![](img/term2.png)

------------------------