In [1]:
from sklearn import datasets
import pandas as pd

iris = datasets.load_iris()

In [2]:
# See what methods and attributes the dataset provides
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [3]:
# Print the iris dataset description
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
df = pd.DataFrame(
    iris.data,
    columns=iris.feature_names
)

df['target'] = pd.Series(
 iris.target
)

In [5]:
df['target_names'] = df['target'].apply(lambda y: iris.target_names[y])
df.sample(n=6)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
60,5.0,2.0,3.5,1.0,1,versicolor
59,5.2,2.7,3.9,1.4,1,versicolor
95,5.7,3.0,4.2,1.2,1,versicolor
56,6.3,3.3,4.7,1.6,1,versicolor
50,7.0,3.2,4.7,1.4,1,versicolor
12,4.8,3.0,1.4,0.1,0,setosa


## Splitting the data

In [6]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.3)

In [7]:
df_train.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target', 'target_names'],
      dtype='object')

In [8]:
df_test.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target', 'target_names'],
      dtype='object')

In [9]:
x_train = df_train[iris.feature_names]
x_test = df_test[iris.feature_names]

y_train = df_train['target']
y_test = df_test['target']

## Training the model and using it for prediction

In [10]:
from sklearn.tree import DecisionTreeClassifier

# It is common to call the classifier instance clf
clf = DecisionTreeClassifier()

In [11]:
clf.fit(x_train, y_train)

In [12]:
# If y_test is our truth, then let's call our predictions y_test_pred
y_test_pred = clf.predict(x_test)

In [13]:
print(y_test.values)
print(y_test_pred)

[1 2 0 2 0 0 0 1 0 2 1 1 1 2 0 0 0 2 2 0 0 2 2 0 1 2 0 0 2 2 0 1 2 2 0 0 0
 0 1 2 0 2 1 2 2]
[1 2 0 2 0 0 0 1 0 2 2 1 1 2 0 0 0 2 2 0 0 2 2 0 1 2 0 0 2 2 0 1 1 2 0 0 0
 0 1 2 0 2 1 2 2]


In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test_pred)

0.9555555555555556

## Which features were more important?

In [15]:
pd.DataFrame(
  {
    'feature_names': iris.feature_names,
    'feature_importances': clf.feature_importances_
  }
).sort_values(
  'feature_importances', ascending=False
).set_index('feature_names')

Unnamed: 0_level_0,feature_importances
feature_names,Unnamed: 1_level_1
petal width (cm),0.93321
sepal length (cm),0.033589
petal length (cm),0.033201
sepal width (cm),0.0


## Displaying the internal tree decisions

In [16]:
from sklearn.tree import export_text
print(
  export_text(clf, feature_names=iris.feature_names, spacing=3, decimals=1)
) 

|--- petal width (cm) <= 0.8
|   |--- class: 0
|--- petal width (cm) >  0.8
|   |--- petal width (cm) <= 1.8
|   |   |--- petal length (cm) <= 5.0
|   |   |   |--- petal width (cm) <= 1.7
|   |   |   |   |--- class: 1
|   |   |   |--- petal width (cm) >  1.7
|   |   |   |   |--- sepal length (cm) <= 5.8
|   |   |   |   |   |--- class: 2
|   |   |   |   |--- sepal length (cm) >  5.8
|   |   |   |   |   |--- class: 1
|   |   |--- petal length (cm) >  5.0
|   |   |   |--- sepal length (cm) <= 6.0
|   |   |   |   |--- class: 1
|   |   |   |--- sepal length (cm) >  6.0
|   |   |   |   |--- class: 2
|   |--- petal width (cm) >  1.8
|   |   |--- class: 2



In [17]:
print(pd.unique(df[['target', 'target_names']].values.ravel()))
print(clf.predict([[1.2, 5.0, 1.2, 3.3]]))

[0 'setosa' 1 'versicolor' 2 'virginica']
[2]


