# Building a Classification Model for the Iris data set

Furkan Kaya

In this Jupyter notebook, we will be building a classification model for the Iris data set using the random forest algorithm.

## 1. Import libraries

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

## 2. Load the *iris* data seta

In [2]:
iris = datasets.load_iris()

In [3]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

## 3. Input features
The ***iris*** data set contains 4 input features and 1 output variable (the class label).

### 3.1. Input features

In [5]:
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


### 3.2. Output features

In [7]:
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


## 4. Glimpse of the data

### 4.1. Input features

In [12]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [39]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [40]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

### 4.3. Assigning *input* and *output* variables
Let's assign the 4 input variables to X and the output variable (class label) to Y

In [16]:
X = iris.data
Y = iris.target

### 4.3. Let's examine the data dimension

In [17]:
X.shape

(150, 4)

In [21]:
Y.shape

(150,)

## 5. Build Classification Model using Random Forest

In [19]:
clf = RandomForestClassifier()

In [22]:
clf.fit(X, Y)

## 6. Feature Importance

In [23]:
print(clf.feature_importances_)

[0.10279458 0.02327021 0.47678666 0.39714855]


## 7. Make Prediction

In [27]:
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [25]:
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [44]:
print(clf.predict(X[[55]]))

['versicolor']


In [47]:
print(clf.predict_proba(X[[0]]))

[[1. 0. 0.]]


In [46]:
print(clf.predict_proba(X[[0]]))

[[1. 0. 0.]]


In [60]:
clf.fit(iris.data, iris.target_names[iris.target])

## 8. Data split (80/20 ratio)

In [72]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [73]:
X_train.shape, Y_train.shape

((120, 4), (120,))

In [74]:
X_test.shape, Y_test.shape

((30, 4), (30,))

## 9. Rebuild the Random Forest Model

In [79]:
clf.fit(X_train, Y_train)

### 9.1. Performs prediction on single sample from the data set

In [56]:
print(clf.predict(X_test[[15]]))

[2]


In [58]:
print(clf.predict_proba(X_test[[15]]))

[[0.   0.03 0.97]]


In [80]:
print(clf.predict(X_test))

[2 1 0 2 0 1 0 2 1 2 2 2 1 0 1 1 2 0 2 2 0 1 2 2 0 2 2 0 0 0]


In [77]:
clf.fit(iris.data, iris.target_names[iris.target])

## Model Performance

In [81]:
print(clf.score(X_test,Y_test))

0.9333333333333333


### 9.2. Performs prediction on the test set

#### *Predicted class labels*

In [54]:
print(clf.predict(X_test))

[277. 272. 233. 245.  39. 128. 121. 241. 134. 185.  87.  52.  52.  91.
 182.  99. 139. 138. 265. 185. 217. 192.  98. 104. 155. 245. 101. 121.
  87.  97. 152.  65. 111. 262. 192. 111. 233.  83.  72. 141. 212.  91.
 111.  55.  91. 128. 118. 263.  67.  68. 236. 230.  96. 217. 114. 128.
  97. 131.  72. 259. 225. 113. 232. 214. 113. 277. 192. 109. 275. 270.
 232.  43.  50. 220.  98. 121. 113. 245. 220. 265. 200.  68. 336.  94.
 270. 128.  97.  88. 110.]


#### *Actual class labels*

In [55]:
print(Y_test)

[258. 186. 243. 242.  92.  52. 276. 248. 158.  94. 103. 142.  47. 118.
 113. 281. 196.  47. 237. 161. 252. 116.  93.  94.  66. 281. 137.  25.
 170. 168. 273.  89. 196. 202. 306. 164. 242.  97. 127. 143. 122. 198.
 107. 172.  53.  95.  89. 178. 249.  86. 280. 248.  53. 258. 143. 111.
 160. 257. 179. 293. 141.  48. 163.  93. 216. 297. 229. 215. 275. 195.
 197.  99.  71. 225. 182.  85. 175. 310. 295. 202. 113. 246. 263.  72.
 308. 220. 190.  63. 178.]


## 10. Model Performance

In [70]:
print(clf.score(X_test, Y_test))

0.0


  score = y_true == y_pred


# Digits Data

## Importing Data

In [58]:
digits = datasets.load_digits()

In [60]:
digits

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'frame': None,
 'feature_names': ['pixel_0_0',
  'pixel_0_1',
  'pixel_0_2',
  'pixel_0_3',
  'pixel_0_4',
  'pixel_0_5',
  'pixel_0_6',
  'pixel_0_7',
  'pixel_1_0',
  'pixel_1_1',
  'pixel_1_2',
  'pixel_1_3',
  'pixel_1_4',
  'pixel_1_5',
  'pixel_1_6',
  'pixel_1_7',
  'pixel_2_0',
  'pixel_2_1',
  'pixel_2_2',
  'pixel_2_3',
  'pixel_2_4',
  'pixel_2_5',
  'pixel_2_6',
  'pixel_2_7',
  'pixel_3_0',
  'pixel_3_1',
  'pixel_3_2',
  'pixel_3_3',
  'pixel_3_4',
  'pixel_3_5',
  'pixel_3_6',
  'pixel_3_7',
  'pixel_4_0',
  'pixel_4_1',
  'pixel_4_2',
  'pixel_4_3',
  'pixel_4_4',
  'pixel_4_5',
  'pixel_4_6',
  'pixel_4_7',
  'pixel_5_0',
  'pixel_5_1',
 

## Input Features

In [67]:
print(digits.feature_names)

['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']


In [66]:
print(digits.data)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]


In [70]:
print(digits.target)

[0 1 2 ... 8 9 8]


## Glimpse of the data

In [72]:
digits.data

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [73]:
digits.target

array([0, 1, 2, ..., 8, 9, 8])

### 4.3. Assigning *input* and *output* variables
Let's assign the 4 input variables to X and the output variable (class label) to Y

In [112]:
X = digits.data
Y = digits.target

### Examing The Data Dimension

In [113]:
X.shape

(1797, 64)

In [114]:
Y.shape

(1797,)

### Build Classification Model using Random Forest

In [115]:
cls = RandomForestClassifier()

In [116]:
cls.fit(X,Y)

#### Feature Importance

In [117]:
print(cls.feature_importances_)

[0.00000000e+00 1.98636230e-03 2.22429106e-02 9.88011459e-03
 1.01656947e-02 1.99201919e-02 8.77674756e-03 5.63847716e-04
 3.62808659e-05 1.04200688e-02 2.58434331e-02 6.23230008e-03
 1.66980740e-02 2.93304753e-02 5.67624767e-03 4.87751946e-04
 2.36504948e-05 7.74755674e-03 2.12630685e-02 2.32235215e-02
 2.80684780e-02 4.97834353e-02 9.93566837e-03 3.56953691e-04
 6.07688968e-05 1.52823445e-02 4.14480336e-02 2.46579658e-02
 3.48014940e-02 2.28217527e-02 3.03634480e-02 4.94819697e-05
 0.00000000e+00 2.87008043e-02 2.58384593e-02 1.67604939e-02
 3.74147281e-02 1.96637022e-02 2.98515474e-02 0.00000000e+00
 2.14553397e-05 1.21404636e-02 3.48042510e-02 4.35860390e-02
 2.12224127e-02 1.88632356e-02 1.93049026e-02 1.32246573e-04
 4.73782521e-05 1.81473131e-03 1.60595920e-02 2.16021031e-02
 1.36899485e-02 2.44555335e-02 2.79152127e-02 2.21136289e-03
 2.12692217e-05 2.36708041e-03 2.16455125e-02 1.05896832e-02
 2.28320812e-02 2.57854223e-02 1.85140432e-02 3.99618073e-03]


In [118]:
X[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [121]:
print(cls.predict(X[[1,7]]))

[1 7]


In [124]:
print(cls.predict_proba(X[[0]]))

[[0.99 0.   0.   0.   0.   0.01 0.   0.   0.   0.  ]]


In [130]:
cls.fit(digits.data, digits.feature_names[digits.target])

TypeError: only integer scalar arrays can be converted to a scalar index