# Building a Classification Model for the Iris data set

Furkan Kaya

In this Jupyter notebook, we will be building a classification model for the Iris data set using the random forest algorithm.

## 1. Import libraries

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

## 2. Load the *iris* data seta

In [3]:
diabetes = datasets.load_diabetes()

In [4]:
diabetes

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990749, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06833155, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286131, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04688253,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452873, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00422151,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 225.,  59

## 3. Input features
The ***diabetes*** data set contains 4 input features and 1 output variable (the class label).

### 3.1. Input features

In [6]:
print(diabetes.feature_names)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


### 3.2. Output features

In [12]:
print(diabetes.target_filename)

diabetes_target.csv.gz


In [14]:
print(diabetes.data_filename)

diabetes_data_raw.csv.gz


## 4. Glimpse of the data

### 4.1. Input features

In [15]:
diabetes.data

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]])

### 4.2. Output variable (the Class label)

In [17]:
diabetes.target

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

### 4.3. Assigning *input* and *output* variables
Let's assign the 4 input variables to X and the output variable (class label) to Y

In [22]:
X = diabetes.data
Y = diabetes.target

### 4.3. Let's examine the data dimension

In [23]:
X.shape

(442, 10)

In [24]:
Y.shape

(442,)

## 5. Build Classification Model using Random Forest

In [25]:
clf = RandomForestClassifier()

In [26]:
clf.fit(X, Y)

## 6. Feature Importance

In [85]:
print(clf.feature_importances_)

[0.10963169 0.02785545 0.12252361 0.11468884 0.10784464 0.11074483
 0.10979243 0.06040838 0.12242133 0.1140888 ]


## 7. Make Prediction

In [28]:
X[0]

array([ 0.03807591,  0.05068012,  0.06169621,  0.02187239, -0.0442235 ,
       -0.03482076, -0.04340085, -0.00259226,  0.01990749, -0.01764613])

In [29]:
X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]])

In [36]:
print(clf.predict([[0.039, 0.01, 0.04, 0.02, -0.02,
                   -0.04,-0.03, -0.01, 0.05, -0.02]]))

[281.]


In [110]:
print(clf.predict(X[[0]]))

ValueError: X has 64 features, but RandomForestClassifier is expecting 10 features as input.

In [38]:
print(clf.predict_proba(X[[0]]))

[[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.02 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.   0.   0.
  0.   0.   0.   0.   0.01 0.   0.   0.01 0.   0.   0.02 0.01 0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.03 0.   0.   0.   0.   0.
  0.   0.   0.   0.62 0.   0.   0.   0.   0.01 0.   0.01 0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.02 0.   0.   0.   0.02 0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.01 0.   0.   0.   0.   0.   0.02 0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.02 0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.04
  0.   0.02 0.   0.   0.02 0.   0.   0.   0.   0.   0.   0.01 0.03 0.
  0.01 0.   0.   0

In [45]:
clf.fit(diabetes.data, diabetes.feature_names[diabetes.target])

TypeError: only integer scalar arrays can be converted to a scalar index

## 8. Data split (80/20 ratio)

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [47]:
X_train.shape, Y_train.shape

((353, 10), (353,))

In [48]:
X_test.shape, Y_test.shape

((89, 10), (89,))

## 9. Rebuild the Random Forest Model

In [49]:
clf.fit(X_train, Y_train)

### 9.1. Performs prediction on single sample from the data set

In [51]:
print(clf.predict([[0.039, 0.01, 0.04, 0.02, -0.02,
                   -0.04,-0.03, -0.01, 0.05, -0.02]]))

[99.]


In [53]:
print(clf.predict_proba([[0.039, 0.01, 0.04, 0.02, -0.02,
                   -0.04,-0.03, -0.01, 0.05, -0.02]]))

[[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.06 0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.01 0.   0.   0.   0.   0.   0.   0.01 0.   0.   0.   0.01 0.   0.
  0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.08 0.   0.   0.
  0.   0.   0.   0.   0.   0.02 0.   0.   0.   0.   0.   0.   0.   0.02
  0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.02 0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.01 0.   0.01 0.   0.   0.06 0.   0.
  0.   0.03 0.02 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.04 0.   0.   0.
  0.   0.   0.   0.01 0.   0.02 0.   0.01 0.   0.03 0.   0.   0.02 0.
  0.   0.   0.   0.08 0.   0.08 0.   0.02 0.   0.04 0.03 0.   0.   0.
  0.   0.   0.02 0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.03 0.
  0.04 0.   0.   0.   0.   0.   0.   0.   0.01 0.   0.01 0.01 0.   0.
  0.07 0.02 0.   0.   0.   0.   0.   0.   0.  ]]


### 9.2. Performs prediction on the test set

#### *Predicted class labels*

In [54]:
print(clf.predict(X_test))

[277. 272. 233. 245.  39. 128. 121. 241. 134. 185.  87.  52.  52.  91.
 182.  99. 139. 138. 265. 185. 217. 192.  98. 104. 155. 245. 101. 121.
  87.  97. 152.  65. 111. 262. 192. 111. 233.  83.  72. 141. 212.  91.
 111.  55.  91. 128. 118. 263.  67.  68. 236. 230.  96. 217. 114. 128.
  97. 131.  72. 259. 225. 113. 232. 214. 113. 277. 192. 109. 275. 270.
 232.  43.  50. 220.  98. 121. 113. 245. 220. 265. 200.  68. 336.  94.
 270. 128.  97.  88. 110.]


#### *Actual class labels*

In [55]:
print(Y_test)

[258. 186. 243. 242.  92.  52. 276. 248. 158.  94. 103. 142.  47. 118.
 113. 281. 196.  47. 237. 161. 252. 116.  93.  94.  66. 281. 137.  25.
 170. 168. 273.  89. 196. 202. 306. 164. 242.  97. 127. 143. 122. 198.
 107. 172.  53.  95.  89. 178. 249.  86. 280. 248.  53. 258. 143. 111.
 160. 257. 179. 293. 141.  48. 163.  93. 216. 297. 229. 215. 275. 195.
 197.  99.  71. 225. 182.  85. 175. 310. 295. 202. 113. 246. 263.  72.
 308. 220. 190.  63. 178.]


## 10. Model Performance

In [57]:
print(clf.score(X_test, Y_test))

0.011235955056179775


# Digits Data

## Importing Data

In [58]:
digits = datasets.load_digits()

In [60]:
digits

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'frame': None,
 'feature_names': ['pixel_0_0',
  'pixel_0_1',
  'pixel_0_2',
  'pixel_0_3',
  'pixel_0_4',
  'pixel_0_5',
  'pixel_0_6',
  'pixel_0_7',
  'pixel_1_0',
  'pixel_1_1',
  'pixel_1_2',
  'pixel_1_3',
  'pixel_1_4',
  'pixel_1_5',
  'pixel_1_6',
  'pixel_1_7',
  'pixel_2_0',
  'pixel_2_1',
  'pixel_2_2',
  'pixel_2_3',
  'pixel_2_4',
  'pixel_2_5',
  'pixel_2_6',
  'pixel_2_7',
  'pixel_3_0',
  'pixel_3_1',
  'pixel_3_2',
  'pixel_3_3',
  'pixel_3_4',
  'pixel_3_5',
  'pixel_3_6',
  'pixel_3_7',
  'pixel_4_0',
  'pixel_4_1',
  'pixel_4_2',
  'pixel_4_3',
  'pixel_4_4',
  'pixel_4_5',
  'pixel_4_6',
  'pixel_4_7',
  'pixel_5_0',
  'pixel_5_1',
 

## Input Features

In [67]:
print(digits.feature_names)

['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']


In [66]:
print(digits.data)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]


In [70]:
print(digits.target)

[0 1 2 ... 8 9 8]


## Glimpse of the data

In [72]:
digits.data

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [73]:
digits.target

array([0, 1, 2, ..., 8, 9, 8])

### 4.3. Assigning *input* and *output* variables
Let's assign the 4 input variables to X and the output variable (class label) to Y

In [112]:
X = digits.data
Y = digits.target

### Examing The Data Dimension

In [113]:
X.shape

(1797, 64)

In [114]:
Y.shape

(1797,)

### Build Classification Model using Random Forest

In [115]:
cls = RandomForestClassifier()

In [116]:
cls.fit(X,Y)

#### Feature Importance

In [117]:
print(cls.feature_importances_)

[0.00000000e+00 1.98636230e-03 2.22429106e-02 9.88011459e-03
 1.01656947e-02 1.99201919e-02 8.77674756e-03 5.63847716e-04
 3.62808659e-05 1.04200688e-02 2.58434331e-02 6.23230008e-03
 1.66980740e-02 2.93304753e-02 5.67624767e-03 4.87751946e-04
 2.36504948e-05 7.74755674e-03 2.12630685e-02 2.32235215e-02
 2.80684780e-02 4.97834353e-02 9.93566837e-03 3.56953691e-04
 6.07688968e-05 1.52823445e-02 4.14480336e-02 2.46579658e-02
 3.48014940e-02 2.28217527e-02 3.03634480e-02 4.94819697e-05
 0.00000000e+00 2.87008043e-02 2.58384593e-02 1.67604939e-02
 3.74147281e-02 1.96637022e-02 2.98515474e-02 0.00000000e+00
 2.14553397e-05 1.21404636e-02 3.48042510e-02 4.35860390e-02
 2.12224127e-02 1.88632356e-02 1.93049026e-02 1.32246573e-04
 4.73782521e-05 1.81473131e-03 1.60595920e-02 2.16021031e-02
 1.36899485e-02 2.44555335e-02 2.79152127e-02 2.21136289e-03
 2.12692217e-05 2.36708041e-03 2.16455125e-02 1.05896832e-02
 2.28320812e-02 2.57854223e-02 1.85140432e-02 3.99618073e-03]


In [118]:
X[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [121]:
print(cls.predict(X[[1,7]]))

[1 7]


In [124]:
print(cls.predict_proba(X[[0]]))

[[0.99 0.   0.   0.   0.   0.01 0.   0.   0.   0.  ]]


In [130]:
cls.fit(digits.data, digits.feature_names[digits.target])

TypeError: only integer scalar arrays can be converted to a scalar index