# Iris classification model using Random Forest

<i>Project idea by: Data Professor http://youtube.com/dataprofessor </i>

In [10]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [26]:
#load and show dataset containing 150 iris flowers
iris = datasets.load_iris()
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [27]:
#three different class labels of iris flowers and the values associated with them
print(iris.target_names)
print(iris.target)

['setosa' 'versicolor' 'virginica']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [28]:
#features to be weighed in model as coefficients
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [14]:
#assign input and output variables
input_var = iris.data
output_var = iris.target

In [16]:
#examine data dimensions
print(input_var.shape) #150x4 matrix
print(output_var.shape) #150X1 matrix

(150, 4)
(150,)


In [19]:
#training model with random forest algorithm on whole dataset
clf = RandomForestClassifier()
clf.fit(input_var, output_var)

In [11]:
#shows the weights of each feature variable 
print(clf.feature_importances_)

[0.07344346 0.01623453 0.42869861 0.4816234 ]


In [21]:
#makes prediction 
input_var[0] #first flower in dataset

array([5.1, 3.5, 1.4, 0.2])

In [14]:
print(clf.predict(X[[0]]))

[0]


Model predicts that the first flower will be of the first class label "setosa"

In [30]:
print(clf.predict_proba(input_var[[0]]))

[[1. 0. 0.]]


Model is 100% confident that the class label prediction (class 1) is correct for the first flower

In [41]:
#using an 80/20 split to train and test model to avoid overfitting like in previous example
input_train, input_test, output_train, output_test = train_test_split(input_var, output_var, test_size=0.2)

In [42]:
#80% of flowers in training dataframe
input_train.shape, output_train.shape

((120, 4), (120,))

In [43]:
#remaining 20% of flowers in testing dataframe
input_test.shape, output_test.shape

((30, 4), (30,))

In [47]:
#retrain our model with training dataframe
clf.fit(input_train, output_train)

RandomForestClassifier()

In [50]:
#perform prediction on the flowers in the test dataframe
print(clf.predict(input_test))

[2 2 1 1 0 0 1 1 2 1 2 2 0 0 2 2 1 1 2 2 0 0 0 2 0 2 2 1 0 1]


In [52]:
#print the actual class labels of the flowers in the testing dataframe to compare with our prediction model
print(output_test)

[2 2 1 1 0 0 1 1 2 1 2 2 0 0 1 2 1 1 2 2 0 0 0 2 0 2 2 1 0 1]


In [55]:
#show our model's class probabilities for each test flower sample
print(clf.predict_proba(input_test))

[[0.   0.   1.  ]
 [0.   0.   1.  ]
 [0.   0.94 0.06]
 [0.   1.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.   0.99 0.01]
 [0.02 0.79 0.19]
 [0.   0.   1.  ]
 [0.   1.   0.  ]
 [0.   0.01 0.99]
 [0.   0.04 0.96]
 [0.98 0.02 0.  ]
 [1.   0.   0.  ]
 [0.   0.05 0.95]
 [0.   0.   1.  ]
 [0.   0.89 0.11]
 [0.   1.   0.  ]
 [0.   0.02 0.98]
 [0.   0.   1.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [1.   0.   0.  ]
 [0.   0.   1.  ]
 [1.   0.   0.  ]
 [0.   0.01 0.99]
 [0.   0.   1.  ]
 [0.   0.99 0.01]
 [1.   0.   0.  ]
 [0.03 0.93 0.04]]


In [57]:
#using score feature to evaluate our model and show mean accuracy on prediction of test data and class labels
print(clf.score(input_test, output_test))

0.9666666666666667
