In [1]:
# Loading the library with the iris dataset
from sklearn.datasets import load_iris
# Loading scikit'srandom forest classifier library 
from sklearn.ensemble import RandomForestClassifier 
# Loading pandas
import pandas as pd
# Loading numpy
import numpy as np 
# Setting random seed 
np.random.seed (0)

In [8]:
# Creating an object called iris with the iris data 
iris = load_iris()
# Creating a dataframewith the four feature variables 
df= pd.DataFrame(iris.data, columns=iris.feature_names) 
# Viewing the top 5 rows
df.head

<bound method NDFrame.head of      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                  5.1               3.5                1.4               0.2
1                  4.9               3.0                1.4               0.2
2                  4.7               3.2                1.3               0.2
3                  4.6               3.1                1.5               0.2
4                  5.0               3.6                1.4               0.2
..                 ...               ...                ...               ...
145                6.7               3.0                5.2               2.3
146                6.3               2.5                5.0               1.9
147                6.5               3.0                5.2               2.0
148                6.2               3.4                5.4               2.3
149                5.9               3.0                5.1               1.8

[150 rows x 4 columns]>

In [10]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [12]:
# Adding a new column for the species name 
#iris.target: A NumPy array of numeric codes (e.g., [0, 1, 2]).
#iris.target_names: A list of string labels (e.g., ['setosa', 'versicolor', 'virginica']).
#pd.Categorical.from_codes() maps the numeric codes to their corresponding string labels
df['species']=pd.Categorical.from_codes(iris.target,iris.target_names)
# Viewing the top 5 rows 
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [15]:
len(df)

150

In [14]:
# Creating Test and Train Data

'''
 adds a new column, is_train, to the DataFrame df. 
 The column contains boolean values (True or False), 
 which are used to indicate whether a particular row 
 should belong to the training set (True) or the test set (False).
 #np.random.uniform(low, high, size) >> size: The number of random numbers to generate.
'''
df['is_train'] =np.random.uniform(0, 1, len(df)) <= .75
# View the top 5 rows 
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [16]:
# Creating dataframes with test rows and training rows
train, test=df[df['is_train']==True], df[df['is_train']==False]
# Show the number of observations for the test and training dataframes 
print('Number of observations in the training data:', len(train)) 
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 112
Number of observations in the test data: 38


In [17]:
# Create a list of the feature column's names 
features = df.columns[:4]
# View features
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [18]:
train['species']

0         setosa
2         setosa
3         setosa
4         setosa
7         setosa
         ...    
141    virginica
143    virginica
144    virginica
145    virginica
146    virginica
Name: species, Length: 112, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [23]:
# Converting each species name into digits 
'''
It returns a tuple with two components:

The encoded values (an array of integers): 
Each unique value in the input is assigned a unique integer,starting from 0.
The unique values (an array of the original unique elements): 
This is a mapping that shows which integer corresponds to which original value.
'''
y = pd.factorize (train['species'])[0]  #[0] : is used to extract the first element of the tuple, which is the array of encoded integers.
# Viewing target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2], dtype=int64)

In [25]:
# Creating a random forest Classifier.
'''
The classifier is configured to:
Use two CPU cores for training and predictions (n_jobs=2).
Produce the same results on every run due to the fixed random_state=0.
'''
clf= RandomForestClassifier(n_jobs=2, random_state=0) 
# Training the classifier
clf.fit(train[features], y)

In [26]:
# Applying the trained Classifier to the test 
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [29]:
# Viewing the predicted probabilities of the first 10 observations 
'''
clf.predict_proba()
Purpose: This method predicts the probability distribution of classes for given input data.
For each instance, it outputs a list of probabilities,
where each probability corresponds to the likelihood of that instance belonging to a specific class.
The probabilities are computed as the fraction of decision trees in the Random Forest that predict a specific clas
'''
clf.predict_proba(test[features])[0:10]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.97, 0.03, 0.  ],
       [0.97, 0.03, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ]])

In [30]:
#mapping names for the plants for each predicted plant class 
preds= iris.target_names[clf.predict(test[features])]
# View the PREDICTED species for the first five observations 
preds[0:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [31]:
# Viewing the ACTUAL species for the first five observations 
test['species'].head()

1     setosa
5     setosa
6     setosa
13    setosa
14    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [32]:
# Creating confusion matrix
'''
The confusion matrix provides a summary of prediction results on a classification problem 
by comparing the actual and predicted classes. 
pd.crosstab()
Purpose: Creates a cross-tabulation table, 
which shows the frequency (count) of occurrences for combinations of values 
in two or more column
'''
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])


Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,0
virginica,0,0,19


In [34]:
preds= iris.target_names[clf.predict(test[features])]

In [35]:
preds

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica'], dtype='<U10')