In [75]:
# Loading the library with the iris dataset
from sklearn.datasets import load_iris
# Loading scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so the np.random-based train/test
# assignment later in the notebook produces the same split on every run.
np.random.seed(0)

In [76]:
# Fetch the iris dataset and place its measurements in a DataFrame whose
# columns are named after the dataset's features.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [77]:
# Attach a categorical 'species' column: iris.target supplies the per-row codes
# and iris.target_names the category each code stands for.
df['species'] = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [78]:
# Draw one uniform random number between 0 and 1 for every row. Rows whose draw
# is <= 0.75 are flagged True (training rows, roughly 75% of the data); the
# remaining rows are flagged False (test rows). The flags go in a new column.
df['is_train'] = np.random.uniform(low=0, high=1, size=len(df)) <= 0.75

In [79]:
# (rows, columns) of the frame after adding the 'species' and 'is_train' columns
df.shape

(150, 6)

In [80]:
# Inspect the last rows, including the new 'species' and 'is_train' columns
df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
145,6.7,3.0,5.2,2.3,virginica,True
146,6.3,2.5,5.0,1.9,virginica,True
147,6.5,3.0,5.2,2.0,virginica,False
148,6.2,3.4,5.4,2.3,virginica,True
149,5.9,3.0,5.1,1.8,virginica,False


In [81]:
# Partition the rows on the boolean 'is_train' flag: flagged rows form the
# training frame, unflagged rows form the test frame.
train = df[df['is_train']]
test = df[~df['is_train']]
# Report how many observations landed in each partition
print('Number of observaton in the training data:', len(train))
print('Number of observation in the test data:', len(test))

Number of observaton in the training data: 118
Number of observation in the test data: 32


In [82]:
# The first four columns are the measurement columns the frame was built from;
# 'species' and 'is_train' were appended after them, so they are excluded here.
features = df.columns[:4]
print(features)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


In [83]:
# Convert each species into an integer label using the categorical's own codes.
# The 'species' column was built with iris.target_names as its categories, so
# these codes index iris.target_names directly, which is how predictions are
# decoded back to names later (iris.target_names[clf.predict(...)]).
# pd.factorize instead numbers labels by order of first appearance in `train`,
# which only agrees with iris.target_names while the rows stay ordered
# setosa, versicolor, virginica and every species is present in `train`.
# The codes are cast to int64 to keep the label dtype the same as before.
y = train['species'].cat.codes.to_numpy(dtype='int64')
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [84]:
# creating a random forest classifier
clf = RandomForestClassifier(n_jobs=2, random_state=0) # n_jobs=2 runs the work as 2 parallel jobs (unrelated to the number of variables); random_state=0 makes the forest reproducible
# Training the classifier on the training rows' feature columns and their integer labels
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [85]:
# Peek at the feature columns of the first few held-out test rows
test[features].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
10,5.4,3.7,1.5,0.2
13,4.3,3.0,1.1,0.1
17,5.1,3.5,1.4,0.3


In [86]:
# applying the trained classifier to the test rows; the result holds one
# predicted integer class label per test row
clf.predict(test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [87]:
# Viewing the predicted class probabilities for every test observation
# (one row per observation, one column per class)
clf.predict_proba(test[features])

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.95, 0.05, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.99, 0.01, 0.  ],
       [1.  , 0.  , 0.  ],
       [0.  , 0.67, 0.33],
       [0.  , 1.  , 0.  ],
       [0.  , 0.82, 0.18],
       [0.  , 0.03, 0.97],
       [0.  , 0.42, 0.58],
       [0.  , 0.99, 0.01],
       [0.  , 0.96, 0.04],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.01, 0.99],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.01, 0.99],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.02, 0.98]])

In [88]:
# Probabilities for the test rows at positions 15-19 only
clf.predict_proba(test[features])[15:20]

array([[0.  , 0.82, 0.18],
       [0.  , 0.03, 0.97],
       [0.  , 0.42, 0.58],
       [0.  , 0.99, 0.01],
       [0.  , 0.96, 0.04]])

In [89]:
# viewing the PREDICTED values of the species column: each predicted integer
# label is used as an index into iris.target_names (this assumes the training
# labels follow the order of iris.target_names), then positions 10-24 are shown
preds = iris.target_names[clf.predict(test[features])]
preds[10:25]

array(['setosa', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'versicolor', 'virginica', 'virginica', 'versicolor', 'versicolor',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica'],
      dtype='<U10')

In [90]:
# show the predicted species name for every test row
preds

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica'],
      dtype='<U10')

In [91]:
# viewing the ACTUAL species of the test rows at positions 10-24
# (selected by position, not by index label)
test['species'].iloc[10:25]

27         setosa
31         setosa
38         setosa
52     versicolor
66     versicolor
68     versicolor
70     versicolor
72     versicolor
89     versicolor
98     versicolor
103     virginica
109     virginica
111     virginica
114     virginica
116     virginica
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [92]:
# creating a CONFUSION MATRIX: rows are the actual species, columns the
# predicted species, and each cell counts the test rows with that combination
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


In [93]:
# MODEL ACCURACY
# Fraction of test rows whose predicted species name equals the actual one.
# Computed from `preds` (the test-set predictions) and the actual labels rather
# than from counts typed in by hand, so it stays correct if the split or the
# model changes. float() keeps the displayed value a plain Python float.
float(np.mean(preds == test['species'].to_numpy()))

0.9375

# So the Model Accuracy is

#   93.75 %

In [94]:
# take some values and predict the species
# The sample is wrapped in a DataFrame carrying the training column names
# because clf was fitted on a DataFrame; newer scikit-learn versions warn when
# a model fitted with feature names is given input without them.
preds = iris.target_names[clf.predict(pd.DataFrame([[5, 3.6, 1.4, 2]], columns=features))]
preds

array(['setosa'], dtype='<U10')

In [95]:
# Predict the species for two hand-written samples at once. They are wrapped in
# a DataFrame carrying the training column names because clf was fitted on a
# DataFrame; newer scikit-learn versions warn when a model fitted with feature
# names is given input without them.
preds = iris.target_names[clf.predict(pd.DataFrame([[5, 3.7, 1.9, 2.1], [6.3, 3.1, 3.8, 4.7]], columns=features))]
preds

array(['setosa', 'virginica'], dtype='<U10')