##  Classification using Random Forest Classifier


In [44]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

In [None]:
iris=datasets.load_iris()
iris

In [5]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [6]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [7]:
X=iris.data
#we assign to class
y=iris.target

In [8]:
X.shape
#150 flowers, 4 features

(150, 4)

In [9]:
clf=RandomForestClassifier()

In [10]:
clf.fit(X,y)

In [12]:
#importance of the features (4)
clf.feature_importances_

array([0.08298032, 0.01804577, 0.39302959, 0.50594431])

In [15]:
#predict something similar to first element
print(X[0])
#predictor gets assigned to first class
clf.predict([[5.1,3.5,1.4,0.1]])

[5.1 3.5 1.4 0.2]


array([0])

In [20]:
clf.predict([X[2]])

array(['setosa'], dtype='<U10')

In [19]:
clf.fit(iris.data,iris.target_names[iris.target])

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [23]:
X_train.shape

(120, 4)

In [29]:
clf.fit(X_train,y_train)

In [31]:
clf.predict(X_test)

array([2, 1, 0, 1, 1, 0, 1, 1, 2, 0, 2, 0, 1, 0, 0, 2, 1, 1, 0, 2, 2, 0,
       1, 1, 2, 2, 0, 2, 1, 2])

In [27]:
clf.score(X_test,y_test)

1.0

Hyperparameter Tuning


In [84]:
from sklearn.datasets import make_classification
import pandas as pd
#generate a dataset
X,Y=make_classification(n_samples=200,n_classes=2,n_features=10,n_redundant=0,random_state=42)

In [85]:
X.shape,Y.shape

((200, 10), (200,))

In [None]:
#pd.DataFrame(X)
#X

In [86]:
Y

array([0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1])

In [87]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.4)

In [88]:
rf=RandomForestClassifier(max_features=5,n_estimators=100)

In [89]:
rf.fit(X_train,y_train)

In [90]:
rf.score(X_test,y_test)

0.85

In [91]:
Y_pred=rf.predict(X_test)
Y_pred

array([1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1])

In [92]:
accuracy_score(Y_pred,y_test)

0.85

## Hyperparameter tuning

In [93]:
#n_estimators= no. of trees
#Grid search CV continuosly keeps performing hyper param tuning till it finds an optimal value, in grid format
# n_est=5, max_feature=6 then total runs=30
from sklearn.model_selection import GridSearchCV
import numpy as np


In [94]:
max_features_range=np.arange(1,6,1) # generate from 1 to 5, with skip value1
n_estimators_range=np.arange(10,210,10)
param_grid=dict(max_features=max_features_range,n_estimators=n_estimators_range,)

rf=RandomForestClassifier()
#n fold cross val=cv
grid=GridSearchCV(estimator=rf,param_grid=param_grid,cv=5)

In [95]:
type(y_train)

numpy.ndarray

In [96]:
grid.fit(X_train,y_train)

In [97]:
print(grid.best_params_,grid.best_score_)

{'max_features': 2, 'n_estimators': 170} 0.85


In [98]:
import pandas as pd

grid_results = pd.concat([pd.DataFrame(grid.cv_results_["params"]),pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)
grid_results.head()

Unnamed: 0,max_features,n_estimators,Accuracy
0,1,10,0.7
1,1,20,0.783333
2,1,30,0.733333
3,1,40,0.758333
4,1,50,0.783333
