# **Random Forest**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('PastHires.csv')
df.head(2)

Unnamed: 0,Years Experience,Employed?,Previous employers,Level of Education,Top-tier school,Interned,Hired
0,10,Y,4,BS,N,N,Y
1,0,N,0,BS,Y,Y,Y


In [3]:
# Check missing values
df.isnull().sum()

Years Experience      0
Employed?             0
Previous employers    0
Level of Education    0
Top-tier school       0
Interned              0
Hired                 0
dtype: int64

In [4]:
# Mapping of strings (labels) to numbers
d = {'Y': 1, 'N': 0}
df['Hired'] = df['Hired'].map(d)
df['Employed?'] = df['Employed?'].map(d)
df['Top-tier school'] = df['Top-tier school'].map(d)
df['Interned'] = df['Interned'].map(d)
d = {'BS': 0, 'MS': 1, 'PhD': 2}
df['Level of Education'] = df['Level of Education'].map(d)
df.head()

Unnamed: 0,Years Experience,Employed?,Previous employers,Level of Education,Top-tier school,Interned,Hired
0,10,1,4,0,0,0,1
1,0,0,0,0,1,1,1
2,7,0,6,0,0,0,0
3,2,1,1,1,1,0,1
4,20,0,2,2,1,0,0


In [5]:
# Split data in x & y
y = df['Hired']
x = df.drop( ['Hired'], axis=1 )

In [6]:
# Build model

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10)

In [7]:
# Train the model
clf = clf.fit(x, y)

In [8]:
# Predict for following
# exp 10 yrs, employed, 4 employers, BS, No top tier school,  no internship
clf.predict( [[10, 1, 4, 0, 0, 0]] )

array([1], dtype=int64)

In [9]:
# fresher, not employed, 0 , BS, No top tier school,  no internship
clf.predict( [[0, 0, 0, 0, 0, 0]] )

array([0], dtype=int64)

## **IoT Sensor Fatal Accident prediction**

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import tree

In [12]:
car_train = pd.read_csv('Car_Accidents_IOT/train.csv')
car_test = pd.read_csv('Car_Accidents_IOT/test.csv')

car_train.head(2)

Unnamed: 0,Fatal,S1,S2,S3,S4,S5,S6,S7,S8,S9,...,S13,S14,S15,S16,S17,S18,S19,S20,S21,S22
0,1,36.2247,10.7733,0.243897,596,100.671,0.0,0.0,1,28,...,1,57,0.0,0.28,240,5.99375,0,0.0,4,14.9382
1,1,35.7343,17.4551,0.243897,600,100.0,0.0,0.0,1,14,...,1,57,0.0,0.175,240,5.99375,0,0.0,4,14.8827


In [13]:
car_train.shape
car_test.shape

(9065, 23)

In [14]:
x = car_train [ car_train.columns[1:23] ]
y = car_train['Fatal']

In [15]:
x.head(2)

Unnamed: 0,S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,...,S13,S14,S15,S16,S17,S18,S19,S20,S21,S22
0,36.2247,10.7733,0.243897,596,100.671,0.0,0.0,1,28,0.016064,...,1,57,0.0,0.28,240,5.99375,0,0.0,4,14.9382
1,35.7343,17.4551,0.243897,600,100.0,0.0,0.0,1,14,0.015812,...,1,57,0.0,0.175,240,5.99375,0,0.0,4,14.8827


In [16]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10, min_samples_split=2, min_samples_leaf=1, random_state=0)

In [17]:
# Train Model
clf.fit(x, y)

# Predict for car_test
var=list( car_test.columns[1:33] )
tree_predict = clf.predict( car_test[ var ] )

In [18]:
# Confusion matrix for validating model

from sklearn.metrics import confusion_matrix

cm1 = confusion_matrix( car_test[['Fatal']], tree_predict )
cm1

array([[3484,  408],
       [ 327, 4846]], dtype=int64)

In [19]:
accuracy = (3484 + 4846 ) / (3484 + 408 + 327 + 4846)
accuracy

0.918918918918919

## **Mushroom classification with Random Forest**

In [21]:
df = pd.read_csv('mushrooms.csv')
df.head(2)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g


In [22]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()


In [23]:
for col in df.columns:
    df[col] = le.fit_transform(df[col])

In [24]:
x = df.iloc[:, 1:23].values
y = df.iloc[:, 0].values

In [25]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=5)

In [27]:
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

In [28]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

1.0

In [29]:
from sklearn.metrics import confusion_matrix

cm1 = confusion_matrix( y_test, y_pred )
cm1

array([[852,   0],
       [  0, 773]], dtype=int64)

Supervisory Modelling  
Regression  

-infinity 0 1 100 1000 infinity  
Linear Regression  

Classification - Catergory, Yes/no, win/lose  

Logistic  
Decision Tree  
Random Forest ()  
Boosting  
Naive Bayes  - probability - smam  
KNN  
SVM  
  
  
Un-supervisory  


In [31]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split


In [32]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

1.0

In [33]:
from sklearn.metrics import confusion_matrix

cm1 = confusion_matrix( y_test, y_pred )
cm1


array([[852,   0],
       [  0, 773]], dtype=int64)