# Random Forest Regression vs. Classification
## 1. RF Regression - Predict Gas Consumption

In [1]:
import os

import numpy as np
import pandas as pd

In [10]:
# Resolve the data folder from the RF_DATA_DIR environment variable so the
# notebook can run on other machines; the original local folder is the default.
DATA_DIR = os.environ.get("RF_DATA_DIR", "/Users/MPHA/Desktop/UCLA/data")

# Petrol-consumption table used for the regression example
dataset = pd.read_csv(os.path.join(DATA_DIR, "petrol_consumption.csv"))

In [11]:
# Preview the first five rows to check the columns and their value ranges
dataset.head(n=5)

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


#### These values will likely need to be scaled

## Prepare the Data

In [12]:
# The first four columns are the predictors; the fifth column (index 4) is the target
X, y = dataset.iloc[:, :4].values, dataset.iloc[:, 4].values

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

## Feature Scaling
#### May not be needed for Random Forest algorithms (tree splits are unaffected by feature scaling)

In [14]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

## Train the Algorithm

In [15]:
# n_estimators - number of trees in the forest

from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(random_state=0, n_estimators=20)
# fit() returns the fitted estimator, so training and prediction can be chained
y_pred = regressor.fit(X_train, y_train).predict(X_test)

  from numpy.core.umath_tests import inner1d


## Evaluate the Algorithm
#### Find the MAE, MSE, and RMSE

In [16]:
from sklearn import metrics

# Compute each error once; RMSE is the square root of the MSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 51.76500000000001
Mean Squared Error: 4216.166749999999
Root Mean Squared Error: 64.93201637097064


In [18]:
# Average consumption, for putting the error magnitudes above in context
dataset.loc[:, 'Petrol_Consumption'].mean()

576.7708333333334

#### Increase the number of Trees

In [19]:
# Same model as before, but with 250 trees instead of 20
regressor_2 = RandomForestRegressor(random_state=0, n_estimators=250)
y_pred_2 = regressor_2.fit(X_train, y_train).predict(X_test)

# Compute each error once; RMSE is the square root of the MSE
mae_2 = metrics.mean_absolute_error(y_test, y_pred_2)
mse_2 = metrics.mean_squared_error(y_test, y_pred_2)

print('Mean Absolute Error:', mae_2)
print('Mean Squared Error:', mse_2)
print('Root Mean Squared Error:', np.sqrt(mse_2))

Mean Absolute Error: 47.8524
Mean Squared Error: 3456.6889552000016
Root Mean Squared Error: 58.79361321776373


## 2. RF Classification - Predict Bank Currency Note Authenticity
#### Binary Classification problem, using 4 attributes

In [20]:
# Resolve the data folder from the RF_DATA_DIR environment variable so the
# notebook can run on other machines; the original local folder is the default.
DATA_DIR = os.environ.get("RF_DATA_DIR", "/Users/MPHA/Desktop/UCLA/data")

# Bank-note authentication table used for the classification example
dataset2 = pd.read_csv(os.path.join(DATA_DIR, "bill_authentication.csv"))

dataset2.head()

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


#### This data will also be scaled
## Prepare the Data

In [21]:
# The first four columns are the predictors; the fifth column (index 4) is the class label
Xc, yc = dataset2.iloc[:, :4].values, dataset2.iloc[:, 4].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc,
    yc,
    random_state=0,
    test_size=0.2,
)

## Feature Scaling

In [22]:
# Use a dedicated scaler for the classification features instead of re-fitting
# `sc`, which holds the scaling statistics of the regression features; sharing
# one object would silently overwrite them and break out-of-order re-runs.
sc_clf = StandardScaler()
Xc_train = sc_clf.fit_transform(Xc_train)  # statistics come from the training rows only
Xc_test = sc_clf.transform(Xc_test)

## Train the Algorithm
#### Use RF Classifier - instead of RF Regressor

In [23]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(random_state=0, n_estimators=20)
# fit() returns the fitted estimator, so training and prediction can be chained
yc_pred = classifier.fit(Xc_train, yc_train).predict(Xc_test)

## Evaluate the Algorithm
#### Different metrics are used - accuracy, F1 score, precision, recall, confusion matrix, etc.

In [24]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Print the confusion matrix, the per-class report and the overall accuracy,
# each starting on its own line
print(
    confusion_matrix(yc_test, yc_pred),
    classification_report(yc_test, yc_pred),
    accuracy_score(yc_test, yc_pred),
    sep='\n',
)

[[155   2]
 [  1 117]]
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       157
          1       0.98      0.99      0.99       118

avg / total       0.99      0.99      0.99       275

0.9890909090909091
