# ENSEMBLE LEARNING - Random Forest

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

import seaborn as sns
# Global seaborn theme: "ticks" style, short color codes enabled, fonts scaled 1.5x.
sns.set(style="ticks", color_codes=True, font_scale=1.5)
# Keep the current seaborn color palette under the name `color`.
color = sns.color_palette()
# NOTE(review): this selects 'darkgrid' right after style="ticks" was requested
# above, so the earlier style choice is presumably overridden — confirm intent.
sns.set_style('darkgrid')


from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier



### Example-1 IRIS

In [None]:
# Load the Iris data, discard the "Id" column and expose the
# "species" column under the name "label".
iris = (
    pd.read_csv("data/Iris.csv")
    .drop("Id", axis=1)
    .rename(columns={"species": "label"})
)

In [None]:
# Preview the first rows of the Iris frame.
iris.head()

In [None]:
# Summarize the frame: column names, dtypes and non-null counts.
iris.info()

In [None]:
# Every column except the last is a feature; the last column is the target.
X, y = iris.iloc[:, :-1].values, iris.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.20,
    random_state=2794,
)

##### Logistic Regression

In [None]:
# Fit a default logistic-regression classifier on the training split,
# then predict both splits so their accuracies can be compared.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred_train, y_pred_test = log_reg.predict(X_train), log_reg.predict(X_test)

In [None]:
# Accuracy on each split, rounded to two decimals.
train_accuracy = np.round(accuracy_score(y_train, y_pred_train), 2)
test_accuracy = np.round(accuracy_score(y_test, y_pred_test), 2)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [None]:
# Print a two-line banner, then let the notebook display the matrix
# computed from the test labels and the test predictions.
print("Confusion Matrix-Test Dataset:::", "--------------------------------", sep="\n")
confusion_matrix(y_test, y_pred_test)

In [None]:
# Count how many test samples carry each label.
pd.Series(y_test).value_counts()

##### Decision Tree

In [None]:
# Fit an otherwise-default decision tree on the training split and predict both splits.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can differ between runs when splits tie.
dec_tree = DecisionTreeClassifier(random_state=2794)
dec_tree.fit(X_train, y_train)
y_pred_train = dec_tree.predict(X_train)
y_pred_test = dec_tree.predict(X_test)

In [None]:
# Accuracy of the current predictions on each split, rounded to two decimals.
train_accuracy = np.round(accuracy_score(y_train, y_pred_train), 2)
test_accuracy = np.round(accuracy_score(y_test, y_pred_test), 2)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [None]:
# Banner first, then the confusion matrix of the test split as the cell output.
print("Confusion Matrix-Test Dataset:::", "--------------------------------", sep="\n")
confusion_matrix(y_test, y_pred_test)

In [None]:
# Scatter petal length against petal width, colored by label (no regression fit).
# `height` replaces the former `size` argument, which seaborn renamed in 0.9
# and later removed, so the old keyword fails on current releases.
sns.lmplot(data=iris, x='petal_width', y='petal_length', hue='label',
           fit_reg=False, height=6, aspect=1.5);
# Hand-placed axis-parallel cuts: vertical lines at petal_width 0.8 and 1.75,
# and a horizontal line at petal_length 5.2 drawn only between those two.
plt.vlines(x=0.8, ymin=1, ymax=7);
plt.vlines(x=1.75, ymin=1, ymax=7);
plt.hlines(y=5.2, xmin=0.8, xmax=1.75);


### Example-2 Housing Price

In [None]:
# Load the housing-price workbook and preview its first rows.
df = pd.read_excel("data/HousingPrice.xls")
df.head()

In [None]:
# Summarize the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Normalize column names: trim whitespace, lower-case, turn spaces and '/'
# into underscores, and drop ')'.
# regex=False makes each replacement a plain literal substitution regardless of
# the pandas version's default (')' on its own is not a valid regular expression).
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace(')', '', regex=False)
)
df.columns

In [None]:
df = (
    df.set_index('pid')      # use the 'pid' column as the index
    .drop('order', axis=1)   # drop the unnecessary 'order' column
    .dropna()                # discard rows with missing values
)

In [None]:
# Show the column names that remain after cleaning.
df.columns

In [None]:
# Replace categorical columns with indicator (dummy) columns, then list the result.
df = pd.get_dummies(df)
df.columns

In [None]:
# Explicit column order for the modelling frame.
# 'sale_price' is listed last on purpose: the later split cell takes the
# features with iloc[:, :-1] and the target with iloc[:, -1].
column_order = ['lot_area', 'overall_qual', 'overall_cond', 'year_built',
       'year_remod_add', 'f_flr', 's_flr', 'yr_sold', 
       'ms_zoning_C (all)', 'ms_zoning_FV', 'ms_zoning_I (all)',
       'ms_zoning_RH', 'ms_zoning_RL', 'ms_zoning_RM', 'lot_shape_IR1',
       'lot_shape_IR2', 'lot_shape_IR3', 'lot_shape_Reg', 'utilities_AllPub',
       'utilities_NoSeWa', 'utilities_NoSewr', 'condition_1_Artery',
       'condition_1_Feedr', 'condition_1_Norm', 'condition_1_PosA',
       'condition_1_PosN', 'condition_1_RRAe', 'condition_1_RRAn',
       'condition_1_RRNe', 'condition_1_RRNn', 'condition_2_Artery',
       'condition_2_Feedr', 'condition_2_Norm', 'condition_2_PosA',
       'condition_2_PosN', 'condition_2_RRAe', 'condition_2_RRAn',
       'condition_2_RRNn', 'bldg_type_1Fam', 'bldg_type_2fmCon',
       'bldg_type_Duplex', 'bldg_type_Twnhs', 'bldg_type_TwnhsE',
       'house_style_1.5Fin', 'house_style_1.5Unf', 'house_style_1Story',
       'house_style_2.5Fin', 'house_style_2.5Unf', 'house_style_2Story',
       'house_style_SFoyer', 'house_style_SLvl', 'foundation_BrkTil',
       'foundation_CBlock', 'foundation_PConc', 'foundation_Stone',
       'foundation_Wood', 'bsmt_qual_Ex', 'bsmt_qual_Fa', 'bsmt_qual_Gd',
       'bsmt_qual_Po', 'bsmt_qual_TA', 'central_air_N', 'central_air_Y',
       'kitchen_qual_Ex', 'kitchen_qual_Fa', 'kitchen_qual_Gd',
       'kitchen_qual_Po', 'kitchen_qual_TA', 'sale_price']

In [None]:
# Keep only the listed columns, in that order, then preview the result.
df = df.loc[:, column_order]
df.head()

In [None]:
# Features are all columns but the last; the last column is the target.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Default-sized train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2794,
)

In [None]:
# Show the (rows, columns) shape of the held-out feature matrix.
X_test.shape

In [None]:
# Linear-regression baseline: fit on the training split, predict both splits.
lr = LinearRegression().fit(X_train, y_train)
y_pred_train, y_pred_test = lr.predict(X_train), lr.predict(X_test)

# The value printed as "Accuracy" is the R^2 score rounded to two decimals;
# RMSE is the square root of the mean squared error.
train_r2 = np.round(r2_score(y_train, y_pred_train), 2)
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Training Accuracy:", train_r2)
print("Training RMSE:", train_rmse)

test_r2 = np.round(r2_score(y_test, y_pred_test), 2)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Test Accuracy:", test_r2)
print("Test RMSE:", test_rmse)

In [None]:
# Unconstrained regression tree: fit on the training split, predict both splits.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can give different scores on each run.
dec_tree = DecisionTreeRegressor(random_state=2794)
dec_tree.fit(X_train, y_train)
y_pred_train = dec_tree.predict(X_train)
y_pred_test = dec_tree.predict(X_test)
# The value printed as "Accuracy" is the R^2 score rounded to two decimals.
print("Training Accuracy:", np.round(r2_score(y_train, y_pred_train),2))
print("Training RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))

print("Test Accuracy:", np.round(r2_score(y_test, y_pred_test),2))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))

### Bias-Variance Tradeoff
![](img/bias1.png)
![](img/bias2.png)

---
### So, is the Decision Tree a WOT (waste of time)?
![](img/think.png)

In [None]:
# Regularized regression tree: depth capped at 5 and at least 5 samples
# required to split a node.
# random_state is pinned because scikit-learn permutes the candidate features at
# every split, so an unseeded tree can give different scores on each run.
dec_tree = DecisionTreeRegressor(max_depth=5, min_samples_split=5, random_state=2794)
dec_tree.fit(X_train, y_train)
y_pred_train = dec_tree.predict(X_train)
y_pred_test = dec_tree.predict(X_test)
# The value printed as "Accuracy" is the R^2 score rounded to two decimals.
print("Training Accuracy:", np.round(r2_score(y_train, y_pred_train),2))
print("Training RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))

print("Test Accuracy:", np.round(r2_score(y_test, y_pred_test),2))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))

# Here comes the handy Ensemble Learning

#### 1) Bagging (Ex. Random Forest)
![](img/bagging.png)
#### 2) Boosting (Ex. Gradient Boosting)
![](img/boosting.png)
#### 3) Stacking
![](img/stacking.png)

## Random Forest
![](img/rf1.jpg)

---
![](img/rf2.png)

In [None]:
# A "forest" of a single tree, as a starting point before adding more estimators.
# random_state is pinned (same seed as the other forests in this notebook):
# the bootstrap sample and feature selection are random, so without a seed
# the printed scores change on every run.
rf = RandomForestRegressor(n_estimators=1, random_state=50)
rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)
# The value printed as "Accuracy" is the R^2 score rounded to two decimals.
print("Training Accuracy:", np.round(r2_score(y_train, y_pred_train),2))
print("Training RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))

print("Test Accuracy:", np.round(r2_score(y_test, y_pred_test),2))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))

In [None]:
# IPython help lookup (trailing '?'): displays the RandomForestRegressor documentation.
RandomForestRegressor?

In [None]:
# Seeded forest of 100 trees, each at most 9 levels deep and considering
# 60% of the features per split.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=9,
    random_state=50,
    max_features=.60,
).fit(X_train, y_train)
y_pred_train, y_pred_test = rf.predict(X_train), rf.predict(X_test)

# The value printed as "Accuracy" is the R^2 score rounded to two decimals.
train_r2 = np.round(r2_score(y_train, y_pred_train), 2)
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("Training Accuracy:", train_r2)
print("Training RMSE:", train_rmse)

test_r2 = np.round(r2_score(y_test, y_pred_test), 2)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Test Accuracy:", test_r2)
print("Test RMSE:", test_rmse)

In [None]:
# Display the fitted forest's feature-importance array.
rf.feature_importances_

##### Hyper Parameter Tuning

In [None]:
# Candidate hyper-parameter values for the forest, one entry per parameter.
param_grid = {
    'n_estimators': [10, 50, 75, 90, 100],
    'max_depth': [1, 3, 5, 7, 9],
    'max_features': [.40, .50, .60, 'sqrt', 'log2'],
}
print("Parameter grid:\n{}".format(param_grid))

In [None]:
# grid_search = GridSearchCV(RandomForestRegressor(random_state = 50, n_jobs = -1), param_grid)
# grid_search.fit(X_train, y_train)
# print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
# print("Best parameters: {}".format(grid_search.best_params_))


In [None]:
# y_pred_train = grid_search.predict(X_train)
# y_pred_test = grid_search.predict(X_test)
# print("Training Accuracy:", np.round(r2_score(y_train, y_pred_train),2))
# print("Training RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))

# print("Test Accuracy:", np.round(r2_score(y_test, y_pred_test),2))
# print("Test RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))

### Out of Bag (OOB) Score

In [None]:
# Same forest configuration as before, but with out-of-bag scoring switched on
# and fitted on the full data set (X, y) instead of the training split.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=9,
    random_state=50,
    max_features=.60,
    oob_score=True,
).fit(X, y)
y_pred, y_pred_oob = rf.predict(X), rf.oob_prediction_

# In-sample fit; the value printed as "Accuracy" is the R^2 score rounded to two decimals.
fit_r2 = np.round(r2_score(y, y_pred), 2)
fit_rmse = np.sqrt(mean_squared_error(y, y_pred))
print("Training Accuracy:", fit_r2)
print("Training RMSE:", fit_rmse)

# The same metrics computed from the out-of-bag predictions.
oob_r2 = np.round(r2_score(y, y_pred_oob), 2)
oob_rmse = np.sqrt(mean_squared_error(y, y_pred_oob))
print("OOB Accuracy:", oob_r2)
print("OOB RMSE:", oob_rmse)

# The estimator's own out-of-bag score, printed under the same label.
print("OOB Accuracy:", np.round(rf.oob_score_, 2))