In [6]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_boston

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [8]:
boston_data=load_boston()
df=pd.DataFrame(boston_data.data,columns=boston_data.feature_names)
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [9]:
df['LSTAT'].value_counts()

7.79     3
14.10    3
6.36     3
18.13    3
8.05     3
        ..
3.32     1
1.92     1
1.73     1
6.43     1
7.88     1
Name: LSTAT, Length: 455, dtype: int64

In [10]:
x=pd.DataFrame(boston_data.data,columns=boston_data.feature_names)
y=pd.Series(boston_data.target)

In [11]:
x

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [12]:
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Length: 506, dtype: float64

# splitting

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=1)

# Random Forest

# Model Training

In [14]:
rf_reg = RandomForestRegressor()
rf_reg.fit(x_train,y_train)

# Evaluation

In [15]:
## Testing Data Evaluation
y_pred = rf_reg.predict(x_test)

mse = mean_squared_error(y_test,y_pred)
print('Mean Squared Error ', mse)

r2 = r2_score(y_test,y_pred)
print('R2 Score: ', r2)

mae=mean_absolute_error(y_test,y_pred)
print('mean absolute Error: ',mae)


Mean Squared Error  8.753338532894741
R2 Score:  0.9044963448088208
mean absolute Error:  2.26283552631579


In [16]:
## Training Data Evaluation
y_pred_train = rf_reg.predict(x_train)

mse = mean_squared_error(y_train,y_pred_train)
print('Mean Squared Error ', mse)

r2 = r2_score(y_train,y_pred_train)
print('R2 Score: ', r2)

mae=mean_absolute_error(y_train,y_pred_train)
print('mean absolute Error: ',mae)

Mean Squared Error  1.7284776949152532
R2 Score:  0.9787104614123563
mean absolute Error:  0.8567231638418069


# Decision Tree

In [17]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(x_train, y_train)

In [18]:
## Testing Data Evaluation
y_pred = dt_reg.predict(x_test)

mse = mean_squared_error(y_test,y_pred)
print('Mean Squared Error ', mse)

r2 = r2_score(y_test,y_pred)
print('R2 Score: ', r2)

mae=mean_absolute_error(y_test,y_pred)
print('mean absolute Error: ',mae)


Mean Squared Error  20.09263157894737
R2 Score:  0.7807785279881552
mean absolute Error:  3.0118421052631574


In [19]:
## Traning Data Evaluation
y_pred = dt_reg.predict(x_train)

mse = mean_squared_error(y_train,y_pred_train)
print('Mean Squared Error ', mse)

r2 = r2_score(y_train,y_pred_train)
print('R2 Score: ', r2)

mae=mean_absolute_error(y_train,y_pred_train)
print('mean absolute Error: ',mae)


Mean Squared Error  1.7284776949152532
R2 Score:  0.9787104614123563
mean absolute Error:  0.8567231638418069


# Hyper parameter Tuning for Random Forest

In [20]:
rf_model = RandomForestRegressor()
hyp = {'n_estimators': np.arange(20,150),
    'criterion':['mse','mae'],
    'max_depth':np.arange(3,9),
    'min_samples_split':np.arange(3,20),
    'min_samples_leaf':np.arange(2,10),
    }

gscv_rf_model = RandomizedSearchCV(rf_model, hyp, cv=5)
gscv_rf_model.fit(x_train,y_train)

In [21]:
best_esti=gscv_rf_model.best_estimator_
best_esti

In [22]:
best_esti.fit(x_train,y_train)

In [23]:
# Testing data evaluation
y_pred = best_esti.predict(x_test)

r2_sco=r2_score(y_test,y_pred)
print('r2 score is: ',r2_sco)

mae=mean_absolute_error(y_test,y_pred)
print('mean absolute error is: ', mae)

mse=mean_squared_error(y_test,y_pred)
print('mean squared error is: ', mse)

r2 score is:  0.8648004496540005
mean absolute error is:  2.643332304777411
mean squared error is:  12.391645443356648


In [24]:
# Training data evaluation
y_pred_train = best_esti.predict(x_train)

r2_sco=r2_score(y_train,y_pred_train)
print('r2 score is: ',r2_sco)

mae=mean_absolute_error(y_train,y_pred_train)
print('mean absolute error is: ', mae)

mse=mean_squared_error(y_train,y_pred_train)
print('mean squared error is: ', mse)

r2 score is:  0.9008956417727205
mean absolute error is:  1.8804543326892424
mean squared error is:  8.04619000827782
