# Regression using Ensemble techniques on Boston dataset

In [1]:
#import essential libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

In [2]:
#load the dataset
from sklearn.datasets import load_boston
boston_dataset=load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [3]:
# input data (independent variable)
x=boston_dataset.data

## Output Data(dependent variable)
y=boston_dataset.target

In [4]:
# splitting data to training and testing dataset.

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.3,random_state=1)

# Random forest

In [5]:
from sklearn.ensemble import RandomForestRegressor
regressor=RandomForestRegressor(n_estimators=100)
regressor.fit(xtrain,ytrain)

RandomForestRegressor()

In [6]:
y_pred=regressor.predict(xtest)

In [21]:
from sklearn.metrics import r2_score,mean_squared_error
rf_score=r2_score(ytest,y_pred)
rf_score

0.9109167703419092

In [8]:
mse_rf=mean_squared_error(ytest,y_pred)
mse_rf

8.164877723684215

In [9]:
y_pred

array([28.769, 26.798, 19.402, 20.919, 19.542, 19.525, 28.755, 18.898,
       20.212, 23.391, 27.622, 31.832, 20.356, 20.197, 19.867, 25.408,
       12.194, 41.66 , 23.788, 14.625, 19.613, 17.397, 24.253, 23.925,
       26.547,  9.288, 14.411, 20.082, 38.871, 12.363, 27.053, 20.248,
       47.07 , 16.08 , 23.017, 21.075, 15.265, 32.745, 13.042, 19.508,
       24.566, 23.221, 25.701, 15.788, 16.093, 10.84 , 48.164, 12.229,
       21.677, 18.637, 24.162, 21.5  , 24.852, 21.077, 10.993, 23.947,
       11.496, 24.769, 18.429, 42.023, 14.504, 26.964, 13.63 , 15.382,
       18.72 , 32.963, 41.955, 24.951, 21.523, 20.147, 24.014,  7.376,
       18.289, 21.918, 19.643, 20.835, 46.63 , 24.317, 29.15 , 33.648,
       17.038, 20.517, 33.184, 11.514, 24.818, 25.681, 15.2  , 25.098,
       19.668, 16.982, 27.529, 43.495, 15.526, 21.309, 15.587, 20.301,
       23.494, 22.801, 43.567, 20.877, 17.011, 15.508, 24.608, 23.833,
        9.108, 20.637, 16.627, 30.075, 25.041, 26.249, 46.856, 23.26 ,
      

In [10]:
xtest[0]

array([4.9320e-02, 3.3000e+01, 2.1800e+00, 0.0000e+00, 4.7200e-01,
       6.8490e+00, 7.0300e+01, 3.1827e+00, 7.0000e+00, 2.2200e+02,
       1.8400e+01, 3.9690e+02, 7.5300e+00])

In [11]:
# lets try our models accuracy 

y_predict=regressor.predict([[4.9320e-02, 3.3000e+01, 2.1800e+00, 0.0000e+00, 4.7200e-01,
       6.8490e+00, 7.0300e+01, 3.1827e+00, 7.0000e+00, 2.2200e+02,
       1.8400e+01, 3.9690e+02, 7.5300e+00]])
y_predict

array([28.769])

# Adaboost Regressor

In [12]:
from sklearn.ensemble import AdaBoostRegressor
adb_reg=AdaBoostRegressor()
adb_reg.fit(xtrain,ytrain)

AdaBoostRegressor()

In [13]:
adb_pred=adb_reg.predict(xtest)

In [22]:
from sklearn.metrics import r2_score,mean_squared_error
ab_score=r2_score(ytest,adb_pred)
ab_score

0.8224514685678044

In [15]:
mse_ab=mean_squared_error(ytest,adb_pred)
mse_ab

16.2731195840958

# Gradient regressor

In [16]:
from sklearn.ensemble import GradientBoostingRegressor
gb_reg=GradientBoostingRegressor()
gb_reg.fit(xtrain,ytrain)

GradientBoostingRegressor()

In [17]:
gb_pred=gb_reg.predict(xtest)

In [23]:
from sklearn.metrics import r2_score,mean_squared_error
gb_score=r2_score(ytest,gb_pred)
gb_score

0.9222125973453925

In [19]:
mse_gb=mean_squared_error(ytest,gb_pred)
mse_gb

7.129564493289287

# Comparison 

In [25]:
import pandas as pd
  
# initialize list of lists
data = [["RF",rf_score, mse_rf], ["Adaboost",ab_score, mse_ab], ["Gboost",gb_score, mse_gb]]
  
# Create the pandas DataFrame
df = pd.DataFrame(data, columns=['Name', 'r2_score',"mse"])
  
# print dataframe.
df

Unnamed: 0,Name,r2_score,mse
0,RF,0.910917,8.164878
1,Adaboost,0.822451,16.27312
2,Gboost,0.922213,7.129564


Conclusion: Among all three techniques, gradient boost seems to be best for given dataset