In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Regression
============

Load the Boston house-prices dataset bundled with scikit-learn:

In [2]:
# Load the Boston house-prices dataset that ships with scikit-learn.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed scikit-learn version before re-running this cell.
from sklearn.datasets import load_boston
boston = load_boston()
# Echo the names of the fields exposed by the returned dataset object.
boston.keys()

['data', 'feature_names', 'DESCR', 'target']

In [3]:
# DESCR is a multi-line text description; print() renders its line breaks
# instead of showing the escaped string repr.
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [4]:
# Feature matrix dimensions: (number of samples, number of features).
boston.data.shape

(506, 13)

In [5]:
# Target vector dimensions: one value per sample.
boston.target.shape

(506,)

In [6]:
# Inspect the raw regression targets.
# NOTE(review): this echoes the whole array; a slice such as
# `boston.target[:10]` would keep the output short.
boston.target

array([ 24. ,  21.6,  34.7,  33.4,  36.2,  28.7,  22.9,  27.1,  16.5,
        18.9,  15. ,  18.9,  21.7,  20.4,  18.2,  19.9,  23.1,  17.5,
        20.2,  18.2,  13.6,  19.6,  15.2,  14.5,  15.6,  13.9,  16.6,
        14.8,  18.4,  21. ,  12.7,  14.5,  13.2,  13.1,  13.5,  18.9,
        20. ,  21. ,  24.7,  30.8,  34.9,  26.6,  25.3,  24.7,  21.2,
        19.3,  20. ,  16.6,  14.4,  19.4,  19.7,  20.5,  25. ,  23.4,
        18.9,  35.4,  24.7,  31.6,  23.3,  19.6,  18.7,  16. ,  22.2,
        25. ,  33. ,  23.5,  19.4,  22. ,  17.4,  20.9,  24.2,  21.7,
        22.8,  23.4,  24.1,  21.4,  20. ,  20.8,  21.2,  20.3,  28. ,
        23.9,  24.8,  22.9,  23.9,  26.6,  22.5,  22.2,  23.6,  28.7,
        22.6,  22. ,  22.9,  25. ,  20.6,  28.4,  21.4,  38.7,  43.8,
        33.2,  27.5,  26.5,  18.6,  19.3,  20.1,  19.5,  19.5,  20.4,
        19.8,  19.4,  21.7,  22.8,  18.8,  18.7,  18.5,  18.3,  21.2,
        19.2,  20.4,  19.3,  22. ,  20.3,  20.5,  17.3,  18.8,  21.4,
        15.7,  16.2,

In [7]:
# `train_test_split` lives in `sklearn.model_selection` since scikit-learn 0.18;
# the old `sklearn.cross_validation` module was removed in 0.20. Try the current
# location first and fall back only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Hold out part of the data for evaluation, using the library's default
# train/test proportions.
# NOTE: no `random_state` is passed, so the split -- and therefore every score
# reported further down -- changes from run to run. Pass `random_state=<int>`
# here if a reproducible split is needed.
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)

Learning a Regressor
===========

In [8]:
from sklearn.linear_model import Ridge


In [9]:
# Ridge regression estimator with all hyper-parameters left at their defaults.
ridge = Ridge()

In [10]:
# Fit on the training portion only; the test portion is kept for evaluation.
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [11]:
# Predict targets for the held-out samples; `pred_test` is reused below when
# computing the mean squared error.
pred_test = ridge.predict(X_test)
# Echo the predictions for a quick visual check.
pred_test

array([ 24.77912445,  10.74311479,  23.9021793 ,  26.58942902,
        34.39775508,  13.7858141 ,  14.99157131,  12.40353403,
        18.27215025,  20.93691856,  31.64479072,   8.46622479,
        28.69881202,  20.28873786,  13.6764264 ,  18.82244519,
        21.59437744,  35.46080973,  12.03168927,  10.92056327,
        19.88873807,  15.71459069,  16.71124231,  12.99817929,
         6.49028835,  22.31406961,  24.82799139,  12.86629588,
         1.08382622,  26.61437796,  25.18530629,  17.55146825,
        30.5708193 ,  25.50778442,  20.79652189,  20.41820963,
        26.82387012,  19.51649098,  19.84362699,  10.32839376,
         7.68558824,  32.98811859,  27.32644075,  13.49626893,
        26.03981109,  14.72654849,  21.56113938,  23.10728291,
        28.69275377,  22.09732199,  25.88547959,  30.29406107,
        14.56137967,  27.31107537,  22.52165118,  27.16615371,
        30.94035679,  16.54897452,  15.33504805,  20.16882137,
        20.6316112 ,  34.52585401,  22.46642351,  17.99

$R^2$ score (coefficient of determination) on the held-out test set:

In [12]:
# Score the fitted ridge model on the held-out data (the R2 score).
ridge.score(X_test, y_test)

0.69962717840974831

Mean squared error (MSE) on the held-out test set:

In [13]:
from sklearn.metrics import mean_squared_error
# Compare the true test targets with the ridge predictions computed earlier.
mean_squared_error(y_test, pred_test)

26.518609259151461

Random Forest Regression
----------------------------

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
# Random forest regressor with all hyper-parameters left at their defaults.
# NOTE(review): no `random_state` is given, so the fitted forest (and its
# scores) will differ between runs.
rf = RandomForestRegressor()

In [16]:
# Fit on the same training split that was used for the ridge model.
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [17]:
# Score on the same held-out data as the ridge model, so the two are comparable.
rf.score(X_test, y_test)

0.84606030065945281

In [18]:
# Mean squared error of the random forest on the held-out data; relies on
# `mean_squared_error` having been imported in an earlier cell.
mean_squared_error(y_test, rf.predict(X_test))

13.590666141732282