In [4]:
from sklearn.datasets import load_boston

boston = load_boston()

In [5]:
type(boston)

sklearn.utils.Bunch

In [7]:
type(boston)

sklearn.utils.Bunch

In [8]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [10]:
boston['data']

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [11]:
boston['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [14]:
print(boston['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [15]:
from sklearn.ensemble import RandomForestRegressor

###### create a model

In [16]:
clf = RandomForestRegressor()

###### train the classifier

In [18]:
clf.fit(boston['data'], boston['target'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
clf.score?

In [20]:
clf.n_features_

13

In [21]:
boston['data'].shape

(506, 13)

In [22]:
row = boston['data'][17]

In [23]:
row.reshape(-1, 13)

array([[  0.7842,   0.    ,   8.14  ,   0.    ,   0.538 ,   5.99  ,
         81.7   ,   4.2579,   4.    , 307.    ,  21.    , 386.75  ,
         14.67  ]])

    .fit > Train model
    .score > check accuracy
    .predict > check outcome

In [25]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train,y_test = train_test_split(boston['data'],
                                                  boston['target'],
                                                  test_size=0.3)

In [32]:
clf = RandomForestRegressor()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8119679898626403

In [33]:
import pandas as pd 

In [34]:
df = pd.DataFrame(boston['data'], columns=boston['feature_names'])

df.max(axis=0)

CRIM        88.9762
ZN         100.0000
INDUS       27.7400
CHAS         1.0000
NOX          0.8710
RM           8.7800
AGE        100.0000
DIS         12.1265
RAD         24.0000
TAX        711.0000
PTRATIO     22.0000
B          396.9000
LSTAT       37.9700
dtype: float64

In [35]:
from sklearn.svm import SVR


In [36]:
clf = SVR()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

-0.007003382242165124

###### pre-processing

In [43]:
from sklearn import preprocessing
Xs = preprocessing.scale(boston['data'])


In [44]:
df = pd.DataFrame(Xs, columns=boston['feature_names'])

df.max(axis=0)

CRIM       9.941735
ZN         3.804234
INDUS      2.422565
CHAS       3.668398
NOX        2.732346
RM         3.555044
AGE        1.117494
DIS        3.960518
RAD        1.661245
TAX        1.798194
PTRATIO    1.638828
B          0.441052
LSTAT      3.548771
dtype: float64

In [47]:
Xs_train, Xs_test, ys_train,ys_test = train_test_split(Xs,
                                                  boston['target'],
                                                  test_size=0.3)

In [49]:
clf = SVR()
clf.fit(Xs_train, ys_train)
clf.score(Xs_test, ys_test)

0.6784099323400359

###### reduce no. of features

In [51]:
from sklearn.decomposition import PCA


In [52]:
pca = PCA(n_components=5)
pca.fit(boston['data'])

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [56]:
# reduced features to five
Xp = pca.transform(boston['data'])
Xp.shape

(506, 5)

In [59]:
clf = RandomForestRegressor()
Xp_train, Xp_test, yp_train,yp_test = train_test_split(Xp,
                                                  boston['target'],
                                                  test_size=0.3)

In [67]:
clf.fit(Xp_train, yp_train)
clf.score(Xp_test, yp_test)

0.3960826955973684