In [1]:
import numpy as np, pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def rank_feature_importances(importances, names):
    """Return (importance, name) pairs sorted from most to least important."""
    return sorted(zip(importances, names), reverse=True)


def holdout_rmse(X, y, random_state=0, n_estimators=100):
    """Fit a random forest on a train split and return RMSE on the test split.

    The split and the forest share ``random_state`` so repeated runs give
    the same score.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)
    model = RandomForestRegressor(random_state=random_state,
                                  n_estimators=n_estimators)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, y_pred))


def drop_capped_targets(data, cap=50, target='target'):
    """Return ``data`` without the rows whose ``target`` is at or above ``cap``.

    The script treats those rows as noise (presumably because the data set
    caps the target at 50 -- verify against the data set description).
    """
    noise_index = data.index[data[target] >= cap]
    return data.drop(noise_index)


def save_arrays(X, y, directory='data'):
    """Save ``X`` and ``y`` as ``X_boston.npy`` / ``y_boston.npy``.

    ``directory`` is created first when missing; ``np.save`` does not create
    parent directories and would otherwise raise ``FileNotFoundError``.
    """
    import os

    os.makedirs(directory, exist_ok=True)
    np.save(os.path.join(directory, 'X_boston'), X)
    np.save(os.path.join(directory, 'y_boston'), y)


if __name__ == "__main__":
    br = '\n'
    # NOTE(review): load_boston was removed in scikit-learn 1.2, so this
    # script needs an older release -- confirm the pinned version.
    boston = load_boston()
    X = boston.data
    y = boston.target
    print('feature shape', X.shape)
    print('target shape', y.shape, br)

    # Rank features using a forest fitted on the full data set.
    rfr = RandomForestRegressor(random_state=0,
                                n_estimators=100)
    rfr.fit(X, y)
    features = boston.feature_names
    importance = rank_feature_importances(rfr.feature_importances_,
                                          features)
    for row in importance:
        print(row)
    print()

    # Baseline score on the raw data.
    rfr_name = RandomForestRegressor.__name__
    rmse = holdout_rmse(X, y)
    print(rfr_name + ' (rmse):', rmse, br)

    cols = list(features) + ['target']
    data = pd.DataFrame(data=np.c_[X, y], columns=cols)
    print('boston dataset sample:')
    print(data[['RM', 'LSTAT', 'DIS', 'CRIM', 'NOX', 'PTRATIO',
                'target']].head(3), br)

    print('data set before removing noise:', data.shape)
    data = drop_capped_targets(data)
    print('data set without noise:', data.shape, br)

    X = data.loc[:, data.columns != 'target'].values
    y = data['target'].values
    print('cleansed feature shape:', X.shape)
    print('cleansed target shape:', y.shape, br)

    # Score again on the cleansed data with the same split/forest settings.
    rmse = holdout_rmse(X, y)
    print(rfr_name + ' (rmse):', rmse)

    # Persist the cleansed arrays.
    save_arrays(X, y)

feature shape (506, 13)
target shape (506,) 

(0.45730362625767507, 'RM')
(0.3500866188568138, 'LSTAT')
(0.06518862820215895, 'DIS')
(0.04098961725700101, 'CRIM')
(0.020247975630343553, 'NOX')
(0.01557636583549852, 'PTRATIO')
(0.015524054184831325, 'TAX')
(0.011764308556043929, 'AGE')
(0.011324966974602934, 'B')
(0.00591213993799977, 'INDUS')
(0.003916064249793194, 'RAD')
(0.0011173446269339177, 'ZN')
(0.0010482894303040918, 'CHAS')

RandomForestRegressor (rmse): 4.091149842219918 

boston dataset sample:
      RM  LSTAT     DIS     CRIM    NOX  PTRATIO  target
0  6.575   4.98  4.0900  0.00632  0.538     15.3    24.0
1  6.421   9.14  4.9671  0.02731  0.469     17.8    21.6
2  7.185   4.03  4.9671  0.02729  0.469     17.8    34.7 

data set before removing noise: (506, 14)
data set without noise: (490, 14) 

cleansed feature shape: (490, 13)
cleansed target shape: (490,) 

RandomForestRegressor (rmse): 3.37169151536684
