In [3]:
from sklearn import datasets
# Load the scikit-learn diabetes dataset and keep the feature matrix and the
# target vector under their own names for the cells that follow.
diabetes = datasets.load_diabetes()
X_diabetes = diabetes.data
Y_diabetes = diabetes.target

# Quick summary of what was loaded: column names and array shapes.
print("Dataset feature names :" + str(diabetes.feature_names))
print("Dataset feature size :" + str(X_diabetes.shape))
print("Dataset target size :" + str(Y_diabetes.shape))

Dataset feature names :['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
Dataset feature size :(442, 10)
Dataset target size :(442,)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_diabetes,
    Y_diabetes,
    train_size=0.80,
    test_size=0.2,
    random_state=123,
)

# Show the shape of each of the four resulting arrays.
print(
    "Train/test Sets Sizes :",
    *(part.shape for part in (X_train, X_test, Y_train, Y_test)),
)

Train/test Sets Sizes : (353, 10) (89, 10) (353,) (89,)


In [10]:
# Baseline regressors, all left at their default hyper-parameters.
lr = LinearRegression()
# Tree fitting permutes the candidate features at every split, so ties between
# equally good splits can be broken differently on each run. Pin the seed
# (same value as the train/test split) so the tree's score is reproducible.
dt = DecisionTreeRegressor(random_state=123)
knn = KNeighborsRegressor()

In [11]:
# Train every baseline on the same training split.
lr.fit(X_train, Y_train)
dt.fit(X_train, Y_train)
# Last expression of the cell, so its return value is what the notebook shows.
knn.fit(X_train, Y_train)

In [12]:
# Predict the held-out targets with each fitted baseline.
# Order is linear regression, decision tree, k-nearest neighbours.
y_pred1, y_pred2, y_pred3 = (
    model.predict(X_test) for model in (lr, dt, knn)
)

In [14]:
# Report the test-set R^2 of each baseline, in the same order as the
# predictions were produced (LR, DT, KNN).
print("R^2 score for LR", r2_score(y_true=Y_test, y_pred=y_pred1))
print("R^2 score for DT", r2_score(y_true=Y_test, y_pred=y_pred2))
print("R^2 score for KNN", r2_score(y_true=Y_test, y_pred=y_pred3))

R^2 score for LR 0.5675895725793205
R^2 score for DT 0.11943663171890573
R^2 score for KNN 0.438839665879189


In [18]:
from sklearn.ensemble import BaggingRegressor

# Bagging ensemble with default settings; only the seed is fixed so that the
# fit is repeatable.
bag_regressor = BaggingRegressor(random_state=1)

# Last expression of the cell: fit on the training split.
bag_regressor.fit(X_train, Y_train)

In [19]:
# Predict once on the held-out split and reuse those predictions for the test
# metric, instead of letting .score() run the whole ensemble a second time.
y_preds = bag_regressor.predict(X_test)

# A regressor's .score() returns R^2, so the training figure is directly
# comparable with the r2_score-based test figure printed after it.
print("Training Coefficient of R^2 : %.3f"%bag_regressor.score(X_train,Y_train))
print('Test Coefficient of R^2 :%.3f'%r2_score(Y_test,y_preds))


Training Coefficient of R^2 : 0.897
Test Coefficient of R^2 :0.499


In [25]:
%%time

# Dataset dimensions (not referenced by the search in this cell).
n_samples = diabetes.data.shape[0]
n_features = diabetes.data.shape[1]

# Search space: 3 * 3 * 2 * 2 * 2 * 2 = 144 parameter combinations.
# An 'estimator' of None lets BaggingRegressor use its default base estimator.
params = {
    'estimator': [None, LinearRegression(), KNeighborsRegressor()],
    'n_estimators': [20, 50, 100],
    'max_samples': [0.5, 1.0],
    'max_features': [0.5, 1.0],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False],
}

# Parallelise at the grid-search level only. The search already fans the
# candidate/fold fits out over every core, so also asking each BaggingRegressor
# for n_jobs=-1 would nest a second layer of workers inside every worker and
# oversubscribe the CPU. The ensemble's random draws come from random_state,
# so dropping the inner n_jobs is not expected to change the selected model.
bagging_regressor_grid = GridSearchCV(
    BaggingRegressor(random_state=1),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
bagging_regressor_grid.fit(X_train, Y_train)

# best_estimator_ is the winning configuration refitted on the full training
# split; best_score_ is its mean cross-validated R^2 during the search.
print('Train R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_train, Y_train))
print('Test R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X_test, Y_test))
print('Best R^2 Score Through Grid Search : %.3f'%bagging_regressor_grid.best_score_)
print('Best Parameters : ',bagging_regressor_grid.best_params_)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
Train R^2 Score : 0.487
Test R^2 Score : 0.533
Best R^2 Score Through Grid Search : 0.446
Best Parameters :  {'bootstrap': False, 'bootstrap_features': True, 'estimator': LinearRegression(), 'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 100}
CPU times: user 1.08 s, sys: 127 ms, total: 1.2 s
Wall time: 1min 6s
