In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

In [59]:
from sklearn.datasets import load_diabetes
data=load_diabetes()
print(data.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

In [60]:
data.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [61]:
x=pd.DataFrame(data.data,columns=data.feature_names)
y=data.target

In [62]:
x

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [63]:
y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [64]:
x.corr()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
age,1.0,0.173737,0.185085,0.335428,0.260061,0.219243,-0.075181,0.203841,0.270774,0.301731
sex,0.173737,1.0,0.088161,0.24101,0.035277,0.142637,-0.37909,0.332115,0.149916,0.208133
bmi,0.185085,0.088161,1.0,0.395411,0.249777,0.26117,-0.366811,0.413807,0.446157,0.38868
bp,0.335428,0.24101,0.395411,1.0,0.242464,0.185548,-0.178762,0.25765,0.39348,0.39043
s1,0.260061,0.035277,0.249777,0.242464,1.0,0.896663,0.051519,0.542207,0.515503,0.325717
s2,0.219243,0.142637,0.26117,0.185548,0.896663,1.0,-0.196455,0.659817,0.318357,0.2906
s3,-0.075181,-0.37909,-0.366811,-0.178762,0.051519,-0.196455,1.0,-0.738493,-0.398577,-0.273697
s4,0.203841,0.332115,0.413807,0.25765,0.542207,0.659817,-0.738493,1.0,0.617859,0.417212
s5,0.270774,0.149916,0.446157,0.39348,0.515503,0.318357,-0.398577,0.617859,1.0,0.464669
s6,0.301731,0.208133,0.38868,0.39043,0.325717,0.2906,-0.273697,0.417212,0.464669,1.0


In [65]:
x.drop(['sex','s6'],axis=1,inplace=True)

In [66]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=11)

In [67]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [68]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error

In [69]:
models=[LinearRegression(),Ridge(),Lasso(),SVR(),RandomForestRegressor(),KNeighborsRegressor(),AdaBoostRegressor(),
        GradientBoostingRegressor()]
for model in models:
    model.fit(x_train,y_train)
    pred=model.predict(x_test)
    print(model,':','Score:',model.score(x_test,y_test))
    print(model,':','Mean absolute error:',mean_absolute_error(y_test,pred))
    print(model,':','Mean squared error:',mean_squared_error(y_test,pred))
    print(model,':','Root mean squared error:',root_mean_squared_error(y_test,pred))
    print('---------------------------------\n')

LinearRegression() : Score: 0.5675958846461988
LinearRegression() : Mean absolute error: 46.24285295983519
LinearRegression() : Mean squared error: 3285.329169603574
LinearRegression() : Root mean squared error: 57.31779103911432
---------------------------------

Ridge() : Score: 0.5661042764569595
Ridge() : Mean absolute error: 46.321835059181886
Ridge() : Mean squared error: 3296.6621419776184
Ridge() : Root mean squared error: 57.41656679023588
---------------------------------

Lasso() : Score: 0.5657312496422398
Lasso() : Mean absolute error: 46.46094827607498
Lasso() : Mean squared error: 3299.496333031605
Lasso() : Root mean squared error: 57.44124243983242
---------------------------------

SVR() : Score: 0.20970459248339224
SVR() : Mean absolute error: 66.88762784779811
SVR() : Mean squared error: 6004.523228909715
SVR() : Root mean squared error: 77.48885874052937
---------------------------------

RandomForestRegressor() : Score: 0.5284590717705868
RandomForestRegressor() :

In [70]:
params={'n_estimators':[50,60,80,100],
        'loss':['linear', 'square', 'exponential'],
        'random_state':[3,5,7,11,13,19,23]
        }
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(AdaBoostRegressor(),param_grid=params,cv=3,n_jobs=-1)
grid.fit(x_train,y_train)

In [71]:
grid.score(x_test,y_test)

0.5032323741577323

In [72]:
grid.best_params_

{'loss': 'exponential', 'n_estimators': 60, 'random_state': 5}

In [73]:
grid.best_score_

np.float64(0.39021201495972413)

In [74]:
print('Mean absolute error:',mean_absolute_error(y_test,grid.predict(x_test)))
print('Mean squared error:',mean_squared_error(y_test,grid.predict(x_test)))
print('Root mean squared error:',root_mean_squared_error(y_test,grid.predict(x_test)))

Mean absolute error: 51.34911128580169
Mean squared error: 3774.3516163322042
Root mean squared error: 61.435751939177926


In [75]:
params={'n_estimators':[50,60,80,100],
        'loss':['squared_error', 'absolute_error', 'huber', 'quantile'],
        'learning_rate':[0.1,0.3,0.7,1.3],
        'criterion':['friedman_mse', 'squared_error']
        }

grid=GridSearchCV(GradientBoostingRegressor(),param_grid=params,cv=3,n_jobs=-1)
grid.fit(x_train,y_train)

In [76]:
grid.score(x_test,y_test), grid.best_params_, grid.best_score_

(0.5248960624281664,
 {'criterion': 'squared_error',
  'learning_rate': 0.1,
  'loss': 'absolute_error',
  'n_estimators': 50},
 np.float64(0.35304013268413187))