# Diabetes dataset analysis

## Importing the libraries

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Importing the dataset

In [51]:
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
print(diabetes['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - Age
      - Sex
      - Body mass index
      - Average blood pressure
      - S1
      - S2
      - S3
      - S4
      - S5
      - S6

Note: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times `n_samples` (i.e. the sum of squares of each column totals 1).

Source URL:
https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

For more information see:
Bra

In [52]:
dataset = pd.DataFrame(diabetes['data'], columns=diabetes['feature_names'])
dataset['target'] = diabetes['target']
dataset

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


In [53]:
X = dataset.iloc[:, :-1].values
y = dataset['target'].values

## Splitting the dataset into train and test

In [54]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=5
)

## Feature Scaling

In [55]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Fitting Linear Rgression

In [34]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [35]:
y_pred = regressor.predict(X_test)
print(np.concatenate((y_test.reshape(-1,1), y_pred.reshape(-1,1)), axis=1))

[[281.         195.88049677]
 [332.         219.9869973 ]
 [178.         121.41731446]
 [155.         158.096774  ]
 [137.         199.41301643]
 [ 65.         128.03683528]
 [ 31.         100.64085969]
 [275.         235.20267685]
 [138.         178.49979907]
 [173.         214.01527918]
 [ 90.          56.01821838]
 [221.         199.16293283]
 [198.         176.62264109]
 [ 88.         144.48999932]
 [268.         221.42867632]
 [107.         113.97468265]
 [ 83.         124.09435034]
 [ 63.          62.78851419]
 [ 65.          80.68437133]
 [ 93.         142.4717434 ]
 [103.         131.1218549 ]
 [144.         124.34002996]
 [156.         160.57566929]
 [ 84.         184.64649247]
 [ 42.         124.65094908]
 [181.          86.5043832 ]
 [233.         249.70362665]
 [109.         206.59565158]
 [ 92.          74.10940066]
 [101.         181.06901941]
 [ 71.          86.57575242]
 [140.         125.23114985]
 [150.         122.36969588]
 [ 49.          86.94561886]
 [109.        

In [36]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.5271561853905229

## Fitting linear SVR

In [37]:
from sklearn.svm import SVR
svr = SVR(kernel='linear')
svr.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [38]:
y_pred = svr.predict(X_test)
r2_score(y_test, y_pred)

0.5200074951445961

## Fitting kernel SVR

In [39]:
from sklearn.svm import SVR
svr1 = SVR()
svr1.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [40]:
y_pred = svr1.predict(X_test)
r2_score(y_test, y_pred)

0.14692304612320972

## Grid Search for SVR

In [41]:
from sklearn.model_selection import GridSearchCV
params = {
    'C': [.5, 1.0, 1.5, 2.],
    'epsilon': [.1, .2, .3, .5, .6, .7, .8, .9],
    'gamma': [.1, .2, .3, .5, .6, .7, .8, .9],
    'kernel': ('linear', 'rbf', 'poly')
}
grid_search = GridSearchCV(
    estimator=svr1,
    param_grid=params,
    n_jobs=-1,
    cv=10
)
grid_search = grid_search.fit(X_train, y_train)
grid_search.best_params_



{'C': 1.5, 'epsilon': 0.9, 'gamma': 0.1, 'kernel': 'linear'}

In [42]:
best_svr = grid_search.best_estimator_
y_pred = best_svr.predict(X_test)
print(np.concatenate((y_test.reshape(-1,1), y_pred.reshape(-1,1)), axis=1))

[[281.         190.88054753]
 [332.         212.20726908]
 [178.         113.73398126]
 [155.         158.24598547]
 [137.         183.39466734]
 [ 65.         126.72448936]
 [ 31.         102.52821504]
 [275.         227.25294332]
 [138.         172.80248465]
 [173.         208.85866937]
 [ 90.          64.94437809]
 [221.         186.61380114]
 [198.         170.55225189]
 [ 88.         138.62027527]
 [268.         214.05607504]
 [107.         110.77904579]
 [ 83.         120.02565831]
 [ 63.          63.12780031]
 [ 65.          82.9350193 ]
 [ 93.         136.78833054]
 [103.         130.37236531]
 [144.         123.42047309]
 [156.         155.59410787]
 [ 84.         173.80596676]
 [ 42.         118.13186639]
 [181.          91.4566547 ]
 [233.         236.53559558]
 [109.         203.82024858]
 [ 92.          73.56822931]
 [101.         173.64336528]
 [ 71.          88.47345442]
 [140.         113.29480108]
 [150.         113.23715086]
 [ 49.          90.31105202]
 [109.        

In [43]:
r2_score(y_test, y_pred)

0.5181663396387861

## Fitting decision tree regression


In [56]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [57]:
y_pred = dt.predict(X_test)
r2_score(y_test, y_pred)

-0.08932347072251634

## Fitting Random Forest Regression

In [64]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [65]:
rf.feature_importances_

array([0.06160682, 0.01164266, 0.30822572, 0.11443787, 0.04547259,
       0.04595373, 0.06410502, 0.02305948, 0.25776497, 0.06773114])

In [66]:
y_pred = rf.predict(X_test)
r2_score(y_test, y_pred)

0.5341464071268479

## Fitting catboost Model

In [67]:
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor()
cat_model.fit(X_train, y_train)

total: 3.54s	remaining: 2.15s
622:	learn: 19.6830158	total: 3.56s	remaining: 2.15s
623:	learn: 19.6683580	total: 3.57s	remaining: 2.15s
624:	learn: 19.6355328	total: 3.57s	remaining: 2.14s
625:	learn: 19.5948115	total: 3.58s	remaining: 2.14s
626:	learn: 19.5347531	total: 3.58s	remaining: 2.13s
627:	learn: 19.4805305	total: 3.59s	remaining: 2.13s
628:	learn: 19.4692732	total: 3.6s	remaining: 2.12s
629:	learn: 19.4466579	total: 3.6s	remaining: 2.12s
630:	learn: 19.4448374	total: 3.61s	remaining: 2.11s
631:	learn: 19.4127559	total: 3.61s	remaining: 2.1s
632:	learn: 19.3937737	total: 3.62s	remaining: 2.1s
633:	learn: 19.3873834	total: 3.63s	remaining: 2.09s
634:	learn: 19.3468796	total: 3.63s	remaining: 2.09s
635:	learn: 19.3007159	total: 3.66s	remaining: 2.1s
636:	learn: 19.2888895	total: 3.67s	remaining: 2.09s
637:	learn: 19.2627862	total: 3.67s	remaining: 2.08s
638:	learn: 19.2116338	total: 3.67s	remaining: 2.08s
639:	learn: 19.1760549	total: 3.68s	remaining: 2.07s
640:	learn: 19.167680

<catboost.core.CatBoostRegressor at 0x7f5a1300a290>

In [68]:
y_pred = cat_model.predict(X_test)
r2_score(y_test, y_pred)

0.5484324814547379

## Applying k-fold cross validation

In [69]:
from sklearn.model_selection import cross_val_score
models = [regressor, svr, svr1, best_svr, dt, rf, cat_model]
accuracies = []
for model in models:
    accuracies.append(cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        cv=10,
        n_jobs=-1
    ))

In [70]:
a = {'Accuracy':[], "Standard Deviation":[]}
for accuracy in accuracies:
    a['Accuracy'].append(accuracy.mean()*100)
    a['Standard Deviation'].append(accuracy.std()*100)

pd.DataFrame(a, index=['Linear Regression', 'Linear SVR', 'SVR', 'Best SVR', 'DecisionTree', 'RandomForest', 'CatBoost'])

Unnamed: 0,Accuracy,Standard Deviation
Linear Regression,46.425463,8.049661
Linear SVR,45.802923,8.654703
SVR,11.510631,7.098958
Best SVR,45.925567,8.58415
DecisionTree,-24.380145,26.014407
RandomForest,38.343702,12.399156
CatBoost,34.248442,15.448366
