In [1]:
from sklearn.datasets import load_diabetes

In [2]:
diabetes = load_diabetes()

In [3]:
import pandas as pd

In [4]:
data = pd.DataFrame(diabetes.data, columns = diabetes.feature_names)

In [5]:
data['Target'] = pd.DataFrame(diabetes.target)

In [6]:
data.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [7]:
pd.DataFrame(data.corr().round(2))

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Target
age,1.0,0.17,0.19,0.34,0.26,0.22,-0.08,0.2,0.27,0.3,0.19
sex,0.17,1.0,0.09,0.24,0.04,0.14,-0.38,0.33,0.15,0.21,0.04
bmi,0.19,0.09,1.0,0.4,0.25,0.26,-0.37,0.41,0.45,0.39,0.59
bp,0.34,0.24,0.4,1.0,0.24,0.19,-0.18,0.26,0.39,0.39,0.44
s1,0.26,0.04,0.25,0.24,1.0,0.9,0.05,0.54,0.52,0.33,0.21
s2,0.22,0.14,0.26,0.19,0.9,1.0,-0.2,0.66,0.32,0.29,0.17
s3,-0.08,-0.38,-0.37,-0.18,0.05,-0.2,1.0,-0.74,-0.4,-0.27,-0.39
s4,0.2,0.33,0.41,0.26,0.54,0.66,-0.74,1.0,0.62,0.42,0.43
s5,0.27,0.15,0.45,0.39,0.52,0.32,-0.4,0.62,1.0,0.46,0.57
s6,0.3,0.21,0.39,0.39,0.33,0.29,-0.27,0.42,0.46,1.0,0.38


In [8]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, T-Cells (a type of white blood cells)
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, thyroid stimulating hormone
      - s5      ltg, lamotrigine
      - s6      glu, blood sugar level

Note: Each of these 10 feature va

In [9]:
x = data['bmi']
y = data['Target']

In [10]:
pd.DataFrame([x, y]).head().transpose()

Unnamed: 0,bmi,Target
0,0.061696,151.0
1,-0.051474,75.0
2,0.044451,141.0
3,-0.011595,206.0
4,-0.036385,135.0
...,...,...
437,0.019662,178.0
438,-0.015906,104.0
439,-0.015906,132.0
440,0.039062,220.0


In [11]:
from sklearn.model_selection import train_test_split

In [25]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [26]:
from sklearn.linear_model import LinearRegression

In [27]:
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [28]:
from sklearn.metrics import mean_squared_error
from math import sqrt
print(sqrt(mean_squared_error(y_pred, y_test)))

61.058313028028685


### Taking 2 independent variables

In [73]:
x = data[['bmi', 's5']]
y = data['Target']
print(x)

          bmi        s5
0    0.061696  0.019908
1   -0.051474 -0.068330
2    0.044451  0.002864
3   -0.011595  0.022692
4   -0.036385 -0.031991
..        ...       ...
437  0.019662  0.031193
438 -0.015906 -0.018118
439 -0.015906 -0.046879
440  0.039062  0.044528
441 -0.073030 -0.004220

[442 rows x 2 columns]


In [80]:
pd.DataFrame([[x, y]]).head().transpose()

Unnamed: 0,0
0,bmi s5 0 0.061696 0.01990...
1,0 151.0 1 75.0 2 141.0 3 ...


In [97]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [98]:
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [99]:
print(sqrt(mean_squared_error(y_pred, y_test)))

59.35603134762395


### Taking 3 variables

In [101]:
x = data[['bmi', 's5', 'bp']]
y = data['Target']
print(x)

          bmi        s5        bp
0    0.061696  0.019908  0.021872
1   -0.051474 -0.068330 -0.026328
2    0.044451  0.002864 -0.005671
3   -0.011595  0.022692 -0.036656
4   -0.036385 -0.031991  0.021872
..        ...       ...       ...
437  0.019662  0.031193  0.059744
438 -0.015906 -0.018118 -0.067642
439 -0.015906 -0.046879  0.017282
440  0.039062  0.044528  0.001215
441 -0.073030 -0.004220 -0.081414

[442 rows x 3 columns]


In [103]:
pd.DataFrame([[x, y]]).head().transpose()

Unnamed: 0,0
0,bmi s5 bp 0 0.06169...
1,0 151.0 1 75.0 2 141.0 3 ...


In [122]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)

In [123]:
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [124]:
print(sqrt(mean_squared_error(y_pred, y_test)))

53.54316265811792
