# Diabetes data

source: http://scikit-learn.org/stable/datasets/<br>

這是一個糖尿病的資料集，內有442筆資料，10項屬性值(attribute)<br><br>
自變項(x)︰Age、Sex、Body mass index、Average Blood Pressure、S1~S6一年後疾病級數指標<br>
<br>依變項(y)為一年後患疾病的定量指標

In [18]:
from sklearn import datasets, linear_model
#load linear model metrics
from sklearn.metrics import mean_squared_error,r2_score
import matplotlib.pyplot as plt 

In [19]:
#load the diabetes dataset
from sklearn.datasets import load_diabetes

In [20]:
#將data命名diabetes
diabetes=load_diabetes()

In [21]:
type(diabetes)
#diabetes的資料型態為bunch

sklearn.utils.Bunch

### 自變項(x)

In [22]:
#印出自變數(x): 影響糖尿病的因子
print(diabetes.data)

[[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [ 0.04170844  0.05068012 -0.01590626 ... -0.01107952 -0.04687948
   0.01549073]
 [-0.04547248 -0.04464164  0.03906215 ...  0.02655962  0.04452837
  -0.02593034]
 [-0.04547248 -0.04464164 -0.0730303  ... -0.03949338 -0.00421986
   0.00306441]]


In [23]:
print('features:',diabetes.feature_names)

#了解變數項
#共有10個自變數(X)
#Age(年齡)、性別(Sex)、Body mass index(體質指數)、Average Blood Pressure(平均血壓)、S1~S6一年後疾病級數指標

features: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


### 依變項(y)

In [24]:
#了解y
#為一年後患疾病的定量指標
print('y:',diabetes.target)

y: [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 128. 102. 302. 198.  95.  53. 134. 144. 232.  81. 104.  59.
 246. 297. 258. 229. 275. 281. 179. 200. 200. 173. 180.  84. 121. 161.
  99. 109. 115. 268. 274. 158. 107.  83. 103. 272.  85. 280. 336. 281.
 118. 317. 235.  60. 174. 259. 178. 128.  96. 126. 288.  88. 292.  71.
 197. 186.  25.  84.  96. 195.  53. 217. 172. 131. 214.  59.  70. 220.
 268. 152.  47.  74. 295. 101. 151. 127. 237. 225.  81. 151. 107.  64.
 138. 185. 265. 101. 137. 143. 141.  79. 292. 178.  91. 116.  86. 122.
  7

In [25]:
#依變項(y)名稱: 無
print(diabetes.target_names)

AttributeError: target_names

### 確認x和y資料

In [26]:
print(type(diabetes.data))
print(type(diabetes.target))

print(diabetes.data.ndim)
print(diabetes.target.ndim)

#了解x和y的資料型態: x->2darray, y->1dnarray

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
2
1


In [27]:
print(diabetes.data.shape)
print(diabetes.target.shape)

#了解X和y資料大小: 分別皆為442筆，且有10個自變項(x)

(442, 10)
(442,)


### 線性迴規模型

#### 1. import package

In [28]:
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression

In [29]:
#定義、儲存X,y

X=diabetes.data
y=diabetes.target

#確認大小
print(X.shape)
print(y.shape)

(442, 10)
(442,)


#### 2. 設立訓練集與測試集

In [61]:
#以7:3的比例下去分
#設立每次所抓取的資料為固定同一群
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [62]:
#檢查x訓練集和測試集的資料array
print(X_train.shape)
print(X_test.shape)

(353, 10)
(89, 10)


#### 3.設立模型 

In [63]:
#將迴歸模型命名為lin
lin=LinearRegression()

In [64]:
#訓練集資料學習pattern
lin.fit(X_train,y_train)

LinearRegression()

In [65]:
#將測試集資料套用在剛剛所建立好的模型之中
#並將模型結果儲存在y_pred之中
y_pred=lin.predict(X_test)
print(y_pred)

[238.47145247 248.93170646 164.05404165 120.30794355 187.42422054
 259.04865002 113.55556372 188.07597044 149.49663441 236.01099949
 172.24629506 178.88073764 109.15751983  92.13508975 243.33042043
  87.356971   155.72606406  66.99073989 100.42610442 218.09422877
 196.66287912 161.29832968 161.70779605 156.52520454 197.88796516
 167.57984206 120.74478913  84.83879727 192.03728687 160.60687024
 175.17178362  84.22833237 145.7995542  145.97333493 140.96488953
 197.00421108 165.94322494 190.65906468 128.22520508 206.41941223
  84.35851196 164.0256504  144.1056776  184.68355549 177.80238966
  74.32855231 143.3660286  138.67726085 120.81146113 234.34252077
 161.94390244  74.5455476  154.71905074 156.78884927 237.42227096
 174.23053048 190.88212635 118.98373473 132.20418974 168.52674824
 214.74245466 171.42364091 157.37409906 108.86927343 257.06329636
 152.17777143  82.43686464 231.56746032 202.90641336  47.18340199
  78.46954525 129.30170908 104.60253144 144.65200281 132.27974254
 190.04134

In [67]:
#測試集所對應的觀測值v.s套用模型得出的結果
#計算出兩者極小誤差
#平均方根誤差(RMSE)、R-square估計預測準確值

print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
print('r2_score:',np.sqrt(metrics.r2_score(y_test,y_pred)))

RMSE: 58.51766133582009
r2_score: 0.5763870511129232


In [75]:
#迴歸模型
#y()=B0+B1*X1('age')+B2*X2('sex')+B3*X3('bmi')+B4*X4('bp')+B5*X5('s1')+B6*X6('s2')+B7*X7('s3')+B8*X8('s4')+B9*X9('s5')+B10*X10('s6')+error

print('intercept:',lin.intercept_)

intercept: 152.5381335195406
