## **diabetes 데이터 셋을 활용해 BMI 선형회귀 예측**

In [2]:
import pandas as pd
import os

# Load the dataset from the current working directory.
file_name = 'diabetes.csv'
# Pass the directory and file name as separate arguments so os.path.join
# inserts the correct separator for the running platform (the previous
# hand-built "\\" only produced a valid path on Windows).
full_file_path = os.path.join(os.getcwd(), file_name)
df = pd.read_csv(full_file_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Remove the original label column (Outcome); BMI is the regression target here.
df = df.drop(columns=['Outcome'])
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [4]:
# Target vector is the BMI column; the feature matrix is every remaining column.
y = df['BMI'].values
X = df.drop(columns=['BMI']).values

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Show the shapes of the four resulting arrays as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((614, 7), (154, 7), (614,), (154,))

In [None]:
# Create the regression models to compare.
# The tree-based estimators are stochastic, so they are seeded with the same
# value used for the train/test split to make the reported errors reproducible
# across kernel restarts.
dt_clf = DecisionTreeRegressor(random_state=42)
lr_clf = LinearRegression()
rf_clf = RandomForestRegressor(random_state=42)
sv_clf = SVR(kernel='linear')


In [8]:
# Decision tree: fit / predict / evaluate
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
# mean_squared_error returns the mean squared error (no square root is taken),
# so the label reads MSE (평균제곱오차) rather than RMSE (평균제곱근오차).
print('결정트리 평균제곱오차', mean_squared_error(y_test, dt_pred))

결정트리리 평균제곱근오차 111.50714285714285


In [9]:
# Linear regression: fit / predict / evaluate
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
# mean_squared_error returns the mean squared error (no square root is taken),
# so the label reads MSE (평균제곱오차) rather than RMSE (평균제곱근오차).
print('선형회귀 평균제곱오차 : {0:.4f}'.format(mean_squared_error(y_test, lr_pred)))

선형회귀 평균제곱근오차 : 52.2406


In [10]:
# Random forest: fit / predict / evaluate
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
# mean_squared_error returns the mean squared error (no square root is taken),
# so the label reads MSE (평균제곱오차) rather than RMSE (평균제곱근오차).
print('랜덤포레스트 평균제곱오차 : {0:.4f}'.format(mean_squared_error(y_test, rf_pred)))

랜덤포레스트 평균제곱근오차 : 47.3012


In [11]:
# SVM (linear-kernel SVR): fit / predict / evaluate
sv_clf.fit(X_train, y_train)
sv_pred = sv_clf.predict(X_test)
# The label previously named the random forest (copy-paste slip) and claimed
# RMSE; this cell evaluates the SVR model and mean_squared_error returns MSE.
print('SVM 평균제곱오차 : {0:.4f}'.format(mean_squared_error(y_test, sv_pred)))

랜덤포레스트 평균제곱근오차 : 53.2107
