## scikit-learn으로 데이터 준비하기

```python
# scikit-learn : 머신러닝 알고리즘
# sklearn
from sklearn.datasets import load_boston  # sklearn의 datasets 모듈에서 load_boston 데이터셋 불러오기

boston_dataset = load_boston()
boston_dataset.DESCR # description
```

<img src="image/13-1.png" width="700px"/>


```python
boston_dataset.feature_names
```

<img src="image/13-2.png" width="700px"/>

```python
boston_dataset.data
```

<img src="image/13-3.png" width="700px"/>

```python
boston_dataset.data.shape  # 506개의 집 데이터에 13개의 속성이 있음.
```

<img src="image/13-4.png" width="300px"/>

```python
boston_dataset.target  # 목표변수(MEDV), 즉 506개의 집 가격들이 출력됨. -> 차원이 506인 벡터.
```

<img src="image/13-5.png" width="700px"/>

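`load_boston`은 scikit-learn 1.0에서 deprecated 되었고 1.2에서 완전히 제거되었다.  
현재 설치된 scikit-learn 버전이 1.2 이상이어서 위 코드는 import 단계에서 `ImportError`가 발생하며, 따라서 반환 객체의 `DESCR`, `target` 같은 속성도 사용할 수 없다.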

---

## CSV 파일로 준비하기

In [4]:
```python
# csv 파일로 대신 준비하기
import pandas as pd

boston_df = pd.read_csv('data/boston.csv')
boston_df
```

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


CRIM ~ LSTAT 까지는 입력변수,  
MEDV가 목표변수.

In [5]:
```python
# DataFrame의 shape 확인
boston_df.shape
```

(506, 14)

#### 입력변수와 목표변수를 분리해서 저장

In [6]:
```python
# 입력변수 준비
x = boston_df.drop(columns='MEDV')  # 목표변수인 MEDV만 제거해서 입력변수만 남김
x
```

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


이번 챕터에서는 입력 변수가 1개인 경우만 보고 있으므로 `AGE` 컬럼만 가져와서 다시 x에 저장.

In [7]:
x = x[['AGE']]
x

Unnamed: 0,AGE
0,65.2
1,78.9
2,61.1
3,45.8
4,54.2
...,...
501,69.1
502,76.7
503,91.0
504,89.3


In [8]:
```python
# 목표변수 준비
y = boston_df[['MEDV']]
y
```

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
...,...
501,22.4
502,20.6
503,23.9
504,22.0


p.s. 보스턴 집 값 데이터로 머신러닝 하는 건 R 심화수업에서도 했었음...참고하기

---

## scikit-learn 데이터셋 나누기

In [9]:
```python
# 데이터를 train set과 test set으로 나눠주는 함수 불러오기.
from sklearn.model_selection import train_test_split
```

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

- `test_size`: 전체 데이터 중 test dataset으로 사용할 비율을 정하는 파라미터.
    - 0.2는 전체의 20%를 test dataset으로 사용하는 것.

- `random_state`: 데이터를 섞을 때 사용하는 난수 시드(seed)로, test dataset 20%를 어떻게 고를지 결정하는 파라미터.
    - optional parameter이므로 굳이 작성하지 않아도 됨.
    - 작성하지 않는 경우 실행할 때마다 랜덤한 데이터를 test dataset으로 새롭게 고름.
    - 같은 정수값을 넘겨주는 경우 매번 똑같은 데이터를 골라서 test dataset으로 만듦. (결과를 재현할 수 있음)

`train_test_split` 함수는 넘겨준 배열마다 train/test 두 조각으로 나눠서 리스트로 반환한다. 여기서는 x, y 2개를 넘겼으므로 아래 4개의 값이 반환됨:  
- x_train: train set의 입력변수
- x_test: test set의 입력변수
- y_train: train set의 목표변수
- y_test: test set의 목표변수

=> 네 개 모두 Pandas DataFrame.

In [11]:
```python
# 데이터가 몇 개씩 들어갔는지 확인.
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
```

(404, 1)
(102, 1)
(404, 1)
(102, 1)


---

## scikit-learn으로 선형 회귀 쉽게 하기

In [12]:
from sklearn.linear_model import LinearRegression

#### 모델 학습

In [13]:
model = LinearRegression()  # 모델 생성

In [14]:
model.fit(x_train, y_train)  # train set 학습

In [16]:
model.coef_  # θ1의 값.

array([[-0.12402883]])

In [17]:
model.intercept_  # θ0의 값.

array([31.04617413])

In [None]:
```python
# f(x) = 31.04617413 - 0.12402883x → 이게 모델의 최적선.
```

#### 모델 평가

In [21]:
y_test_prediction = model.predict(x_test)
y_test_prediction  # test값에 대한 예측값.

array([[20.31768041],
       [28.14389953],
       [22.84786852],
       [18.64329122],
       [24.62148078],
       [19.99520545],
       [19.75955068],
       [22.79825699],
       [20.92542167],
       [21.74401194],
       [23.29437231],
       [25.41526529],
       [21.53316293],
       [25.94858925],
       [23.76568186],
       [24.55946636],
       [18.64329122],
       [18.64329122],
       [22.00447248],
       [18.64329122],
       [18.89134887],
       [18.90375176],
       [21.33471681],
       [22.71143681],
       [28.46637448],
       [19.06498923],
       [24.31140871],
       [22.05408402],
       [26.32067574],
       [26.59353916],
       [19.0153777 ],
       [18.85414022],
       [19.30064401],
       [18.7301114 ],
       [28.76404367],
       [18.92855752],
       [21.5579687 ],
       [19.21382383],
       [19.69753626],
       [29.08651863],
       [19.56110455],
       [18.64329122],
       [21.02464473],
       [20.14404005],
       [24.72070384],
       [25...   (출력이 여기서 잘림 — 예측값은 x_test의 행 수와 같은 총 102개)

In [None]:
```python
# y_test(실제 목표변수 값)과 예측값의 차이를 구해서 평가해야 함.
# MSE를 구하는 함수 불러오기
from sklearn.metrics import mean_squared_error
```

In [24]:
mean_squared_error(y_test, y_test_prediction) ** 0.5  # 제곱근(0.5제곱)해서 RMSE 구하기

8.236881612652454

MEDV의 단위는 1,000달러이므로, RMSE가 약 8.24라는 것은 위의 모델로 집 값을 예측하는 경우 8천달러 정도(약 8,200달러)의 오차가 있다고 보면 된다는 뜻이다.