## 8.2 다중회귀

### 8.2.1 자동차 가격 데이터 읽어 들이기

In [2]:
import numpy as np
import pandas as pd
import scipy as sp
import sklearn
import requests, zipfile 
import io

# Fetch the automobile price data (UCI "imports-85" file).
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.content
# header=None: no row of the file is used as a header; column names are assigned explicitly below.
auto = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)
auto.columns =['symboling','normalized-losses','make','fuel-type' ,'aspiration','num-of-doors',
               'body-style','drive-wheels','engine-location','wheel-base','length','width','height',
               'curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore',
               'stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']

In [3]:
# Report the (rows, columns) shape of the loaded frame.
auto_shape = auto.shape
print('자동차 데이터 형태 {0}'.format(auto_shape))

자동차 데이터 형태 (205, 26)


In [4]:
# Preview the first rows to confirm the assigned column names line up with the data.
auto.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


### 8.2.2 데이터 정리

#### 불필요한 데이터 제거

In [5]:
# Keep only the price column and the three columns used as explanatory variables.
selected_columns = ['price', 'horsepower', 'width', 'height']
auto = auto[selected_columns]
# Count, per column, how many cells hold the '?' placeholder.
auto.isin(['?']).sum()

price         4
horsepower    2
width         0
height        0
dtype: int64

In [6]:
# Turn the '?' placeholder into NaN so pandas recognises it as missing ...
auto_with_nan = auto.replace('?', np.nan)
# ... then discard every row that has at least one missing value.
auto = auto_with_nan.dropna()

In [7]:
# Shape after dropping rows with missing values.
auto.shape

(199, 4)

In [8]:
# Inspect column dtypes; columns that held the '?' placeholder are still object, not numeric.
auto.dtypes

price          object
horsepower     object
width         float64
height        float64
dtype: object

In [9]:
# Convert the two object-typed columns to numeric dtypes in a single assign call.
auto = auto.assign(
    price=pd.to_numeric(auto['price']),
    horsepower=pd.to_numeric(auto['horsepower']),
)

In [10]:
# auto.dtypes

In [11]:
# auto.corr()

### 8.2.3 모델 구축과 평가

In [12]:
# help('sklearn.linear_model')

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [14]:
# Target variable is price; every remaining column is an explanatory variable.
y = auto['price']
X = auto.drop(columns='price')

# Hold out half of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

# Fit the linear regression on the training half only.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [15]:
# Coefficient of determination (R^2) on the training and the held-out data.
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print("결정계수(train) : {:.3f}".format(train_r2))
print("결정계수(test) : {:.3f}".format(test_r2))

결정계수(train) : 0.733
결정계수(test) : 0.737


In [16]:
# Label each fitted coefficient with its feature name before printing.
coefficients = pd.Series(model.coef_, index=X.columns)
print('\n회귀계수\n{0}'.format(coefficients))
# Intercept of the fitted model.
intercept = model.intercept_
print('절편: {:.3f}'.format(intercept))


회귀계수
horsepower      81.651078
width         1829.174506
height         229.510077
dtype: float64
절편: -128409.046


### 8.2.4 모델 구축 및 모델 평가 과정 정리
- 1단계. 모델 구축을 위한 클래스의 인스턴스 생성 : model = LinearRegression() 
- 2단계. 데이터를 설명변수와 목표변수로 나눔: X와 y
- 3단계. 훈련 데이터와 테스트 데이터로 분할: train_test_split(X, y, test_size = 0.5, random_state = 0)
- 4단계. 훈련 데이터를 이용해 모델 구축(학습): model.fit(X_train, y_train)
- 5단계. 모델 성능을 테스트 데이터로 평가: model.score(X_test, y_test)

## 8.3 로지스틱회귀

### 8.3.1 로지스틱회귀 예