# 08 머신러닝 기초 (지도학습)

## 8.1 머신러닝 개요

### 8.1.1 머신러닝이란

## 8.3 로지스틱회귀

### 8.3.1 로지스틱회귀 예

In [1]:
import io
import requests
import zipfile
import numpy as np
import pandas as pd
import scipy as sp

In [2]:
# Download the raw "adult" data file from the UCI repository.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
res = response.content
# Parse the downloaded bytes into a DataFrame; the file has no header row.
# NOTE: string fields keep the leading space found in the raw file (the
# labels appear as ' <=50K' / ' >50K'), and the label-encoding cell below
# compares against ' >50K'.
adult = pd.read_csv(io.StringIO(res.decode('utf-8')), header=None)
# Attach column labels. Assigning them here raises if the parsed column
# count does not match the fifteen names listed.
adult.columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
    'hours-per-week', 'native-country', 'flg-50K',
]

In [3]:
# Report the table's dimensions and the total number of missing cells.
table_shape = adult.shape
missing_total = adult.isnull().sum().sum()
print('데이터 형태 : {}'.format(table_shape))
print('결측값 수 : {}'.format(missing_total))

데이터 형태 : (32561, 15)
결측값 수 : 0


In [4]:
# Show the first five rows of the table.
adult.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,flg-50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Count how many rows carry each 'flg-50K' label.
adult.groupby(by='flg-50K').size()

flg-50K
 <=50K    24720
 >50K      7841
dtype: int64

In [6]:
# Encode the label as a binary target: 1 for '>50K', 0 for anything else.
# The raw labels carry a leading space (' >50K'), so surrounding whitespace
# is stripped before comparing; the encoding then works whether or not the
# loader trimmed the fields. The comparison is vectorized instead of calling
# a Python lambda once per row.
adult['fin_flg'] = adult['flg-50K'].str.strip().eq('>50K').astype('int64')
# Check the resulting class balance.
adult.groupby(['fin_flg']).size()

fin_flg
0    24720
1     7841
dtype: int64

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [8]:
# Five numeric columns serve as features; the binary flag is the target.
feature_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']
X = adult[feature_columns]
y = adult['fin_flg']

# Hold out half of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

In [9]:
# Fit a logistic-regression classifier with default settings on the raw
# (unscaled) training features. fit() returns the estimator itself, so the
# fitted model is left as the cell's last expression to display it.
model = LogisticRegression().fit(X_train, y_train)
model

LogisticRegression()

In [10]:
# Score the model on each half of the split and print both to three decimals.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("정확도(train) : {:.3f}".format(train_accuracy))
print("정확도(test) : {:.3f}".format(test_accuracy))

정확도(train) : 0.797
정확도(test) : 0.798


In [11]:
# Learned coefficients of the fitted model (one row, one value per feature
# column, in the same order as the columns of X).
model.coef_

array([[-1.18545968e-02, -4.37932054e-06, -2.77432658e-03,
         3.27384955e-04,  7.53237842e-04]])

In [12]:
# Exponentiate each coefficient: for logistic regression this is the odds
# ratio associated with a one-unit increase in the corresponding feature.
odds_ratios = np.exp(model.coef_)
odds_ratios

array([[0.98821539, 0.99999562, 0.99722952, 1.00032744, 1.00075352]])

In [13]:
# Rebuild the same feature matrix and target, then repeat the 50/50 split
# with the same seed before standardizing the features.
numeric_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss']
X = adult[numeric_features]
y = adult['fin_flg']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0
)

In [14]:
# Standardize the features. The scaler's statistics are learned from the
# training half only and then reused unchanged on the test half.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [16]:
# Refit the classifier, this time on the standardized training features.
# fit() returns the estimator itself, so the fitted model is left as the
# cell's last expression to display it.
model = LogisticRegression().fit(X_train_std, y_train)
model

LogisticRegression()

In [17]:
# Score the refitted model on the standardized train and test features and
# print both accuracies to three decimals.
train_accuracy_std = model.score(X_train_std, y_train)
test_accuracy_std = model.score(X_test_std, y_test)
print("정확도(train) : {:.3f}".format(train_accuracy_std))
print("정확도(test) : {:.3f}".format(test_accuracy_std))

정확도(train) : 0.811
정확도(test) : 0.810


## 8.4 정규화 항이 있는 회귀: 리지회귀, 라소회귀

### 8.4.1 라소회귀, 리지회귀의 특징

### 8.4.2 다중회귀와 리지회귀 비교

In [18]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split