In [40]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [41]:
# Load the breast cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()

In [42]:
# Feature matrix (x) and class labels (y) taken from the loaded dataset.
x, y = cancer.data, cancer.target

In [43]:
# Hold out 20% of the samples as the test set. Stratifying on y keeps the
# class ratio the same in both partitions; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# log_loss를 이용한 Logistic Regression Model 훈련

In [44]:
# Logistic regression trained with SGD: loss='log_loss' selects the logistic loss.
model = SGDClassifier(
    loss='log_loss',
    random_state=42,
)

In [45]:
# Fit on the training split (features are not standardized at this point).
model.fit(x_train, y_train)

In [46]:
# Evaluate the fitted log-loss model on the held-out test split.
model.score(x_test, y_test)

0.8333333333333334

# Support Vector Machine (SVM) - Classification 문제에 사용
# hinge_loss 이용 : L(y,f(x)) = max(0, 1 - y * f(x))
# 마진까지 만족하며 정확히 분류된다면 y * f(x) >= 1 이므로 loss = 0이 된다. (여기서 y ∈ {-1, +1})

In [47]:
# Linear SVM trained with SGD: loss='hinge' selects the hinge loss.
model_hinge = SGDClassifier(
    loss='hinge',
    random_state=42,
)

In [48]:
# Fit the hinge-loss model on the same training split as the log-loss model.
model_hinge.fit(x_train, y_train)

In [49]:
# Evaluate the hinge-loss model on the held-out test split.
model_hinge.score(x_test, y_test)

0.9385964912280702

# Validation Set 만들기

In [50]:
# Carve a validation set (20%) out of the training portion.
#
# The 80/20 train/test split is re-derived here from the untouched x, y
# (same arguments and seed as the earlier split, so the result is identical)
# instead of re-splitting x_train in place. That keeps this cell idempotent:
# re-running it no longer shrinks x_train a little more on every execution.
x_train_full, x_test, y_train_full, y_test = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=42
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_full,
    y_train_full,
    stratify=y_train_full,
    test_size=0.2,
    random_state=42,
)

In [51]:
# Report the train / validation / test sizes: features first, then labels.
for splits in ((x_train, x_valid, x_test), (y_train, y_valid, y_test)):
    print(*map(len, splits))

364 91 114
364 91 114


# Standardization (표준화): feature들의 scale이 다를 때 조정하는 방법

In [52]:
import numpy as np
import matplotlib.pyplot as plt

## train set 표준화

In [53]:
# Per-feature (axis=0, i.e. column-wise) mean and standard deviation of the
# training split. These are the reference statistics for all later scaling.
train_mean = np.mean(x_train, axis=0)
train_std = np.std(x_train, axis=0)

In [54]:
# Standardize the train set with its own per-feature mean and std.
x_train_scaled = (x_train - train_mean) / train_std

## (잘못된 예) valid set 자체의 mean, std로 표준화 : train set과 다른 기준으로 변환되므로 표준화 전후의 train/valid set의 상대적 분포가 달라진다.
<center>
<img src = 'https://drive.google.com/uc?id=1qShBVb36TKyb_YHKav5_zzuX_jKdkY_X'
width = 1000 height = 300 /> <br>
</center>

In [55]:
# Per-feature statistics of the validation split itself. They are only used
# to illustrate the incorrect way of scaling a validation set.
valid_mean = np.mean(x_valid, axis=0)
valid_std = np.std(x_valid, axis=0)

In [56]:
# Standardize the valid set with its OWN mean and std (the incorrect approach,
# shown for comparison). NOTE: x_valid_scaled is reassigned in a later cell
# using the train-set statistics, so this value is only valid until then.
x_valid_scaled = (x_valid - valid_mean) / valid_std

## train set의 mean, std로 valid/test set 표준화 : 표준화 전후 분포 동일

<center>
<img src = 'https://drive.google.com/uc?id=1bMymWtTwOzt6x9iHaNfa6zEvj-wUHvSt'
width = 1000 height = 300 /> <br>
</center>

In [57]:
# Standardize the valid set: the train set's mean and std must be used so that
# train and validation data go through the same transformation.
x_valid_scaled = (x_valid - train_mean) / train_std

In [58]:
# Standardize the test set, again with the train set's mean and std.
x_test_scaled = (x_test - train_mean) / train_std