# 서포트 벡터 머신(Support Vector Machines)

* 회귀, 분류, 이상치 탐지 등에 사용되는 지도학습 방법
* 클래스 사이의 경계에 위치한 데이터 포인트를 서포트 벡터(support vector)라고 함
* 각 서포트 벡터가 클래스 사이의 결정 경계를 구분하는데 얼마나 중요한지를 학습
* 각 서포트 벡터 사이의 마진이 가장 큰 방향으로 학습
* 서포트 벡터까지의 거리와 서포트 벡터의 중요도를 기반으로 예측을 수행

![support vector machine](https://upload.wikimedia.org/wikipedia/commons/thumb/2/20/Svm_separating_hyperplanes.png/220px-Svm_separating_hyperplanes.png)

* H3은 두 클래스의 점들을 제대로 분류하고 있지 않음
* H1과 H2는 두 클래스의 점들을 분류하는데, H2가 H1보다 더 큰 마진을 갖고 분류하는 것을 확인할 수 있음

In [None]:
import multiprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use the seaborn-style white grid style sheet for every figure drawn below.
plt.style.use('seaborn-v0_8-whitegrid')

In [None]:
from sklearn.svm import SVR, SVC
from sklearn.datasets import fetch_california_housing, load_diabetes, load_breast_cancer, load_iris, load_wine
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.manifold import TSNE

## SVM을 이용한 회귀 모델과 분류 모델

### SVM을 사용한 회귀 모델 (SVR)

In [None]:
# Load the California housing dataset (this is what fetch_california_housing
# returns) and pull out the feature matrix X and the regression target y.
data = fetch_california_housing()
X = data.data
y = data.target

# Hold out 20% of the samples for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an SVR with default settings on the raw (unscaled) features.
model = SVR().fit(X_train, y_train)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('학습 데이터 점수: {}'.format(train_score))
print('평가 데이터 점수: {}'.format(test_score))

### SVM을 사용한 분류 모델 (SVC)

In [None]:
# Breast-cancer data: features X and class labels y, split with a fixed seed.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# Fit an SVC with default settings on the raw (unscaled) features.
model = SVC().fit(X_train, y_train)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('학습 데이터 점수: {}'.format(train_score))
print('평가 데이터 점수: {}'.format(test_score))

## 커널 기법

* 입력 데이터를 고차원 공간에 사상해서 비선형 특징을 학습할 수 있도록 확장하는 방법
* scikit-learn에서는 Linear, Polynomial, RBF(Radial Basis Function)등 다양한 커널 기법을 지원

![image.png](attachment:image.png)

In [None]:
# Reload the breast-cancer data and split it with a fixed seed.
# NOTE(review): y looks like a class label here (the same dataset is used
# with SVC elsewhere in this notebook), yet a regressor is fit on it, so the
# printed score is SVR.score rather than accuracy - confirm this is intended.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

# SVR with a linear kernel.
linear_model = SVR(kernel='linear').fit(X_train, y_train)

train_score = linear_model.score(X_train, y_train)
test_score = linear_model.score(X_test, y_test)
print('Linear SVR 학습 데이터 점수: {}'.format(train_score))
print('Linear SVR 평가 데이터 점수: {}'.format(test_score))

In [None]:
# SVR with a polynomial kernel on the same train/test split.
polynomial_model = SVR(kernel='poly').fit(X_train, y_train)

train_score = polynomial_model.score(X_train, y_train)
test_score = polynomial_model.score(X_test, y_test)
print('Polynomial SVR 학습 데이터 점수: {}'.format(train_score))
print('Polynomial SVR 평가 데이터 점수: {}'.format(test_score))

In [None]:
# SVR with an RBF kernel on the same train/test split.
rbf_model = SVR(kernel='rbf').fit(X_train, y_train)

train_score = rbf_model.score(X_train, y_train)
test_score = rbf_model.score(X_test, y_test)
print('RBF SVR 학습 데이터 점수: {}'.format(train_score))
print('RBF SVR 평가 데이터 점수: {}'.format(test_score))

## 매개변수 튜닝

* SVM은 사용하는 커널에 따라 다양한 매개변수 설정 가능
* 매개변수를 변경하면서 성능변화를 관찰

In [None]:
# Breast-cancer features/labels, split with a fixed seed for the tuning demo.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123
)

In [None]:
# Hyper-parameters live in one dict so the printed summary and the estimator
# are built from the same values.
poly_params = dict(degree=2, C=0.1, gamma='auto')
polynomial_model = SVC(kernel='poly', **poly_params)
polynomial_model.fit(X_train, y_train)

print('kernel=poly, degree={}, C={}, gamma={}'.format(
    poly_params['degree'], poly_params['C'], poly_params['gamma']))
train_score = polynomial_model.score(X_train, y_train)
test_score = polynomial_model.score(X_test, y_test)
print('Polynomial SVC 학습 데이터 점수: {}'.format(train_score))
print('Polynomial SVC 평가 데이터 점수: {}'.format(test_score))

In [None]:
# Hyper-parameters live in one dict so the printed summary and the estimator
# are built from the same values.
rbf_params = dict(C=2.0, gamma='scale')
rbf_model = SVC(kernel='rbf', **rbf_params)
rbf_model.fit(X_train, y_train)

print('kernel=rbf, C={}, gamma={}'.format(rbf_params['C'], rbf_params['gamma']))
train_score = rbf_model.score(X_train, y_train)
test_score = rbf_model.score(X_test, y_test)
print('RBF SVC 학습 데이터 점수: {}'.format(train_score))
print('RBF SVC 평가 데이터 점수: {}'.format(test_score))

## 데이터 전처리

* SVM은 입력 데이터가 정규화 되어야 좋은 성능을 보임
* 주로 모든 특성 값을 [0, 1] 범위로 맞추거나(MinMaxScaler), 평균 0·분산 1이 되도록 표준화(StandardScaler)하는 방법을 사용
* scikit-learn의 StandardScaler 또는 MinMaxScaler를 사용해 정규화

In [None]:
# Breast-cancer features/labels, split with a fixed seed for the scaling demo.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123
)

In [None]:
# Baseline: default SVC trained on the unscaled features.
model = SVC().fit(X_train, y_train)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('SVC 학습 데이터 점수: {}'.format(train_score))
print('SVC 평가 데이터 점수: {}'.format(test_score))

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [None]:
# Default SVC trained on the StandardScaler-transformed features.
model = SVC().fit(X_train_scale, y_train)

train_score = model.score(X_train_scale, y_train)
test_score = model.score(X_test_scale, y_test)
print('SVC 학습 데이터 점수: {}'.format(train_score))
print('SVC 평가 데이터 점수: {}'.format(test_score))

In [None]:
# Learn the min/max from the training split only, then apply the same
# transform to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_mmscale = scaler.transform(X_train)
X_test_mmscale = scaler.transform(X_test)

In [None]:
# Default SVC trained on the MinMaxScaler-transformed features.
model = SVC().fit(X_train_mmscale, y_train)

train_score = model.score(X_train_mmscale, y_train)
test_score = model.score(X_test_mmscale, y_test)
print('SVC 학습 데이터 점수: {}'.format(train_score))
print('SVC 평가 데이터 점수: {}'.format(test_score))

## Linear SVR

### 캘리포니아 주택 가격

In [None]:
# Load the California housing dataset (this is what fetch_california_housing
# returns) and pull out the feature matrix X and the regression target y.
data = fetch_california_housing()
X = data.data
y = data.target

# Hold out 20% of the samples for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Standardize with statistics learned from the training split only; both
# splits are overwritten with their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Linear-kernel SVR fit on the standardized training split; the bare name on
# the last line keeps the fitted estimator as the cell's displayed value.
model = SVR(kernel='linear').fit(X_train, y_train)
model

In [None]:
# Report the model's score on the training and held-out splits.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('Linear SVR 학습 데이터 점수: {}'.format(train_score))
print('Linear SVR 평가 데이터 점수: {}'.format(test_score))

In [None]:
# Project the full feature matrix onto a single t-SNE component so the target
# can be drawn against one axis.
tsne = TSNE(n_components=1)
X_comp = tsne.fit_transform(X)
plt.scatter(x=X_comp, y=y);

In [None]:
# Refit the model on the 1-D embedding and overlay its predictions (red) on
# the true targets.
predict = model.fit(X_comp, y).predict(X_comp)
plt.scatter(X_comp, y)
plt.scatter(X_comp, predict, color='r')

In [None]:
# Scaling lives inside the pipeline so each CV fold is standardized with its
# own training statistics.
estimator = make_pipeline(StandardScaler(), SVR(kernel='linear'))

# 5-fold cross-validation, one worker per CPU reported by multiprocessing.
cv_options = dict(cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)
cross_validate(estimator, X, y, **cv_options)

In [None]:
# Pipeline: standardize the features, then fit a linear-kernel SVR.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('model', SVR(kernel='linear'))])

# gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels. With the
# kernel fixed to 'linear', searching over it refits identical models and
# doubles the run time, so only C and epsilon are searched.
param_grid = [{'model__C': [1.0, 0.1, 0.01],
               'model__epsilon': [1.0, 0.1, 0.01]}]

# Exhaustive 5-fold search, one worker per CPU reported by multiprocessing.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True
)

gs.fit(X, y)

In [None]:
# Display the best estimator selected by the grid search above.
gs.best_estimator_

### 당뇨병

In [None]:
# Diabetes regression data; 20% held out. No seed is given, so the split
# differs between runs.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

In [None]:
# Standardize with statistics learned from the training split only; both
# splits are overwritten with their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Linear-kernel SVR fit on the standardized training split; the bare name on
# the last line keeps the fitted estimator as the cell's displayed value.
model = SVR(kernel='linear').fit(X_train, y_train)
model

In [None]:
# Report the model's score on the training and held-out splits.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('Linear SVR 학습 데이터 점수: {}'.format(train_score))
print('Linear SVR 평가 데이터 점수: {}'.format(test_score))

In [None]:
# Project the full feature matrix onto a single t-SNE component so the target
# can be drawn against one axis.
tsne = TSNE(n_components=1)
X_comp = tsne.fit_transform(X)
plt.scatter(x=X_comp, y=y);

In [None]:
# Refit the model on the 1-D embedding and overlay its predictions (red) on
# the true targets.
predict = model.fit(X_comp, y).predict(X_comp)
plt.scatter(X_comp, y)
plt.scatter(X_comp, predict, color='r')

In [None]:
# Scaling lives inside the pipeline so each CV fold is standardized with its
# own training statistics.
estimator = make_pipeline(StandardScaler(), SVR(kernel='linear'))

# 5-fold cross-validation, one worker per CPU reported by multiprocessing.
cv_options = dict(cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)
cross_validate(estimator, X, y, **cv_options)

In [None]:
# Pipeline: standardize the features, then fit a linear-kernel SVR.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('model', SVR(kernel='linear'))])

# gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels. With the
# kernel fixed to 'linear', searching over it refits identical models and
# doubles the run time, so only C and epsilon are searched.
param_grid = [{'model__C': [1.0, 0.1, 0.01],
               'model__epsilon': [1.0, 0.1, 0.01]}]

# Exhaustive 5-fold search, one worker per CPU reported by multiprocessing.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True
)

gs.fit(X, y)

In [None]:
# Display the best estimator selected by the grid search above.
gs.best_estimator_

## Kernel SVR

### 보스턴 주택 가격

In [None]:
# load_boston is never imported in this notebook and was removed from
# scikit-learn in version 1.2, so this cell raised a NameError. The California
# housing data (already used by the Linear SVR section) is substituted as the
# housing-price regression dataset.
X, y = fetch_california_housing(return_X_y=True)

# 20% held out. No seed is given, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Standardize with statistics learned from the training split only; both
# splits are overwritten with their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# RBF-kernel SVR fit on the standardized training split; the bare name on the
# last line keeps the fitted estimator as the cell's displayed value.
model = SVR(kernel='rbf').fit(X_train, y_train)
model

In [None]:
# Report the model's score on the training and held-out splits.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('RBF SVR 학습 데이터 점수: {}'.format(train_score))
print('RBF SVR 평가 데이터 점수: {}'.format(test_score))

In [None]:
# Project the full feature matrix onto a single t-SNE component so the target
# can be drawn against one axis.
tsne = TSNE(n_components=1)
X_comp = tsne.fit_transform(X)
plt.scatter(x=X_comp, y=y);

In [None]:
# Refit the model on the 1-D embedding and overlay its predictions (red) on
# the true targets.
predict = model.fit(X_comp, y).predict(X_comp)
plt.scatter(X_comp, y)
plt.scatter(X_comp, predict, color='r')

In [None]:
# Scaling lives inside the pipeline so each CV fold is standardized with its
# own training statistics.
estimator = make_pipeline(StandardScaler(), SVR(kernel='rbf'))

# 5-fold cross-validation, one worker per CPU reported by multiprocessing.
cv_options = dict(cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)
cross_validate(estimator, X, y, **cv_options)

In [None]:
# Pipeline: standardize the features, then fit an SVR whose kernel is searched.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', SVR(kernel='rbf')),
])

# Compare the three non-linear kernels with all other settings at defaults.
param_grid = [dict(model__kernel=['rbf', 'poly', 'sigmoid'])]

# Exhaustive 5-fold search, one worker per CPU reported by multiprocessing.
gs = GridSearchCV(
    pipe,
    param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True,
)

gs.fit(X, y)

In [None]:
# Display the best estimator selected by the kernel search above.
gs.best_estimator_

In [None]:
# Pipeline: standardize the features, then fit an RBF-kernel SVR.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', SVR(kernel='rbf')),
])

# Search gamma, C and epsilon for the RBF kernel.
param_grid = [dict(
    model__gamma=['scale', 'auto'],
    model__C=[1.0, 0.1, 0.01],
    model__epsilon=[1.0, 0.1, 0.01],
)]

# Exhaustive 5-fold search, one worker per CPU reported by multiprocessing.
gs = GridSearchCV(
    pipe,
    param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True,
)

gs.fit(X, y)

In [None]:
# Display the best estimator selected by the gamma/C/epsilon search above.
gs.best_estimator_

### 당뇨병

In [None]:
# Diabetes regression data; 20% held out. No seed is given, so the split
# differs between runs.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

In [None]:
# Standardize with statistics learned from the training split only; both
# splits are overwritten with their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# RBF-kernel SVR fit on the standardized training split; the bare name on the
# last line keeps the fitted estimator as the cell's displayed value.
model = SVR(kernel='rbf').fit(X_train, y_train)
model

In [None]:
# Report the model's score on the training and held-out splits.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('RBF SVR 학습 데이터 점수: {}'.format(train_score))
print('RBF SVR 평가 데이터 점수: {}'.format(test_score))

In [None]:
# Project the full feature matrix onto a single t-SNE component so the target
# can be drawn against one axis.
tsne = TSNE(n_components=1)
X_comp = tsne.fit_transform(X)
plt.scatter(x=X_comp, y=y);

In [None]:
# Refit the model on the 1-D embedding and overlay its predictions (red) on
# the true targets.
predict = model.fit(X_comp, y).predict(X_comp)
plt.scatter(X_comp, y)
plt.scatter(X_comp, predict, color='r')

In [None]:
# Scaling lives inside the pipeline so each CV fold is standardized with its
# own training statistics.
estimator = make_pipeline(StandardScaler(), SVR(kernel='rbf'))

# 5-fold cross-validation, one worker per CPU reported by multiprocessing.
cv_options = dict(cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)
cross_validate(estimator, X, y, **cv_options)

In [None]:
# Pipeline: standardize the features, then fit an SVR whose kernel is searched.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', SVR(kernel='rbf')),
])

# Compare the three non-linear kernels with all other settings at defaults.
param_grid = [dict(model__kernel=['rbf', 'poly', 'sigmoid'])]

# Exhaustive 5-fold search, one worker per CPU reported by multiprocessing.
gs = GridSearchCV(
    pipe,
    param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True,
)

gs.fit(X, y)

In [None]:
# Display the best estimator selected by the kernel search above.
gs.best_estimator_

In [None]:
# Pipeline: standardize the features, then fit a sigmoid-kernel SVR.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', SVR(kernel='sigmoid')),
])

# Search gamma, C and epsilon for the sigmoid kernel.
param_grid = [dict(
    model__gamma=['scale', 'auto'],
    model__C=[1.0, 0.1, 0.01],
    model__epsilon=[1.0, 0.1, 0.01],
)]

# Exhaustive 5-fold search, one worker per CPU reported by multiprocessing.
gs = GridSearchCV(
    pipe,
    param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True,
)

gs.fit(X, y)

In [None]:
# Display the best estimator selected by the gamma/C/epsilon search above.
gs.best_estimator_

In [None]:
# Refit the winning pipeline on the training split. fit() returns the
# estimator itself, so `model` is the same object as gs.best_estimator_.
# NOTE(review): X_train was already standardized in an earlier cell and this
# pipeline contains its own StandardScaler, so the data is scaled twice -
# confirm that is acceptable.
model = gs.best_estimator_.fit(X_train, y_train)
model


In [None]:
# Report the refit pipeline's score on the training and held-out splits.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('SVR 학습 데이터 점수: {}'.format(train_score))
print('SVR 평가 데이터 점수: {}'.format(test_score))

## Linear SVC

### 유방암

In [None]:
# Breast-cancer features/labels; 20% held out. No seed is given, so the split
# differs between runs.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

In [None]:
# Standardize with statistics learned from the training split only; both
# splits are overwritten with their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Linear-kernel SVC fit on the standardized training split; the bare name on
# the last line keeps the fitted estimator as the cell's displayed value.
model = SVC(kernel='linear').fit(X_train, y_train)
model

In [None]:
# Report the model's score on the training and held-out splits.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('Linear SVC 학습 데이터 점수: {}'.format(train_score))
print('Linear SVC 평가 데이터 점수: {}'.format(test_score))

In [None]:
def make_meshgrid(x, y, h=0.02):
    """Build a 2-D coordinate grid covering the points (x, y) with a margin.

    Args:
        x: Array of first-axis coordinates.
        y: Array of second-axis coordinates.
        h: Step between neighbouring grid points on both axes.

    Returns:
        The (xx, yy) coordinate matrices from np.meshgrid. Each axis runs
        from one unit below the data minimum up to (excluding) one unit
        above the data maximum.
    """
    x_axis = np.arange(x.min() - 1, x.max() + 1, h)
    y_axis = np.arange(y.min() - 1, y.max() + 1, h)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)

    return grid_x, grid_y

def plot_contours(clf, xx, yy, **params):
    """Draw filled contours of a classifier's predictions over a mesh grid.

    Args:
        clf: Fitted estimator whose predict() accepts two-column input.
        xx: Grid x-coordinates, as returned by make_meshgrid.
        yy: Grid y-coordinates, as returned by make_meshgrid.
        **params: Extra keyword arguments forwarded to plt.contourf.

    Returns:
        The object returned by plt.contourf.
    """
    # Flatten the grid into one (x, y) row per grid point for predict().
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    labels = clf.predict(grid_points).reshape(xx.shape)

    return plt.contourf(xx, yy, labels, **params)

In [None]:
# Embed the features in 2-D with t-SNE, then build the mesh grid that spans
# the two embedding axes.
X_comp = TSNE(n_components=2).fit_transform(X)
X0, X1 = X_comp.T  # one row per embedding axis
xx, yy = make_meshgrid(X0, X1)

In [None]:
# Refit the classifier on the 2-D embedding so its predictions can be drawn.
model.fit(X_comp, y)

# Predicted regions as filled contours, with the samples drawn on top.
contour_style = dict(cmap=plt.cm.coolwarm, alpha=0.7)
plot_contours(model, xx, yy, **contour_style)
plt.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')

In [None]:
# Scaling lives inside the pipeline so each CV fold is standardized with its
# own training statistics.
estimator = make_pipeline(StandardScaler(), SVC(kernel='linear'))

# 5-fold cross-validation, one worker per CPU reported by multiprocessing.
cv_options = dict(cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)
cross_validate(estimator, X, y, **cv_options)

In [None]:
# Pipeline: standardize the features, then fit an SVC whose kernel is searched.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', SVC(kernel='linear')),
])

# Search kernel, gamma and C together.
param_grid = [dict(
    model__kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    model__gamma=['scale', 'auto'],
    model__C=[1.0, 0.1, 0.01],
)]

# Exhaustive 5-fold search, one worker per CPU reported by multiprocessing.
gs = GridSearchCV(
    pipe,
    param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True,
)

gs.fit(X, y)

In [None]:
# Display the best estimator selected by the grid search above.
gs.best_estimator_

### 붓꽃

In [None]:
# Iris features/labels; 20% held out. No seed is given, so the split differs
# between runs.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

In [None]:
# Standardize with statistics learned from the training split only; both
# splits are overwritten with their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Linear-kernel SVC fit on the standardized training split; the bare name on
# the last line keeps the fitted estimator as the cell's displayed value.
model = SVC(kernel='linear').fit(X_train, y_train)
model

In [None]:
# Report the model's score on the training and held-out splits.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('Linear SVC 학습 데이터 점수: {}'.format(train_score))
print('Linear SVC 평가 데이터 점수: {}'.format(test_score))

In [None]:
def make_meshgrid(x, y, h=0.02):
    """Build a 2-D coordinate grid covering the points (x, y) with a margin.

    Args:
        x: Array of first-axis coordinates.
        y: Array of second-axis coordinates.
        h: Step between neighbouring grid points on both axes.

    Returns:
        The (xx, yy) coordinate matrices from np.meshgrid. Each axis runs
        from one unit below the data minimum up to (excluding) one unit
        above the data maximum.
    """
    x_axis = np.arange(x.min() - 1, x.max() + 1, h)
    y_axis = np.arange(y.min() - 1, y.max() + 1, h)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)

    return grid_x, grid_y

def plot_contours(clf, xx, yy, **params):
    """Draw filled contours of a classifier's predictions over a mesh grid.

    Args:
        clf: Fitted estimator whose predict() accepts two-column input.
        xx: Grid x-coordinates, as returned by make_meshgrid.
        yy: Grid y-coordinates, as returned by make_meshgrid.
        **params: Extra keyword arguments forwarded to plt.contourf.

    Returns:
        The object returned by plt.contourf.
    """
    # Flatten the grid into one (x, y) row per grid point for predict().
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    labels = clf.predict(grid_points).reshape(xx.shape)

    return plt.contourf(xx, yy, labels, **params)

In [None]:
# Embed the features in 2-D with t-SNE, then build the mesh grid that spans
# the two embedding axes.
X_comp = TSNE(n_components=2).fit_transform(X)
X0, X1 = X_comp.T  # one row per embedding axis
xx, yy = make_meshgrid(X0, X1)

In [None]:
# Refit the classifier on the 2-D embedding so its predictions can be drawn.
model.fit(X_comp, y)

# Predicted regions as filled contours, with the samples drawn on top.
contour_style = dict(cmap=plt.cm.coolwarm, alpha=0.7)
plot_contours(model, xx, yy, **contour_style)
plt.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')

In [None]:
# Scaling lives inside the pipeline so each CV fold is standardized with its
# own training statistics.
estimator = make_pipeline(StandardScaler(), SVC(kernel='linear'))

# 5-fold cross-validation, one worker per CPU reported by multiprocessing.
cv_options = dict(cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)
cross_validate(estimator, X, y, **cv_options)

In [None]:
# Pipeline: standardize the features, then fit an SVC whose kernel is searched.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', SVC(kernel='linear')),
])

# Search kernel, gamma and C together.
param_grid = [dict(
    model__kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    model__gamma=['scale', 'auto'],
    model__C=[1.0, 0.1, 0.01],
)]

# Exhaustive 5-fold search, one worker per CPU reported by multiprocessing.
gs = GridSearchCV(
    pipe,
    param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True,
)

gs.fit(X, y)

In [None]:
# Display the best estimator selected by the grid search above.
gs.best_estimator_

### 와인

In [None]:
# Wine features/labels; 20% held out. No seed is given, so the split differs
# between runs.
wine = load_wine()
X, y = wine.data, wine.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2
)

In [None]:
# Standardize with statistics learned from the training split only; both
# splits are overwritten with their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Linear-kernel SVC fit on the standardized training split; the bare name on
# the last line keeps the fitted estimator as the cell's displayed value.
model = SVC(kernel='linear').fit(X_train, y_train)
model

In [None]:
# Report the model's score on the training and held-out splits.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('Linear SVC 학습 데이터 점수: {}'.format(train_score))
print('Linear SVC 평가 데이터 점수: {}'.format(test_score))

In [None]:
def make_meshgrid(x, y, h=0.02):
    """Build a 2-D coordinate grid covering the points (x, y) with a margin.

    Args:
        x: Array of first-axis coordinates.
        y: Array of second-axis coordinates.
        h: Step between neighbouring grid points on both axes.

    Returns:
        The (xx, yy) coordinate matrices from np.meshgrid. Each axis runs
        from one unit below the data minimum up to (excluding) one unit
        above the data maximum.
    """
    x_axis = np.arange(x.min() - 1, x.max() + 1, h)
    y_axis = np.arange(y.min() - 1, y.max() + 1, h)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)

    return grid_x, grid_y

def plot_contours(clf, xx, yy, **params):
    """Draw filled contours of a classifier's predictions over a mesh grid.

    Args:
        clf: Fitted estimator whose predict() accepts two-column input.
        xx: Grid x-coordinates, as returned by make_meshgrid.
        yy: Grid y-coordinates, as returned by make_meshgrid.
        **params: Extra keyword arguments forwarded to plt.contourf.

    Returns:
        The object returned by plt.contourf.
    """
    # Flatten the grid into one (x, y) row per grid point for predict().
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    labels = clf.predict(grid_points).reshape(xx.shape)

    return plt.contourf(xx, yy, labels, **params)

In [None]:
# Embed the features in 2-D with t-SNE, then build the mesh grid that spans
# the two embedding axes.
X_comp = TSNE(n_components=2).fit_transform(X)
X0, X1 = X_comp.T  # one row per embedding axis
xx, yy = make_meshgrid(X0, X1)

In [None]:
# Refit the classifier on the 2-D embedding so its predictions can be drawn.
model.fit(X_comp, y)

# Predicted regions as filled contours, with the samples drawn on top.
contour_style = dict(cmap=plt.cm.coolwarm, alpha=0.7)
plot_contours(model, xx, yy, **contour_style)
plt.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')

In [None]:
# Scaling lives inside the pipeline so each CV fold is standardized with its
# own training statistics.
estimator = make_pipeline(StandardScaler(), SVC(kernel='linear'))

# 5-fold cross-validation, one worker per CPU reported by multiprocessing.
cv_options = dict(cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)
cross_validate(estimator, X, y, **cv_options)

In [None]:
# Pipeline: standardize the features, then fit an SVC whose kernel is searched.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', SVC(kernel='linear')),
])

# Search kernel, gamma and C together.
param_grid = [dict(
    model__kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    model__gamma=['scale', 'auto'],
    model__C=[1.0, 0.1, 0.01],
)]

# Exhaustive 5-fold search, one worker per CPU reported by multiprocessing.
gs = GridSearchCV(
    pipe,
    param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=5,
    verbose=True,
)

gs.fit(X, y)

In [None]:
# Display the best estimator selected by the grid search above.
gs.best_estimator_

In [None]:
# Refit the winning pipeline on the training split. fit() returns the
# estimator itself, so `model` is the same object as gs.best_estimator_.
# NOTE(review): X_train was already standardized in an earlier cell and this
# pipeline contains its own StandardScaler, so the data is scaled twice -
# confirm that is acceptable.
model = gs.best_estimator_.fit(X_train, y_train)
model

In [None]:
# Report the refit pipeline's score on the training and held-out splits.
# NOTE(review): the "RBF SVC" label is hard-coded, but the kernel of `model`
# is whatever the grid search picked from linear/rbf/poly/sigmoid - confirm
# the label matches the selected kernel.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('RBF SVC 학습 데이터 점수: {}'.format(train_score))
print('RBF SVC 평가 데이터 점수: {}'.format(test_score))