### sklearn.svm.SVC
### sklearn.svm.SVR
- 마진 : 두 집단의 떨어짐 정도로, 데이터가 포함되지 않은 영역을 최대화해 카테고리를 분리할 수 있도록 하는 공간
- 서포트 벡터 : 결정경계선에 가장 가까이 있는 카테고리의 데이터

#### 주요 Hyperparameter
- C : 분류에만 존재, 기본값 1로, 작을수록 단순, 클수록 복잡. C는 Logscale(0.01,0.1, 1, 10, 100 등) 단위로 최적값 탐색 권고
- kernel : rbf(기본), linear, poly, sigmoid, precomputed 등

##### SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
##### SVR(*, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1)

# 분석 코드 - Classification

In [1]:
# 라이브러리 및 데이터 로드
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv('../input/big-data-certification-study/breast-cancer-wisconsin.csv', encoding='utf-8')
df.head()

Unnamed: 0,code,Clump_Thickness,Cell_Size,Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


In [2]:
# 데이터 분리
X = df.drop(columns=['code','Class'])
y=df[['Class']]

X_train, X_test, y_train, y_test=train_test_split(X,y,stratify=y,random_state=42)

In [3]:
# 정규화 - Min Max
scaler=MinMaxScaler()
scaler.fit(X_train)
mm_X_train=scaler.transform(X_train)
mm_X_test=scaler.transform(X_test)

In [4]:
# 모델 적용
model=SVC()
model.fit(mm_X_train, y_train)
pred_train=model.predict(mm_X_train)
model.score(mm_X_train, y_train)

0.984375

In [5]:
# 혼동행렬, 분류예측 보고서
cm_train=confusion_matrix(y_train, pred_train)
cfr_train=classification_report(y_train, pred_train)
print('혼동행렬 :\n', cm_train,'\n\n\n분류예측 보고서 :\n', cfr_train)

혼동행렬 :
 [[329   4]
 [  4 175]] 


분류예측 보고서 :
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       333
           1       0.98      0.98      0.98       179

    accuracy                           0.98       512
   macro avg       0.98      0.98      0.98       512
weighted avg       0.98      0.98      0.98       512



In [6]:
# 모델 적용
pred_test=model.predict(mm_X_test)
model.score(mm_X_test, y_test)

0.9649122807017544

In [7]:
# 혼동행렬, 분류예측 보고서
cm_test=confusion_matrix(y_test,pred_test)
cfr_test=classification_report(y_test,pred_test)
print('혼동행렬 :\n', cm_test,'\n\n\n분류예측 보고서 :\n', cfr_test)

혼동행렬 :
 [[106   5]
 [  1  59]] 


분류예측 보고서 :
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       111
           1       0.92      0.98      0.95        60

    accuracy                           0.96       171
   macro avg       0.96      0.97      0.96       171
weighted avg       0.97      0.96      0.97       171



In [8]:
# Hyperparameter Tuning
# Grid Search
param_g=[{'C':[0.0001,0.001,0.01,0.1,1,10,100],
          'kernel':['rbf'],
          'gamma':[0.0001,0.001,0.01,0.1,1,10,100]},
         {'C':[0.0001,0.001,0.01,0.1,1,10,100],
          'kernel':['linear'],
          'gamma':[0.0001,0.001,0.01,0.1,1,10,100]}]
search_g=GridSearchCV(SVC(), param_g, cv=5,
                      return_train_score=True)
search_g.fit(mm_X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                          'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                          'kernel': ['rbf']},
                         {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                          'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                          'kernel': ['linear']}],
             return_train_score=True)

In [9]:
print('Best Parameter :', search_g.best_params_)
print('Best Score :', round(search_g.best_score_, 4))
print('Test Score :', round(search_g.score(mm_X_test, y_test), 4))

Best Parameter : {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Best Score : 0.9746
Test Score : 0.9591


In [10]:
# Randomized Search
param_r={'C':randint(low=0.001, high=100),
         'kernel':['rbf'],
         'gamma':randint(low=0.001, high=100)}
search_r=RandomizedSearchCV(SVC(), param_distributions=param_r,
                            cv=5, n_iter=100,
                            return_train_score=True)
search_r.fit(mm_X_train, y_train)

RandomizedSearchCV(cv=5, estimator=SVC(), n_iter=100,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f87daf08c10>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f87b8f1dc10>,
                                        'kernel': ['rbf']},
                   return_train_score=True)

In [11]:
print('Best Parameter :', search_r.best_params_)
print('Best Score :', round(search_r.best_score_, 4))
print('Test Score :', round(search_r.score(mm_X_test, y_test), 4))

Best Parameter : {'C': 2, 'gamma': 3, 'kernel': 'rbf'}
Best Score : 0.9648
Test Score : 0.9591


# 분석 코드 - Regression

In [12]:
df2 = pd.read_csv('../input/big-data-certification-study/house_price.csv', encoding='utf-8')
df2.head()

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.777,0.141112,2.442244,8.10396,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000


In [13]:
X=df2.drop(columns=['house_value'])
y=df2[['house_value']]

X_train, X_test, y_train, y_test=train_test_split(X,y,random_state=42)

In [14]:
# 정규화
scaler.fit(X_train)
ms_X_train = scaler.transform(X_train)
ms_X_test=scaler.transform(X_test)

In [15]:
# 모델 적용
model=SVR(kernel='poly')
model.fit(ms_X_train, y_train)
pre_train=model.predict(ms_X_train)
model.score(ms_X_train, y_train)

0.4411532001585847

In [16]:
pre_test=model.predict(ms_X_test)
model.score(ms_X_test,y_test)

0.45698485085656304

In [17]:
# RMSE 확인
from sklearn.metrics import mean_squared_error
mse_train=mean_squared_error(y_train,pre_train)
mse_test=mean_squared_error(y_test,pre_test)
print('Train RMSE :', np.sqrt(mse_train))
print('Test  RMSE :', np.sqrt(mse_test))

Train RMSE : 71350.56690936947
Test  RMSE : 70447.96409927838


In [18]:
# Hyperparameter Tuning
# Grid Search
param_g={'C':[0.001,0.01,0.1,1,10,100],
         'kernel':['poly'],
         'gamma':[0.001,0.01,0.1,1,10,100]}
search_g=GridSearchCV(SVR(), param_g, cv=5,
                      return_train_score=True)
search_g.fit(ms_X_train, y_train)

GridSearchCV(cv=5, estimator=SVR(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
                         'kernel': ['poly']},
             return_train_score=True)

In [19]:
print('Best Parameter :', search_g.best_params_)
print('Best Score :', round(search_g.best_score_, 4))
print('Test Score :', round(search_g.score(ms_X_test, y_test), 4))

Best Parameter : {'C': 10, 'gamma': 100, 'kernel': 'poly'}
Best Score : 0.5944
Test Score : 0.609


In [20]:
# Randomized Search
param_r={'C':randint(low=0.001, high=100),
         'kernel':['poly'],
         'gamma':randint(low=0.001, high=100)}
search_r=RandomizedSearchCV(SVR(), param_distributions=param_r,
                            cv=5, n_iter=100,
                            return_train_score=True)
search_r.fit(ms_X_train, y_train)

RandomizedSearchCV(cv=5, estimator=SVR(), n_iter=100,
                   param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f87b81d8790>,
                                        'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f87b81d88d0>,
                                        'kernel': ['poly']},
                   return_train_score=True)

In [21]:
print('Best Parameter :', search_r.best_params_)
print('Best Score :', round(search_r.best_score_, 4))
print('Test Score :', round(search_r.score(ms_X_test, y_test), 4))

Best Parameter : {'C': 66, 'gamma': 36, 'kernel': 'poly'}
Best Score : 0.5953
Test Score : 0.6105
