In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, make_pipeline # 파이프라인 구축

In [43]:
# Load the breast-cancer dataset and carve out a held-out test split.
# No test_size is given, so the library's default proportions apply;
# random_state is pinned so the split is the same on every run.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=42
)

In [44]:
# Fit the scaler on the training split only, then rescale that same split.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [45]:
# The fitted scaler has to be kept around: any later input (including data
# seen when the model is served) must go through the same transform.
X_test_scaled = scaler.transform(X_test)

# Train a default SVC on the scaled training split.
svm = SVC().fit(X_train_scaled, y_train)

In [46]:
# Score the fitted SVC on the scaled test split (displayed as the cell output).
svm.score(X_test_scaled, y_test)

0.9790209790209791

## 최적화

In [47]:
# Grid-search C and gamma for an SVC on the pre-scaled training data.
# To reuse this pattern for another model, swap only param_grid (the
# parameters) and SVC() (the algorithm), e.g. inside a loop.
#
# NOTE(review): X_train_scaled was produced by a scaler fitted on the whole
# training split, so every cross-validation fold has already "seen" the
# validation rows through the scaling statistics. The CV score below is
# therefore slightly optimistic; putting the scaler inside a Pipeline
# avoids this.
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100],
              "gamma": [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train_scaled, y_train)

# Report both numbers explicitly: previously the test score was computed
# but neither printed nor the last expression, so it was silently dropped.
print("best CV score:", grid.best_score_)
print("test score:", grid.score(X_test_scaled, y_test))
grid.best_params_

0.9788508891928865


{'C': 1, 'gamma': 1}

## 파이프라인 구축

In [48]:
# Chain scaling and classification into one estimator. The step names matter:
# they become the prefixes ("svm__...") used to address parameters later.
pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("svm", SVC()),
])

# Pass the raw (unscaled) data: the pipeline applies MinMaxScaler itself.
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.9790209790209791

In [49]:
# Grid-search the SVC inside the pipeline. Parameter keys must carry the
# step name as a prefix ("svm__") so they are routed to the right step.
param_grid = {"svm__C": [0.001, 0.01, 0.1, 1, 10, 100],
              "svm__gamma": [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)

# Raw data goes in; the scaler is re-fitted inside each CV training fold.
grid.fit(X_train, y_train)

# Print the test score as well: previously grid.score(...) was evaluated
# mid-cell and its result was discarded.
print("best CV score:", grid.best_score_, "best params:", grid.best_params_)
print("test score:", grid.score(X_test, y_test))

0.9741450068399453 {'svm__C': 10, 'svm__gamma': 0.1}


## 파이프라인 인터페이스 (make_pipeline)

In [50]:
# make_pipeline builds the same kind of Pipeline without explicit step names.
pipe_short = make_pipeline(
    MinMaxScaler(),
    SVC(C=100),
)
pipe_short

In [51]:
# Step names are generated automatically (here: minmaxscaler, svc);
# when the same class appears more than once, a "-<number>" suffix is added.
pipe_short.steps

[('minmaxscaler', MinMaxScaler()), ('svc', SVC(C=100))]

In [52]:
# Unsupervised pipeline: standardize, project onto 2 principal components,
# then standardize again. No target is needed for fitting.
pipe_short = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    StandardScaler(),
)
pipe_short.fit(cancer.data)

In [53]:
# Look up the fitted PCA step by its auto-generated name, then read its
# learned attribute (scikit-learn marks fitted attributes with a trailing "_").
pca_step = pipe_short.named_steps["pca"]
components = pca_step.components_
print(components)

[[ 0.21890244  0.10372458  0.22753729  0.22099499  0.14258969  0.23928535
   0.25840048  0.26085376  0.13816696  0.06436335  0.20597878  0.01742803
   0.21132592  0.20286964  0.01453145  0.17039345  0.15358979  0.1834174
   0.04249842  0.10256832  0.22799663  0.10446933  0.23663968  0.22487053
   0.12795256  0.21009588  0.22876753  0.25088597  0.12290456  0.13178394]
 [-0.23385713 -0.05970609 -0.21518136 -0.23107671  0.18611302  0.15189161
   0.06016536 -0.0347675   0.19034877  0.36657547 -0.10555215  0.08997968
  -0.08945723 -0.15229263  0.20443045  0.2327159   0.19720728  0.13032156
   0.183848    0.28009203 -0.21986638 -0.0454673  -0.19987843 -0.21935186
   0.17230435  0.14359317  0.09796411 -0.00825724  0.14188335  0.27533947]]


## 간단 예제 (교재 p.430)

### 전처리

In [54]:
# Read the raw Boston housing text file straight from the source URL.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(
    data_url,
    sep=r"\s+",    # fields are separated by runs of whitespace
    skiprows=22,   # skip the descriptive prose at the top of the file
    header=None,   # the file has no header row
)
raw_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.00,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3
1,396.90000,4.98,24.00,,,,,,,,
2,0.02731,0.00,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.90000,9.14,21.60,,,,,,,,
4,0.02729,0.00,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8
...,...,...,...,...,...,...,...,...,...,...,...
1007,396.90000,5.64,23.90,,,,,,,,
1008,0.10959,0.00,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0
1009,393.45000,6.48,22.00,,,,,,,,
1010,0.04741,0.00,11.93,0.0,0.573,6.030,80.8,2.5050,1.0,273.0,21.0


In [55]:
# Each record is spread over two consecutive rows of raw_df: the even rows
# hold the first group of values, the odd rows hold the remainder.
# Glue every even row to the first two values of the odd row that follows it.
even_rows = raw_df.values[::2, :]
odd_rows_head = raw_df.values[1::2, :2]
data = np.concatenate([even_rows, odd_rows_head], axis=1)
data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [57]:
# The third value on each odd row is used as the regression target.
target = raw_df.values[1::2, 2]

# Seeded split so the train/test partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, random_state=42
)

In [None]:
# 가능하면 전처리에는 엑셀이나 구글 스프레드시트 등을 활용하세요.
# 텍스트 전처리는 VSCode에서 진행하세요.

### pipeline

In [58]:
# Regression pipeline: standardize -> polynomial feature expansion -> Ridge.
# The step names are spelled out here and match the lowercase class names
# that make_pipeline would generate automatically.
pipe = Pipeline(steps=[
    ("standardscaler", StandardScaler()),
    ("polynomialfeatures", PolynomialFeatures()),
    ("ridge", Ridge()),
])
pipe

In [61]:
# Search space: polynomial degree and Ridge regularization strength,
# addressed as <step name>__<parameter>.
param_grid = dict(
    polynomialfeatures__degree=[1, 2, 3],
    ridge__alpha=[0.001, 0.01, 0.1, 1, 10, 100],
)

In [64]:
# 5-fold grid search over the pipeline; n_jobs=-1 uses all available CPU cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [65]:
# Parameter combination selected by the grid search above.
grid.best_params_

{'polynomialfeatures__degree': 2, 'ridge__alpha': 10}

In [66]:
# Score of the best (refitted) pipeline on the held-out test split.
grid.score(X_test, y_test)

0.8054402042295685

## 모델 선택을 위한 파이프라인

In [75]:
# Two-slot pipeline whose steps are swapped out by the grid search.
# memory="cache_folder" caches intermediate results so repeated runs save time.
# NOTE(review): the original note says to enable this only once the
# preprocessing is fully settled - confirm the intended workflow.
pipe = Pipeline(
    steps=[
        ("preprocessing", StandardScaler()),
        ("classifier", SVC()),
    ],
    memory="cache_folder",
)

In [79]:
# One grid per candidate model. Each dict replaces whole pipeline steps
# ("classifier", "preprocessing") and tunes that model's own parameters.
param_grid = [
    # SVC is sensitive to feature scale, so it keeps the StandardScaler step.
    {"classifier": [SVC()],
     "preprocessing": [StandardScaler()],
     "classifier__C": [0.001, 0.01, 0.1, 1, 10, 100]},
    # The random forest needs no scaling, so the preprocessing step is disabled.
    # random_state is pinned so the search result is reproducible across runs;
    # without it the forest is seeded differently on every fit.
    {"classifier": [RandomForestClassifier(random_state=42)],
     "preprocessing": [None],
     "classifier__max_features": [1, 2, 3]},
]

In [80]:
# Switch back to the breast-cancer classification data (the split variables
# currently hold the Boston regression data) using the same seeded split.
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=42
)

In [81]:
# 5-fold search across both candidate models, using all available CPU cores.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

In [82]:
# Winning model, preprocessing step and hyperparameters from the search.
grid.best_params_

{'classifier': SVC(), 'classifier__C': 1, 'preprocessing': StandardScaler()}

In [83]:
# Cross-validation score of the best parameter combination.
grid.best_score_

0.9717920656634748

In [84]:
# Score of the best (refitted) pipeline on the held-out test split.
grid.score(X_test, y_test)

0.972027972027972

#### cf. 자연어 처리 (감정분석)
> 단어 임베딩(Word Embedding): 단어를 수십~수백 차원의 밀집(dense) 실수 벡터 공간에 매핑하여, 의미가 비슷한 단어가 가까운 벡터로 표현되도록 하는 기법