# SVM 이진분류
- 피마 인디언 당뇨병 예측 데이터셋을 이용해 SVM으로 당뇨병 여부(`Outcome`)를 이진분류한다.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Single seed passed as random_state to the train/test split and the SVC estimators.
SEED = 42

## 1. 데이터 준비

In [None]:
# !wget https://raw.githubusercontent.com/devdio/flyai_datasets/main/diabetes.csv

In [None]:
# Load the CSV from the working directory into a DataFrame.
path = 'diabetes.csv'
diabetes = pd.read_csv(path)
# (rows, columns) of the raw data
diabetes.shape

In [None]:
# Preview the first rows of the raw data.
diabetes.head()

In [None]:
# Work on a copy so the raw `diabetes` frame stays untouched.
df = diabetes.copy()
df.info() # -> no missing values, all columns numeric (no encoding needed)

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

### 범주형 변수

In [None]:
# List the column names.
df.columns

In [None]:
# Number of rows per class of the target column.
df['Outcome'].value_counts()

In [None]:
# Bar chart of how many rows fall into each Outcome class.
sns.countplot(data=df, x='Outcome')

### 연속형 변수

In [None]:
# Pregnancies sorted from largest to smallest. reset_index() moves the original
# row labels into an 'index' column and renumbers the rows from 0.
tmp = df['Pregnancies'].sort_values(ascending=False).reset_index()
tmp.head()

In [None]:
# `tmp.index` is the DataFrame's own row index (the fresh 0..n-1 positions),
# not the saved 'index' column, so the bars should follow the sorted order.
sns.barplot(x=tmp.index, y = tmp['Pregnancies'])

In [None]:
# One histogram per column to inspect the distributions.
df.hist()

### 결측치

In [None]:
# Count missing (NaN) entries per column.
df.isna().sum()

### 중복치

In [None]:
# Count rows that exactly repeat an earlier row.
df.duplicated().sum()

### 이상치

In [None]:
# Draw box plots (no code in this cell yet)

In [None]:
# Summary statistics per feature; presumably shown here to eyeball outliers via min/max.
df.describe().T

## 2. 트레인, 테스트 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. Stratifying on Outcome keeps the class
# ratio aligned between the two parts, and SEED makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.1,
    stratify=df['Outcome'],
    random_state=SEED,
)
train.shape, test.shape

In [None]:
# Class counts inside the training split.
train['Outcome'].value_counts()

In [None]:
# Preview the first rows of the training split.
train.head()

### X, y 변수 분리

In [None]:
# The target is the Outcome column; every remaining column is a feature.
y_train = train['Outcome']
X_train = train.drop(columns='Outcome')

X_train.shape, y_train.shape

In [None]:
# Replace zero values (treated as outliers) with a fixed value (the median)
# 'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'

In [None]:
# Columns in which this notebook treats a value of 0 as an invalid reading.
col_list = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Cast to float first: a median can be fractional, and writing a float into an
# integer column is deprecated in recent pandas.
X_train[col_list] = X_train[col_list].astype(float)

# Training-set medians, in the same order as col_list, kept so the test set
# can be imputed with the same values later on.
median_list = []
for col in col_list:
  is_zero = X_train[col] == 0
  # Take the median over the valid (non-zero) readings only. Including the
  # zeros would pull the fill value down, and would make it 0 if at least
  # half of the column were zero.
  med = X_train.loc[~is_zero, col].median()
  X_train.loc[is_zero, col] = med
  median_list.append(med)

In [None]:
# Columns outside col_list (e.g. Pregnancies) are not imputed and may still show a minimum of 0.
X_train.describe().T # check that the imputed columns no longer have a minimum of 0

### 스케일링

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and variance from the training features only,
# then standardise the training features with those statistics.
ss = StandardScaler()
ss.fit(X_train)
X_train_s = ss.transform(X_train)
X_train_s # the result is a NumPy array, not a DataFrame

In [None]:
print(ss.mean_) # per-column mean learned during fit
print(ss.var_) # per-column variance learned during fit

In [None]:
# Convert the training target Series to a NumPy array.
y_train_e = y_train.to_numpy()
y_train_e

In [None]:
# Sanity check: shapes and container types of the training arrays.
print(X_train_s.shape, y_train_e.shape)
print(type(X_train_s), type(y_train_e))

## 모델 학습

In [None]:
from sklearn.svm import SVC

# Baseline SVM classifier with scikit-learn's default hyper-parameters,
# trained on the standardised training features.
clf = SVC(random_state=SEED)
clf.fit(X_train_s, y_train_e)

## 검증

In [None]:
# Same feature/target separation as for the training split.
y_test = test['Outcome']
X_test = test.drop(columns='Outcome')

X_test.shape, y_test.shape

In [None]:
# Pre-process the test features with the medians learned on the training set
# (median_list is ordered like col_list), so nothing is estimated from the test data.
col_list = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Cast to float first: a median can be fractional, and writing a float into an
# integer column is deprecated in recent pandas.
X_test[col_list] = X_test[col_list].astype(float)

# median_list is only read here. The earlier version appended a stale value to
# it on every iteration, which made the list grow each time the cell was run.
for col, train_median in zip(col_list, median_list):
  X_test.loc[X_test[col] == 0, col] = train_median

In [None]:
# transform() only: reuse the scaler fitted on the training data, without refitting on the test data.
X_test_s = ss.transform(X_test)
X_test_s

In [None]:
# Convert the test target Series to a NumPy array.
y_test_e = y_test.to_numpy()
y_test_e

In [None]:
# Sanity check: shapes and container types of the test arrays.
print(X_test_s.shape, y_test_e.shape)
print(type(X_test_s), type(y_test_e))

In [None]:
# Predict class labels for the standardised test features with the fitted classifier.
y_pred = clf.predict(X_test_s)
y_pred

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix

def print_metrics(y_true, y_pred, ave='binary'):
  """Print accuracy, recall, precision and F1, then plot the confusion matrix.

  Args:
    y_true: Ground-truth labels.
    y_pred: Predicted labels, aligned element-wise with ``y_true``.
    ave: Averaging mode forwarded as ``average=`` to the recall, precision
      and F1 scorers.
  """
  # Score against the y_true argument. Reading the global y_test_e here would
  # silently ignore whatever labels the caller passed in.
  print('accuracy:', accuracy_score(y_true, y_pred))
  print('recall:', recall_score(y_true, y_pred, average=ave))
  print('precision:', precision_score(y_true, y_pred, average=ave))
  print('f1 :', f1_score(y_true, y_pred, average=ave))

  # Rows of the matrix are actual classes, columns are predicted classes.
  matrix = confusion_matrix(y_true, y_pred)
  ax = sns.heatmap(matrix, annot=True, fmt='d', cbar=False)
  ax.set(xlabel='Predicted', ylabel='Actual')
  plt.show()


In [None]:
# Report the test-set metrics for the current predictions.
print_metrics(y_test_e, y_pred)

## 모델 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: every combination of C, gamma and kernel is tried.
prams_grid = {
    'C': [0.01, 0.02, 0.05, 0.1, 0.5, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly'],
}

# NOTE(review): this rebinds `clf` to a new, unfitted SVC; the previously
# fitted baseline model is no longer reachable through that name.
clf = SVC(random_state=SEED)
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=prams_grid,
    cv=3,  # 3-fold cross-validation per candidate
    n_jobs=-1,  # use all available CPU cores
    refit=True,  # retrain the best candidate on the full training data
    verbose=2,
    return_train_score=True,
)

grid_search.fit(X_train_s, y_train_e)

In [None]:
# Estimator refit with the best parameter combination (available because refit=True).
grid_search.best_estimator_

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Predict the test labels with the tuned model; this overwrites the earlier y_pred.
y_pred = grid_search.best_estimator_.predict(X_test_s)
y_pred

In [None]:
# Report the test-set metrics for the tuned model's predictions.
print_metrics(y_test_e, y_pred)