In [None]:
# Support Vector Machine (SVM)

# 간단한 예시
# Import Libraries
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt

# Generate a tiny toy dataset: four 2-D samples, two per class (+1 / -1)
data = np.array([[1, -1], [2, -2], [0, 1], [-1, 2]])
label = np.array([1, 1, -1, -1])
plt.scatter(data[:,0], data[:,1], c=label, s=50, cmap='spring')
plt.show()

# Train SVM
# C is left at its default here. The larger C is, the less training error is
# tolerated, so an excessively large C leads to overfitting.
svmmodel = svm.SVC(kernel='linear')
svmmodel.fit(data, label)

# Inspect the fitted model. The values are printed explicitly: a bare
# expression in the middle of a cell is evaluated and then discarded, so
# nothing would be displayed otherwise.
print('support_ :', svmmodel.support_)    # indices (into `data`) of the support vectors
print('support_vectors_ :\n', svmmodel.support_vectors_)    # the support vectors themselves
print('coef_ :', svmmodel.coef_)    # weight vector w (size: # of features)
print('intercept_ :', svmmodel.intercept_)    # bias (w0)

# Visualize the decision boundary and the two margin lines
w = svmmodel.coef_[0]
bias = svmmodel.intercept_[0]
slope = -w[0] / w[1]
margin = 1 / np.sqrt(np.sum(w ** 2))    # perpendicular distance from boundary to margin: 1 / ||w||

# `margin` is measured perpendicular to the boundary, but the lines below are
# shifted along the y-axis. For a line with this slope, the equivalent vertical
# shift is margin * sqrt(1 + slope^2) (which equals 1 / |w[1]|). Adding `margin`
# directly draws the dashed lines too close to the boundary, so they do not
# pass through the support vectors.
v_shift = margin * np.sqrt(1 + slope ** 2)

xx = np.linspace(-5,5)
yy = slope * xx - bias / w[1]    # decision boundary: w.x + bias = 0
yy1 = yy + v_shift    # margin line on one side of the boundary
yy2 = yy - v_shift    # margin line on the other side

print(xx.shape)
print(yy.shape)

plt.scatter(data[:,0], data[:,1], c=label, s=50, cmap='spring')
plt.plot(xx, yy, 'r')
plt.plot(xx, yy1, 'r--')
plt.plot(xx, yy2, 'r--')
plt.show()

In [None]:
# 복잡한 예시
# Import Libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split

# Generate two Gaussian blobs (400 samples, 2 features) and hold out 30% for testing
X, Y = datasets.make_blobs(n_samples=400, n_features=2, centers=2, random_state=3, cluster_std=1.0)
TrainX, TestX, TrainY, TestY = train_test_split(X, Y, test_size=0.3, random_state=0)
plt.scatter(TrainX[:,0], TrainX[:,1], c=TrainY, s=50, cmap='spring')
plt.show()

# Train SVM
svmmodel = svm.SVC(kernel='linear')
svmmodel.fit(TrainX, TrainY)

# Let's check out the parameters of SVM.
# They are printed explicitly: a bare expression in the middle of a cell is
# evaluated and then discarded, so nothing would be displayed otherwise.
print('support_ :', svmmodel.support_)    # indices (into TrainX) of the support vectors
print('support_vectors_ :\n', svmmodel.support_vectors_)    # the support vectors themselves
print('coef_ :', svmmodel.coef_)    # weight vector w (size: # of features)
print('intercept_ :', svmmodel.intercept_)    # bias (w0)

# Visualize the decision boundary and the two margin lines
w = svmmodel.coef_[0]
bias = svmmodel.intercept_[0]
slope = -w[0] / w[1]
margin = 1 / np.sqrt(np.sum(w ** 2))    # perpendicular distance from boundary to margin: 1 / ||w||

# `margin` is measured perpendicular to the boundary, but the lines below are
# shifted along the y-axis. For a line with this slope, the equivalent vertical
# shift is margin * sqrt(1 + slope^2) (which equals 1 / |w[1]|). Adding `margin`
# directly draws the dashed lines too close to the boundary.
v_shift = margin * np.sqrt(1 + slope ** 2)

# Draw the lines over the x-range the samples actually occupy (plus a little
# padding) instead of a fixed [-5, 5] window unrelated to where the blobs lie.
xx = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1)
yy = slope * xx - bias / w[1]    # decision boundary: w.x + bias = 0
yy1 = yy + v_shift    # margin line on one side of the boundary
yy2 = yy - v_shift    # margin line on the other side

print(xx.shape)
print(yy.shape)

plt.figure(figsize=(8,6))
plt.scatter(TrainX[:,0], TrainX[:,1], c=TrainY, s=50, cmap='spring')    # training samples
plt.scatter(TestX[:,0], TestX[:,1], c=TestY, marker='d', s=50, cmap='RdYlBu')    # test samples (diamonds)
plt.plot(xx, yy, 'r')
plt.plot(xx, yy1, 'r--')
plt.plot(xx, yy2, 'r--')
plt.show()

# Performance Evaluation, Accuracy
from sklearn import metrics

# Hard class predictions for the training and the test split
tr_pred, ts_pred = (svmmodel.predict(split) for split in (TrainX, TestX))

# Fraction of correctly classified samples per split
tr_acc = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred)
ts_acc = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred)
print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

# Confusion Matrix (training split first, then test split)
tr_cmat = metrics.confusion_matrix(y_true=TrainY, y_pred=tr_pred)
ts_cmat = metrics.confusion_matrix(y_true=TestY, y_pred=ts_pred)
print(tr_cmat)
print(ts_cmat)

# ROC & AUC analysis
# ROC/AUC cannot be computed from hard labels; they need a continuous score
# per sample (related to its distance from the hyperplane). The model is
# therefore refit with probability=True so that predict_proba is available.
svmmodel = svm.SVC(probability=True, kernel='linear').fit(TrainX, TrainY)

# predict_proba returns one column per class; column 1 is used as the score
tr_score = svmmodel.predict_proba(TrainX)
ts_score = svmmodel.predict_proba(TestX)

tr_fpr, tr_tpr, tr_th = metrics.roc_curve(y_true=TrainY, y_score=tr_score[:, 1], pos_label=1)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(y_true=TestY, y_score=ts_score[:, 1], pos_label=1)

# Area under each ROC curve (reported after the figure)
tr_auc = metrics.roc_auc_score(y_true=TrainY, y_score=tr_score[:, 1])
ts_auc = metrics.roc_auc_score(y_true=TestY, y_score=ts_score[:, 1])

# The training curve is drawn twice as wide as the test curve
plt.plot(tr_fpr, tr_tpr, 'b:', label='Train', linewidth=10)
plt.plot(ts_fpr, ts_tpr, 'r--', label='Test', linewidth=5)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.legend(loc='best')
plt.show()

print('Training AUC : ', tr_auc)
print('Test AUC : ', ts_auc)

In [None]:
# Breast Cancer Wisconsin (Diagnostic) Dataset
# 569 instances (212 Malignant(악성종양), 357 Benign(양성종양))
# 30 numerical features (computed from a digitized image of a breast mass)
# 2 classes (Malignant, Benign)

# Import Libraries
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import matplotlib.pyplot as plt

# Load the dataset and split it: 30% for training, 70% for testing
wisconsin = datasets.load_breast_cancer()
TrainX, TestX, TrainY, TestY = train_test_split(
    wisconsin.data,
    wisconsin.target,
    test_size=0.7,
    random_state=0,
)

# Train SVM
# probability=True (rather than a plain linear SVC) because the ROC analysis
# further down calls predict_proba.
svmmodel = svm.SVC(probability=True, kernel="linear").fit(TrainX, TrainY)

# Accuracy
# Hard class predictions for the training and the test split
tr_pred, ts_pred = (svmmodel.predict(split) for split in (TrainX, TestX))

tr_acc = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred)
ts_acc = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred)
print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

# Confusion Matrix (training split first, then test split)
tr_cmat = metrics.confusion_matrix(y_true=TrainY, y_pred=tr_pred)
ts_cmat = metrics.confusion_matrix(y_true=TestY, y_pred=ts_pred)
print(tr_cmat)
print(ts_cmat)

# ROC & AUC analysis
# predict_proba returns one column per class; column 1 is used as the score
tr_score = svmmodel.predict_proba(TrainX)
ts_score = svmmodel.predict_proba(TestX)

tr_fpr, tr_tpr, tr_th = metrics.roc_curve(y_true=TrainY, y_score=tr_score[:, 1], pos_label=1)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(y_true=TestY, y_score=ts_score[:, 1], pos_label=1)

plt.plot(tr_fpr, tr_tpr, 'b:', label='Train', linewidth=5)
plt.plot(ts_fpr, ts_tpr, 'r--', label='Test', linewidth=5)
# Zoom in on the top-left corner of the ROC plane
plt.xlim(0.0, 0.3)
plt.ylim(0.7, 1.0)
plt.legend(loc='best')
plt.show()

In [None]:
# Parkinson's Disease Dataset

# Speech dataset from Parkinson's Disease (PD) patients and healthy subjects
# 26 features computed from speech recordings
# Training dataset: 300 instances (150 PD, 150 healthy)
# Test dataset: 300 instances (150 PD, 150 healthy)

# Training dataset: 'PD_train_smalldata.csv'
# Test dataset: 'PD_test_smalldata.csv'

# Import Libraries
from sklearn import datasets
from sklearn import svm
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the CSV files, take a quick look, and split them into features / labels.
# Previews and counts are printed explicitly: a bare expression in the middle
# of a cell is evaluated and then discarded, so nothing would be displayed.
tr_dataset = pd.read_csv("./PD_train_smalldata.csv")
print(tr_dataset.head())

print('Training rows with Class == 0 :', sum(tr_dataset.Class == 0))    # how many class-0 samples?

TrainX = tr_dataset.iloc[:, :-1]    # every column except the last one (assumes `Class` is the last column - TODO confirm)
TrainY = tr_dataset.Class

ts_dataset = pd.read_csv("./PD_test_smalldata.csv")
print(ts_dataset.head())

TestX = ts_dataset.iloc[:, :-1]    # same column selection as for the training set
TestY = ts_dataset.Class

# Normalize Data : Rescale the range of the values
# 0-1 range : rescale every feature into [0, 1] -> (x - x_min) / (x_max - x_min)
# Features with small values are dominated by features with large values, so
# the model ends up focusing on the large ones. The raw values should therefore
# not be used as they are; normalizing first is recommended.

# Check the per-feature min / max. The x-axis is derived from the number of
# feature columns instead of a hard-coded 26.
feature_idx = np.arange(TrainX.shape[1])
plt.plot(feature_idx, TrainX.min(), 'b')
plt.plot(feature_idx, TrainX.max(), 'r')
plt.show()

# Normalize
TrainXmin = TrainX.min()
TrainXmax = TrainX.max()

# A feature that is constant in the training set has max == min. Dividing by
# that zero range would produce NaN, so such ranges are replaced by 1 (the
# feature then simply maps to 0 on the training set).
TrainXrange = (TrainXmax - TrainXmin).replace(0, 1)

TrainX = (TrainX - TrainXmin) / TrainXrange
TestX = (TestX - TrainXmin) / TrainXrange    # *Caution*: scaled with the TRAINING min / range, not the test set's own

# After scaling, every training feature should have min 0 and max 1
# (a constant feature, if any, has both at 0).
plt.plot(feature_idx, TrainX.min(), 'b')
plt.plot(feature_idx, TrainX.max(), 'r')
plt.show()

# Train SVM (linear kernel; probability=True because predict_proba is used for the ROC curves)
svmmodel = svm.SVC(probability=True, kernel="linear").fit(TrainX, TrainY)

# Performance Evaluation, Accuracy
# Hard class predictions for the training and the test split
tr_pred, ts_pred = (svmmodel.predict(split) for split in (TrainX, TestX))

tr_acc = metrics.accuracy_score(y_true=TrainY, y_pred=tr_pred)
ts_acc = metrics.accuracy_score(y_true=TestY, y_pred=ts_pred)
print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)
# Author's observation: with only a linear kernel the accuracy is too low.

# Confusion Matrix (training split first, then test split)
tr_cmat = metrics.confusion_matrix(y_true=TrainY, y_pred=tr_pred)
ts_cmat = metrics.confusion_matrix(y_true=TestY, y_pred=ts_pred)
print(tr_cmat)
print(ts_cmat)

# ROC & AUC analysis
# predict_proba returns one column per class; column 1 is used as the score
tr_score = svmmodel.predict_proba(TrainX)
ts_score = svmmodel.predict_proba(TestX)

tr_fpr, tr_tpr, tr_th = metrics.roc_curve(y_true=TrainY, y_score=tr_score[:, 1], pos_label=1)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(y_true=TestY, y_score=ts_score[:, 1], pos_label=1)

plt.plot(tr_fpr, tr_tpr, 'b:', label='Train', linewidth=5)
plt.plot(ts_fpr, ts_tpr, 'r--', label='Test', linewidth=5)
plt.legend(loc='best')
plt.show()

# Kernel을 변경하기 용이하도록 function 정의
def SVMmodel(TrainX, TrainY, TestX, TestY, kernel="rbf"):
  """Fit an SVC with the requested kernel and report how it performs.

  Prints the training and test accuracy, shows one figure containing the
  training and test ROC curves, then prints the training and test AUC.
  Nothing is returned.

  Args:
    TrainX, TrainY: features and labels the classifier is fitted on.
    TestX, TestY: features and labels of the held-out split.
    kernel: forwarded to ``svm.SVC(kernel=...)``; defaults to "rbf".

  Note:
    Column 1 of ``predict_proba`` is used as the score, and the ROC curves
    are computed with ``pos_label=1``.
  """
  classifier = svm.SVC(kernel=kernel, probability=True)
  classifier.fit(TrainX, TrainY)

  # Accuracy of the hard class predictions on each split
  train_accuracy = metrics.accuracy_score(TrainY, classifier.predict(TrainX))
  test_accuracy = metrics.accuracy_score(TestY, classifier.predict(TestX))
  print('Training Accuracy : ', train_accuracy)
  print('Test Accuracy : ', test_accuracy)

  # Continuous per-sample scores, needed for ROC / AUC
  train_score = classifier.predict_proba(TrainX)[:, 1]
  test_score = classifier.predict_proba(TestX)[:, 1]

  # The thresholds returned by roc_curve are not used
  train_fpr, train_tpr, _ = metrics.roc_curve(TrainY, train_score, pos_label=1)
  test_fpr, test_tpr, _ = metrics.roc_curve(TestY, test_score, pos_label=1)

  plt.plot(train_fpr, train_tpr, 'b:', label='Train', linewidth=5)
  plt.plot(test_fpr, test_tpr, 'r--', label='Test', linewidth=5)
  plt.legend(loc='best')
  plt.show()

  # AUC is reported after the figure, training split first
  print('Training AUC : ', metrics.roc_auc_score(TrainY, train_score))
  print('Test AUC : ', metrics.roc_auc_score(TestY, test_score))

# Try different kernels to look for a better model.
#
# linear : Linear Kernel - no transformation is applied.
# rbf    : Radial Basis Function Kernel - the default setting. Author's note:
#          gamma plays a role similar to C; as it grows, the training data is
#          given more importance, which leads to overfitting.
# poly   : Polynomial Kernel (2D -> 3D) - projects data that cannot be
#          separated linearly into another space where it can be separated.
for kernel_name in ("linear", "rbf", "poly"):
  SVMmodel(TrainX, TrainY, TestX, TestY, kernel=kernel_name)

# 시간날 떄 읽어보자 : https://ekdud7667.tistory.com/entry/SVM1