# **Support Vector Machine (SVM)**

Import Libraries

In [None]:
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt

In [None]:
# Toy training set: four 2-D points, two labelled +1 and two labelled -1.
data = np.array([[1, -1], [2, -2], [0, 1], [-1, 2]])
label = np.array([1, 1, -1, -1])

# data.T yields the two feature columns, which become the x and y coordinates;
# each point is coloured by its class label.
plt.scatter(*data.T, c=label, s=50, cmap='spring')
plt.show()

In [None]:
# Fit a linear-kernel SVC on the toy points; the fit call is the last
# expression, so the fitted estimator is what the cell displays.
svmmodel = svm.SVC(kernel='linear')
svmmodel.fit(data, label)

In [None]:
svmmodel.support_    # indices of the support vectors, i.e. which training samples are support vectors

In [None]:
svmmodel.support_vectors_    # the support vectors themselves (training points, not indices)

In [None]:
svmmodel.coef_    # weight vector w (one weight per feature); the plotting cell reads it as coef_[0]

In [None]:
svmmodel.intercept_    # bias term (w0); the plotting cell reads it as intercept_[0]

In [None]:
# Decision boundary: w[0]*x + w[1]*y + b = 0  =>  y = slope*x - b/w[1].
# NOTE: dividing by w[1] assumes the boundary is not vertical (w[1] != 0).
w = svmmodel.coef_[0]
slope = -w[0] / w[1]
margin = 1 / np.sqrt(np.sum(w ** 2))    # perpendicular distance from boundary to margin

# `margin` is measured perpendicular to the boundary, but the dashed lines are
# obtained by shifting the boundary along the y axis. The vertical shift that
# corresponds to a perpendicular distance `margin` is sqrt(1 + slope^2) * margin
# (which equals 1/|w[1]|); using `margin` directly draws the lines too close.
margin_dy = np.sqrt(1 + slope ** 2) * margin

xx = np.linspace(-5,5)
yy = slope * xx - svmmodel.intercept_[0] / w[1]    # decision boundary
yy1 = yy + margin_dy    # margin line on one side of the boundary
yy2 = yy - margin_dy    # margin line on the other side

print(xx.shape)
print(yy.shape)

plt.scatter(data[:,0], data[:,1], c=label, s=50, cmap='spring')
plt.plot(xx, yy, 'r')
plt.plot(xx, yy1, 'r--')
plt.plot(xx, yy2, 'r--')
plt.show()

Import Libraries

In [None]:
from sklearn import datasets
from sklearn import svm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

Generate some (random) data

In [None]:
# 400 two-feature samples grouped around 2 centers, generated with a fixed seed.
X, Y = datasets.make_blobs(
    n_samples=400, n_features=2, centers=2, cluster_std=1.0, random_state=3
)

# Hold out 30% of the samples for testing (seeded split).
TrainX, TestX, TrainY, TestY = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

# Training points only, coloured by class; TrainX.T yields the two feature columns.
plt.scatter(*TrainX.T, c=TrainY, s=50, cmap='spring')
plt.show()

**Train SVM**

In [None]:
# Fit a linear-kernel SVC on the training split only; the fit call is the last
# expression, so the fitted estimator is what the cell displays.
svmmodel = svm.SVC(kernel='linear')
svmmodel.fit(TrainX, TrainY)

Let's inspect the parameters learned by the fitted SVM

In [None]:
svmmodel.support_    # indices (into the training set) of the support vectors

In [None]:
svmmodel.support_vectors_    # the support vectors themselves (training points, not indices)

In [None]:
svmmodel.coef_    # weight vector w (one weight per feature); the plotting cell reads it as coef_[0]

In [None]:
svmmodel.intercept_    # bias term (w0); the plotting cell reads it as intercept_[0]

In [None]:
# Decision boundary: w[0]*x + w[1]*y + b = 0  =>  y = slope*x - b/w[1].
# NOTE: dividing by w[1] assumes the boundary is not vertical (w[1] != 0).
w = svmmodel.coef_[0]
slope = -w[0] / w[1]
margin = 1 / np.sqrt(np.sum(w ** 2))    # perpendicular distance from boundary to margin

# `margin` is measured perpendicular to the boundary, but the dashed lines are
# obtained by shifting the boundary along the y axis. The vertical shift that
# corresponds to a perpendicular distance `margin` is sqrt(1 + slope^2) * margin
# (which equals 1/|w[1]|); using `margin` directly draws the lines too close.
margin_dy = np.sqrt(1 + slope ** 2) * margin

xx = np.linspace(-5,5)
yy = slope * xx - svmmodel.intercept_[0] / w[1]    # decision boundary
yy1 = yy + margin_dy    # margin line on one side of the boundary
yy2 = yy - margin_dy    # margin line on the other side

print(xx.shape)
print(yy.shape)

# Training points as circles, test points as diamonds in a different colormap.
plt.figure(figsize=(8,6))
plt.scatter(TrainX[:,0], TrainX[:,1], c=TrainY, s=50, cmap='spring')
plt.scatter(TestX[:,0], TestX[:,1], c=TestY, marker='d', s=50, cmap='RdYlBu')
plt.plot(xx, yy, 'r')
plt.plot(xx, yy1, 'r--')
plt.plot(xx, yy2, 'r--')
plt.show()

**Performance Evaluation**

In [None]:
from sklearn import metrics

Accuracy

In [None]:
# Hard class predictions for each split.
tr_pred = svmmodel.predict(TrainX)
ts_pred = svmmodel.predict(TestX)

# Accuracy of those predictions against the true labels.
tr_acc = metrics.accuracy_score(TrainY, tr_pred)
ts_acc = metrics.accuracy_score(TestY, ts_pred)

print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

Confusion Matrix

In [None]:
# Confusion matrices built from the predictions of the accuracy cell.
tr_cmat = metrics.confusion_matrix(TrainY, tr_pred)
ts_cmat = metrics.confusion_matrix(TestY, ts_pred)

# Training matrix first, then test.
print(tr_cmat)
print(ts_cmat)

ROC Analysis

In [None]:
# Refit with probability=True so predict_proba can supply scores for the ROC analysis.
svmmodel = svm.SVC(kernel='linear', probability=True)
svmmodel.fit(TrainX, TrainY)

# Per-class probability estimates for each split (training first, then test).
tr_score, ts_score = [svmmodel.predict_proba(split) for split in (TrainX, TestX)]

In [None]:
# ROC curves; column 1 of the probability estimates is the score for pos_label=1.
tr_fpr, tr_tpr, tr_th = metrics.roc_curve(TrainY, tr_score[:, 1], pos_label=1)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(TestY, ts_score[:, 1], pos_label=1)

# The training curve is drawn thicker so it stays visible underneath the test curve.
plt.plot(tr_fpr, tr_tpr, 'b:', linewidth=10, label='Train')
plt.plot(ts_fpr, ts_tpr, 'r--', linewidth=5, label='Test')

# Show the full unit square.
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.legend(loc='best')
plt.show()

In [None]:
# Area under the ROC curve, scored with the same column-1 probabilities.
tr_auc = metrics.roc_auc_score(TrainY, tr_score[:, 1])
ts_auc = metrics.roc_auc_score(TestY, ts_score[:, 1])

print('Training AUC : ', tr_auc)
print('Test AUC : ', ts_auc)

# **Breast Cancer Wisconsin (Diagnostic) Dataset**
*   569 instances (212 Malignant, 357 Benign)
*   30 numerical features (computed from a digitized image of a breast mass)
*   2 classes (Malignant, Benign)


Import Libraries

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import matplotlib.pyplot as plt

**Prepare Data**

In [None]:
# Load the breast cancer dataset and split features/targets with a fixed seed.
# NOTE(review): test_size=0.7 holds out 70% of the samples for testing, so only
# 30% is used for training (the blobs example uses test_size=0.3) — confirm
# that this small training fraction is intended.
wisconsin = datasets.load_breast_cancer()
TrainX, TestX, TrainY, TestY = train_test_split(wisconsin.data, wisconsin.target, test_size=0.7, random_state=0)

**Train SVM**

In [None]:
# probability=True is set so that predict_proba can be called in the ROC
# analysis below; for accuracy and the confusion matrix alone,
# svm.SVC(kernel="linear") would be enough.
svmmodel = svm.SVC(kernel="linear", probability=True)
svmmodel.fit(TrainX, TrainY)

**Performance Evaluation**

Accuracy

In [None]:
# Hard class predictions for each split.
tr_pred = svmmodel.predict(TrainX)
ts_pred = svmmodel.predict(TestX)

# Accuracy of those predictions against the true labels.
tr_acc = metrics.accuracy_score(TrainY, tr_pred)
ts_acc = metrics.accuracy_score(TestY, ts_pred)

print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

Confusion Matrix

In [None]:
# Confusion matrices built from the predictions of the accuracy cell.
tr_cmat = metrics.confusion_matrix(TrainY, tr_pred)
ts_cmat = metrics.confusion_matrix(TestY, ts_pred)

# Training matrix first, then test.
print(tr_cmat)
print(ts_cmat)

ROC Analysis

In [None]:
# Per-class probability estimates for each split (training first, then test).
tr_score, ts_score = [svmmodel.predict_proba(split) for split in (TrainX, TestX)]

# ROC curves; column 1 of the probability estimates is the score for pos_label=1.
tr_fpr, tr_tpr, tr_th = metrics.roc_curve(TrainY, tr_score[:, 1], pos_label=1)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(TestY, ts_score[:, 1], pos_label=1)

plt.plot(tr_fpr, tr_tpr, 'b:', linewidth=5, label='Train')
plt.plot(ts_fpr, ts_tpr, 'r--', linewidth=5, label='Test')

# Zoom into the top-left corner of the ROC plane (FPR 0-0.3, TPR 0.7-1).
plt.xlim(0.0, 0.3)
plt.ylim(0.7, 1.0)
plt.legend(loc='best')
plt.show()

In [None]:
# Area under the ROC curve, scored with the same column-1 probabilities.
tr_auc = metrics.roc_auc_score(TrainY, tr_score[:, 1])
ts_auc = metrics.roc_auc_score(TestY, ts_score[:, 1])

print('Training AUC : ', tr_auc)
print('Test AUC : ', ts_auc)

# **Parkinson's Disease Dataset**

*   Speech dataset from Parkinson's Disease (PD) patients and healthy subjects
*   26 features computed from speech recordings
*   Training dataset: 300 instances (150 PD, 150 healthy)
*   Test dataset: 300 instances (150 PD, 150 healthy)

*   Training dataset: 'PD_train_smalldata.csv'
*   Test dataset: 'PD_test_smalldata.csv'


In [None]:
from sklearn import datasets
from sklearn import svm
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Optional, Google Colab only: uncomment the two lines below to mount Google Drive.
# (Presumably needed when the CSV files live on Drive rather than next to the
# notebook — the read_csv calls below use relative paths; adjust them if so.)
# from google.colab import drive
# drive.mount('/content/gdrive')

Prepare Data

In [None]:
# Read the training CSV from the notebook's working directory and display it.
tr_dataset = pd.read_csv("./PD_train_smalldata.csv")
tr_dataset

In [None]:
# Number of training rows whose Class value is 0.
(tr_dataset.Class == 0).sum()

In [None]:
# Labels come from the `Class` column; features are every column except the last.
# NOTE(review): this assumes `Class` is the last column of the CSV — confirm.
TrainY = tr_dataset["Class"]
TrainX = tr_dataset.iloc[:, :-1]

In [None]:
# Read the test CSV from the notebook's working directory.
ts_dataset = pd.read_csv("./PD_test_smalldata.csv")

# Features are every column except the last; labels come from the `Class` column.
# NOTE(review): this assumes `Class` is the last column of the CSV — confirm.
TestX = ts_dataset.iloc[:, :-1]
TestY = ts_dataset.Class

# Only the final expression of a cell is rendered, so the preview has to come
# last; placed in the middle of the cell it was evaluated and silently discarded.
ts_dataset

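Normalize Data: rescale the range of each feature's values, using the minimum and maximum computed on the training set (the same values are then applied to the test set)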
*   0-1 range : rescale the values between 0 and 1
>$x_{new} = \frac{x-x_{min}}{x_{max}-x_{min}}$

In [None]:
# Per-feature minimum (blue) and maximum (red) of the training features.
# The x positions are derived from the actual number of columns instead of a
# hard-coded 26, so the plot keeps working if the feature count changes.
feature_idx = np.arange(TrainX.shape[1])
plt.plot(feature_idx, TrainX.min(), 'b', label='min')
plt.plot(feature_idx, TrainX.max(), 'r', label='max')
plt.xlabel('feature index')
plt.legend(loc='best')
plt.show()

In [None]:
# Min-max scaling with statistics taken from the training set only; the same
# statistics are reused for the test set so both splits share one scale
# (test values can therefore fall slightly outside [0, 1]).
TrainXmin = TrainX.min()
TrainXmax = TrainX.max()

# A constant feature has max == min; dividing by that zero range would produce
# NaN/inf values, which SVC.fit rejects. Use a range of 1 for such features so
# they simply map to 0.
TrainXrange = (TrainXmax - TrainXmin).replace(0, 1)

TrainX = (TrainX - TrainXmin) / TrainXrange
TestX = (TestX - TrainXmin) / TrainXrange

In [None]:
# Per-feature minimum (blue) and maximum (red) of the training features.
# The x positions are derived from the actual number of columns instead of a
# hard-coded 26, so the plot keeps working if the feature count changes.
feature_idx = np.arange(TrainX.shape[1])
plt.plot(feature_idx, TrainX.min(), 'b', label='min')
plt.plot(feature_idx, TrainX.max(), 'r', label='max')
plt.xlabel('feature index')
plt.legend(loc='best')
plt.show()

**Train SVM**

In [None]:
# Fit a linear-kernel SVC on the training features; probability=True is set so
# that predict_proba can be called in the ROC analysis below.
svmmodel = svm.SVC(kernel="linear", probability=True)
svmmodel.fit(TrainX, TrainY)

**Performance Evaluation**

Accuracy

In [None]:
# Hard class predictions for each split.
tr_pred = svmmodel.predict(TrainX)
ts_pred = svmmodel.predict(TestX)

# Accuracy of those predictions against the true labels.
tr_acc = metrics.accuracy_score(TrainY, tr_pred)
ts_acc = metrics.accuracy_score(TestY, ts_pred)

print('Training Accuracy : ', tr_acc)
print('Test Accuracy : ', ts_acc)

Confusion Matrix

In [None]:
# Confusion matrices built from the predictions of the accuracy cell.
tr_cmat = metrics.confusion_matrix(TrainY, tr_pred)
ts_cmat = metrics.confusion_matrix(TestY, ts_pred)

# Training matrix first, then test.
print(tr_cmat)
print(ts_cmat)

ROC Analysis

In [None]:
# Per-class probability estimates for each split (training first, then test).
tr_score, ts_score = [svmmodel.predict_proba(split) for split in (TrainX, TestX)]

# ROC curves; column 1 of the probability estimates is the score for pos_label=1.
tr_fpr, tr_tpr, tr_th = metrics.roc_curve(TrainY, tr_score[:, 1], pos_label=1)
ts_fpr, ts_tpr, ts_th = metrics.roc_curve(TestY, ts_score[:, 1], pos_label=1)

# Axis limits are left to matplotlib's defaults here.
plt.plot(tr_fpr, tr_tpr, 'b:', linewidth=5, label='Train')
plt.plot(ts_fpr, ts_tpr, 'r--', linewidth=5, label='Test')
plt.legend(loc='best')
plt.show()

In [None]:
# Area under the ROC curve, scored with the same column-1 probabilities.
tr_auc = metrics.roc_auc_score(TrainY, tr_score[:, 1])
ts_auc = metrics.roc_auc_score(TestY, ts_score[:, 1])

print('Training AUC : ', tr_auc)
print('Test AUC : ', ts_auc)

In [None]:
def SVMmodel(TrainX, TrainY, TestX, TestY, kernel="rbf"):
  """Fit an SVC with the given kernel and report its train/test performance.

  Prints the training and test accuracy, draws both ROC curves in one figure,
  then prints the training and test AUC. Nothing is returned.

  Args:
    TrainX: training features, passed to fit/predict/predict_proba.
    TrainY: training labels.
    TestX: test features.
    TestY: test labels.
    kernel: kernel name forwarded to svm.SVC (default "rbf").
  """
  classifier = svm.SVC(kernel=kernel, probability=True)
  classifier.fit(TrainX, TrainY)

  # Accuracy of the hard class predictions on each split.
  train_labels = classifier.predict(TrainX)
  test_labels = classifier.predict(TestX)
  print('Training Accuracy : ', metrics.accuracy_score(TrainY, train_labels))
  print('Test Accuracy : ', metrics.accuracy_score(TestY, test_labels))

  # Column 1 of the probability estimates is the score used for pos_label=1.
  train_prob = classifier.predict_proba(TrainX)[:, 1]
  test_prob = classifier.predict_proba(TestX)[:, 1]

  # Thresholds returned by roc_curve are not needed for the plot.
  train_fpr, train_tpr, _ = metrics.roc_curve(TrainY, train_prob, pos_label=1)
  test_fpr, test_tpr, _ = metrics.roc_curve(TestY, test_prob, pos_label=1)

  plt.plot(train_fpr, train_tpr, 'b:', linewidth=5, label='Train')
  plt.plot(test_fpr, test_tpr, 'r--', linewidth=5, label='Test')
  plt.legend(loc='best')
  plt.show()

  print('Training AUC : ', metrics.roc_auc_score(TrainY, train_prob))
  print('Test AUC : ', metrics.roc_auc_score(TestY, test_prob))


In [None]:
# Fit and evaluate with a linear kernel.
SVMmodel(TrainX, TrainY, TestX, TestY, kernel="linear")

In [None]:
# Fit and evaluate with an RBF kernel (also the function's default).
SVMmodel(TrainX, TrainY, TestX, TestY, kernel="rbf")


In [None]:
# Fit and evaluate with a polynomial kernel.
SVMmodel(TrainX, TrainY, TestX, TestY, kernel="poly")