In [30]:
from sklearn import datasets
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
from cvxopt import matrix, solvers
from scipy.spatial.distance import cdist
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [31]:
cancer_data = datasets.load_breast_cancer()
# show to test record 5th
print(cancer_data.data[5])
print(cancer_data.data.shape)
#target set
print(cancer_data.target)

[1.245e+01 1.570e+01 8.257e+01 4.771e+02 1.278e-01 1.700e-01 1.578e-01
 8.089e-02 2.087e-01 7.613e-02 3.345e-01 8.902e-01 2.217e+00 2.719e+01
 7.510e-03 3.345e-02 3.672e-02 1.137e-02 2.165e-02 5.082e-03 1.547e+01
 2.375e+01 1.034e+02 7.416e+02 1.791e-01 5.249e-01 5.355e-01 1.741e-01
 3.985e-01 1.244e-01]
(569, 30)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 

In [32]:
target = cancer_data.target
target[target == 0] = -1
target

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1,  1,  1,  1,
        1,  1, -1, -1,  1, -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1,  1,
        1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1,  1, -1, -1,  1,
       -1, -1, -1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1, -1,  1,
        1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
       -1,  1, -1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1, -1,  1, -1, -1,
        1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1, -1, -1,  1,
        1, -1, -1,  1,  1,  1,  1, -1,  1,  1, -1, -1, -1,  1, -1,  1, -1,
        1,  1,  1, -1,  1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1,
        1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,  1,
        1,  1, -1,  1,  1

In [33]:
X_train, X_test, y_train, y_test = train_test_split(cancer_data.data, target, test_size=0.3, random_state=109)

In [34]:
X0 = X_train[y_train == 1] # class 1
X1 = X_train[y_train == -1] # class -1
X = np.concatenate((X0.T, X1.T), axis = 1) # all data
y = y_train.reshape(1, 398) # labels

In [39]:
# build P ~ K
V = np.concatenate((X0.T, -X1.T), axis = 1)
P = matrix(V.T.dot(V), tc='d') # P ~ K in slide see definition of V, K near eq (8)
q = matrix(-np.ones((V.shape[1], 1)), tc='d') # all-one vector
# build A, b, G, h
G = matrix(-np.eye(V.shape[1]), tc='d') # for all lambda_n >= 0! note that we solve -g(lambda) -> min
h = matrix(np.zeros((V.shape[1], 1)), tc='d')
A = matrix(y, tc='d') # the equality constrain is actually y^T lambda = 0
b = matrix(np.zeros((1, 1)), tc='d')

solvers.options['show_progress'] = False
sol = solvers.qp(P, q, G, h, A, b)
l = np.array(sol['x']) # lambda
print('lambda = ')
print(l.T)

lambda = 
[[3.59440407e-176 1.26150511e-175 1.95750392e-176 1.63736209e-173
  3.03577431e-176 8.25845088e-175 1.65274695e+004 6.40388043e-176
  3.95842748e-176 2.72385779e-175 4.29913851e-176 5.21321552e-176
  5.56669296e-176 3.46631561e-176 3.75517932e-176 4.67954381e-176
  1.66323137e+002 2.74735744e-176 5.42225344e-176 3.61437839e-176
  9.63103303e-176 2.10120286e-176 4.13537898e+003 7.82394124e-176
  2.02859020e-176 6.65565194e-176 7.15485463e-176 2.97223125e-176
  4.10281201e-176 1.02128562e-175 3.82962695e-176 4.69675867e-176
  8.64233843e-176 3.99727866e-176 8.84624147e+004 2.91768751e-176
  2.95078571e-176 3.34943297e-176 3.19781818e-176 6.65851306e-176
  7.01335765e-176 2.85449167e-176 6.93183487e-176 1.67167778e-175
  8.16071887e-176 8.88980476e-176 3.58138845e-176 2.97118574e-176
  5.63888851e-176 2.49083814e-176 2.83324541e-176 6.01567431e-176
  2.91687228e-176 1.95255406e-175 1.41058472e-175 2.46642996e-176
  2.10061472e-174 5.42975972e-176 6.34726090e-176 4.87498467e-176


In [40]:
epsilon = 1e-6 # just a small number, greater than 1e-9, to filter values of lambda
S = np.where(l > epsilon)[0]
VS = V[:, S]
XS = X[:, S]
yS = y[:, S]
lS = l[S]
# calculate w and b
w = VS.dot(lS)
w0 = np.mean(yS.T - w.T.dot(XS))
print('W = ', w.T)
print('W0 = ', w0)

W =  [[ 2.04794581e+01 -9.89326614e-01 -1.87112227e+00 -2.81767026e-02
  -1.55731500e+02  8.87770547e+01 -3.33041867e+01 -1.58513336e+02
   2.01319269e+02 -2.16476782e+02 -1.89079422e+01  4.66143581e+00
  -7.57089789e+00  4.24400195e-02  9.38221564e+01 -3.69202424e+02
   4.00597718e+02 -2.19620117e+02  1.85129372e+02 -1.86562919e+01
   2.30498123e-02 -6.09667089e-01  7.91786118e-01 -1.13544017e-01
   9.77366506e+00  1.00004992e+02 -7.71734884e+01 -1.70124974e+02
  -1.47764268e+02  7.94040967e+01]]
W0 =  -1.3358484544539588


In [41]:
def predict(w0, w, X):
    return np.sign(w.T @ X + w0).flatten()
    

In [None]:
y_pred = predict(w0, w, X_test.T)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred))

Accuracy: 0.9649122807017544
Confusion Matrix: [[ 61   2]
 [  4 104]]


### Use Scikit-learn

In [45]:
from sklearn.svm import SVC

clf = SVC(kernel = 'linear', C = 1e5) 

# if C is small, method will be “SoftMagin SVM”,
# if C is large enough, method is near to hard margin

clf.fit(X_train, y_train)
w = clf.coef_
w0 = clf.intercept_
print('w = ', w)
print('W0 = ', w0)

w =  [[ 1.55841303e+04  5.69829895e+03  6.01236606e+02 -2.11652064e+02
  -7.58418989e+03 -3.58287335e+03 -2.28082132e+04 -1.67774049e+04
  -2.14646639e+04  2.35627498e+02 -4.23531681e+03  3.41852567e+04
  -7.07061016e+02 -1.82451704e+03 -1.29485985e+03  5.13476398e+03
  -1.07869294e+03 -3.52075548e+03 -5.84510533e+03  3.23810408e+02
   1.32407386e+03 -8.43438203e+03 -6.60376568e+02 -6.59702826e+00
  -1.27800564e+04  1.10353669e+04 -4.11879410e+04 -3.48086330e+04
  -6.38834076e+04  8.62849570e+02]]
W0 =  [88059.708578]


In [47]:
y_pred_sk = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_sk))
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_sk))

Accuracy: 0.9649122807017544
Confusion Matrix: [[ 61   2]
 [  4 104]]


### Nhận xét:
Khi sử dụng thư viện và khi xây dựng từng bước, bộ hệ số thu được khá khác nhau. Nhưng độ chính xác thì vẫn tương đương.