In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from matplotlib.pyplot import figure
from mlxtend.plotting import plot_confusion_matrix
from mlxtend.plotting import plot_decision_regions
from pandas import DataFrame
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import make_circles
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

try:
    # Public import path for make_blobs.
    from sklearn.datasets import make_blobs
except ImportError:
    # Legacy private module; it was removed in scikit-learn 0.24, so this
    # branch only matters on very old installations.
    from sklearn.datasets.samples_generator import make_blobs

# Linear Algebra
$A = (a_1, a_2, \dots, a_n)$ \\
$B = (b_1, b_2, \dots, b_n)$

- **Dot Product** 
<br>
$A \cdot B = \sum\limits_{i=1}^{n}a_i*b_i = a_1*b_1 + a_2*b_2 + \cdots + a_n*b_n$

# Concepts

*   ${(x_1, y_1), (x_2, y_2), \dots, (x_n, y_n)}$ - dataset D
*   $ x_i = (x_{i1}, x_{i2}, \dots , x_{ir})$ - a single sample with r attributes/features
*   $y_i \in \{\color{green}{+1}, \color{red}{-1}\}$
  *  $\color{green}{+1}$ - denotes positive class
  *  $\color{red}{-1}$ - denotes negative class



# SVM

- classifier function: 
$f(x) = \langle w \cdot x \rangle + b = w_1*x_1 + w_2*x_2 + \cdots + w_r*x_r + b$
  - $f:X \subseteq \Re^r \rightarrow \Re$
  - $w=(w_1, w_2, \dots, w_r) \in \Re^r$  - weight vector
  - $b \in \Re$ - bias term (a scalar)
  - $\langle w \cdot x \rangle$ - dot product of w and x (Euclidean inner product)

- $x_i$ is assigned to a positive class if $f(x_i)\ge0$, and to the negative class otherwise \\
$y_i=\begin{cases}
    \color{green}{+1},& \text{if } \langle w \cdot x_i \rangle + b \geq 0\\
    \color{red}{-1},& \text{if } \langle w \cdot x_i \rangle + b < 0\\
\end{cases}$

- $\langle w \cdot x_i \rangle + b = 0 \rightarrow$  becomes the hyperplane (also called decision boundary or decision surface) that separates the positive and negative training examples 

A linear SVM works only with linearly separable data. In case the data is not linearly separable, we make use of kernel functions. In the following sections, both cases are presented.

Another problem with the SVM is that it is limited to binary classification. This issue can be addressed by using **one-versus-all** classification.

# Linear SVM

### Notations
- $\textbf{w}$: vector perpendicular on the plane (normal vector of the hyperplane)
- $\textbf{b}$: used to move hyperplane parallel to itself
- $\color{green}{d_+}|\color{red}{d_-}$: shortest distance from the separating hyperplane to the closest <font color="green">positive</font> | <font color="red">negative</font> datapoint
-  $\color{green}{X_+}|\color{red}{X_-}$: support vectors
- $\textbf{margin} = \color{green}{d_+} + \color{red}{d_-}$

### Objective Function
- find the hyperplane with the largest margin
- for an indepth mathematical intuition please check the following resources:
  - [An Idiot’s guide to Support vector
machines (SVMs)](https://drive.google.com/file/d/1p-l0girUFg8-GuF7IQLKnOLEnOvLpZf4/view?usp=sharing)
  - [Lecture 15 - Kernel Methods from Caltech](https://www.youtube.com/watch?v=XUj5JbQihlU)
  - [Pattern Recognition and Machine Learning, chapter 7, C. Bishop](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf)
![](http://drive.google.com/uc?export=view&id=1abobdW1W2HJBXXbnHBYHcXPS_lsvMHj5)




The following dataset is generated from scikit-learn, by using [make_blobs](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html) where:
- **n_samples**: total number of points generated
- **centers**: number of classes (clusters)
- **cluster_std**: cluster's standard deviation, if cluster_std is zero, all points from a cluster will overlap
- **random_state**: seed used to generate datapoints, if you pass an int it will reproduce the output at every call

In [None]:
# Toy dataset: 50 points drawn from two Gaussian clusters (fixed seed, so the
# same points are produced on every run).
X, y = make_blobs(
    n_samples=50,
    centers=2,
    cluster_std=0.6,
    random_state=0,
)

# X.T unpacks into the two feature columns; colour encodes the class label.
plt.scatter(*X.T, c=y, cmap='winter')
plt.show()

As you can imagine, these datapoints are linearly separable. Linear separability refers to the fact that classes can be separated with a decision surface. (1D data can be separated by a point, 2D can be separated by a line, 3D can be separated by a plane, etc.). Now, let's try to draw some lines that can separate these points by finding certain values for the weight and the bias presented above in order to draw decision lines. 

The idea of this algorithm is to find the best decision line that separates data for unseen points. Even if each of the three lines drawn below separates the actual data we are seeing, this classifier also needs to be used on unseen data. Suppose our new datapoint $\color{red}{\times}$ has to be classified by each one of the classifiers (each line is considered a classifier), then the datapoint will be classified as <font color="green">green class</font> if we use the cyan classifier, and as <font color="blue">blue class</font> if we use the yellow or magenta classifier.

In [None]:
# Draw three hand-picked candidate separating lines over the blob data, plus
# one new point (red cross) that the candidates classify differently.
x_values = np.linspace(-1, 4)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='winter')

# Each tuple is (slope w, intercept b, matplotlib colour code) for y = w*x + b.
for w, b, c in [(1, 0.65, 'c'), (0.5, 1.6, 'y'), (-0.2, 2.9, 'm')]:
    # Only the line style goes in the fmt string; the colour is given once via
    # `color=` (the former '-k' fmt clashed with it and triggered a warning).
    plt.plot(x_values, w*x_values+b, '-', color=c)

# The unseen datapoint to be classified.
plt.plot([0.6], [1.6], 'x', color='red', markeredgewidth=2, markersize=10)

plt.show()

Now, we will train a SVM using a linear kernel for the dataset presented above. All the data plotted as stars represents new data and SVM has not trained on this data. The line in the figure represents the decision boundary that the classifier has learnt during training.

[SVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html) and [LinearSVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC) packages from scikit-learn represent different implementations of the same algorithm. SVC is based on [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) and LinearSVC is based on [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) and it only supports linear kernel.




In [None]:
# Build a linear-kernel SVM and fit it on the blob data.
svc_model = SVC(kernel='linear')
svc_model.fit(X, y)

# Four extra, unlabelled points the classifier was not trained on.
test_samples = np.array([[0.6, 1.6], [3, 5], [1.5, 2.5], [1.5, 2.7]])

# Stack training and test points so they can be predicted and drawn together.
all_samples = np.vstack((X, test_samples))
predicted_results = svc_model.predict(all_samples)

figure(figsize=(8, 6))

# Draw the learnt decision regions; the test points are passed as X_highlight
# so they are marked on the plot.
plot_decision_regions(
    all_samples,
    predicted_results,
    clf=svc_model,
    colors="blue,green",
    legend=2,
    X_highlight=test_samples,
)
plt.show()

Now, we will compute the [confusion matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) which expects the parameters for true labels and predicted labels. The following values are computed:

- True Positives (TP): The class is positive and we have predicted correctly
- True Negatives (TN): The class is negative and we have predicted correctly
- False Positives (FP): The class is Negative, but we predicted Positive
- False Negatives (FN): The class is Positive, but we predicted Negative

![](http://drive.google.com/uc?export=view&id=1HGnPRgQYEO49Es6CzQ2pcoamAp7qbXfj)

Accuracy is computed as follows:

$Acc = \frac{TP + TN}{TP + TN + FP + FN}$

In our case we will have an accuracy of 100%, because all samples are predicted correctly. Remember that we are always interested in computing the accuracy on the test dataset as well, which is more important: the classifier has not seen the test data, so its guesses on unseen data are more relevant. This gives us an intuition of whether the classifier is able to classify new data. For the moment we will not compute it: firstly, the test samples are not labeled, and secondly, this is a toy dataset meant to illustrate the intuition behind SVM.

In [None]:
# Confusion matrix of the fitted model on its own training data
# (true labels first, predicted labels second).
train_predictions = svc_model.predict(X)
data = confusion_matrix(y, train_predictions)

plot_confusion_matrix(conf_mat=data)
plt.show()

# Non-Linear SVM

When data is not linearly separable, we have to find a mapping $\phi$ of the feature space. This mapping transforms the N-dimensional feature space into a higher-dimensional space. Basically, it adds a number of dimensions which depends on the number of dimensions you already have, in which the data is not linearly separable. The more dimensions there are, the more an explicit mapping suffers from the curse of dimensionality. Fortunately, training an SVM does not need to apply the mapping function $\phi$ explicitly; it only needs the dot product of $x$ and $x'$ computed in the high-dimensional space.

![](http://drive.google.com/uc?export=view&id=1DWyZhW0cO_hgdG2ctyifl4gg68jQTZzF)

## Kernels

Given a non-linear feature space mapping $\phi(x)$, the kernel function is given by:
<br>
$k(x,x') = \phi(x)^T\phi(x')$
<br>
If we use the identity mapping $\phi(x)=x$, then we obtain the linear kernel 
<br>
$k(x,x') = x^Tx'$

Kernels:
- define a similarity measure (check this [tutorial](https://dataaspirant.com/2015/04/11/five-most-popular-similarity-measures-implementation-in-python/) with Kung-Fu Panda 🐼 in order to understand what is a similarity measure)
- defined by an implicit mapping $\phi$
- is symmetric: $k(x, x')=k(x',x)$

Here are the formulas for the kernels according to the scikit-learn [implementation](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/svm/src/libsvm/svm.cpp):


- Linear <br>
$k(X, X') = X^T \cdot X'$
- Polynomial <br>
$k(X, X') = (\gamma X^T \cdot X' + c_0)^p$
- RBF <br>
$k(X,X') = e^{-\gamma * ||X-X'||^2}$
- Sigmoid <br>
 $k(X,X') = \tanh(\gamma X^T \cdot X' + c_0)$

 For more information, please check the [documentation](https://scikit-learn.org/stable/modules/metrics.html)


The next dataset is not linearly separable. There is no line that can separate these datapoints. We will try to apply SVM with a plain linear kernel to see what the SVM classifier will learn.

In [None]:
# Two noisy concentric circles: a dataset that no straight line can split.
X, y = make_circles(
    n_samples=900,
    factor=.3,
    noise=.1,
    random_state=0,
)

# X.T unpacks into the two feature columns; colour encodes the class label.
plt.scatter(*X.T, c=y, s=10, cmap='winter')
plt.show()

As we will see below, a linear SVM is not able to find a line that separates these datapoints. That's why we have to call on the help of kernel functions.

In [None]:
# Deliberately fit a linear-kernel SVM on the concentric-circles data to show
# what a straight decision boundary does on non-linearly-separable input.
svc_model = SVC(kernel='linear')

# train classifier
svc_model.fit(X, y)

# Predictions on the training points themselves (no separate test set here).
predicted_results = svc_model.predict(X)

figure(num=None, figsize=(8, 6))

# Colour the points by their TRUE labels: any point whose colour differs from
# the region it lies in is a misclassification. Colouring by the predictions
# would make every point match its region and hide the classifier's errors.
plot_decision_regions(X, y,
                      clf=svc_model, colors="blue,green",
                      legend=2)
plt.show()

In [None]:
# Same concentric-circles data, now with an RBF kernel so the decision
# boundary is no longer restricted to a straight line.
svc_model = SVC(kernel='rbf', gamma=0.9)

# train classifier
svc_model.fit(X, y)

# Predictions on the training points themselves (no separate test set here).
predicted_results = svc_model.predict(X)

figure(num=None, figsize=(8, 6))

# Colour the points by their TRUE labels so that any remaining
# misclassification is visible against the learnt decision regions.
plot_decision_regions(X, y,
                      clf=svc_model, colors="blue,green",
                      legend=2)
plt.show()

# Ex 1. XOR Dataset

Based on the XOR dataset, train a SVC classifier and check the decision boundaries learnt. Compute accuracy.

In [None]:
# XOR toy data: 300 standard-normal 2-D points; a point is labelled 1 when
# exactly one of its two coordinates is positive, otherwise 0.
rng = np.random.RandomState(0)
X = rng.randn(300, 2)

first_positive = X[:, 0] > 0
second_positive = X[:, 1] > 0
y = np.logical_xor(first_positive, second_positive).astype(int)

plt.scatter(*X.T, c=y, s=10, cmap='winter')
plt.show()

In [None]:
# Fit an RBF-kernel SVM on the XOR data.
svc_model = SVC(kernel='rbf')

# train classifier
svc_model.fit(X, y)

# Predictions on the training points themselves (no separate test set here).
predicted_results = svc_model.predict(X)

figure(num=None, figsize=(8, 6))

# Colour the points by their TRUE labels so misclassified points stand out
# against the learnt decision regions.
plot_decision_regions(X, y,
                      clf=svc_model, colors="blue,green",
                      legend=2)
plt.show()

# scikit-learn lays the binary confusion matrix out as
#     [[TN, FP],
#      [FN, TP]]
# (rows = true class, columns = predicted class), so ravel() yields the four
# counts in exactly this order. The previous indexing mislabelled the cells,
# which made `acc` the fraction of actual positives rather than the accuracy.
data = confusion_matrix(y, predicted_results)
TN, FP, FN, TP = data.ravel()

acc = (TP + TN) / (TP + TN + FP + FN)
print(acc)

# Binary Classifier to Multiclass Classifier

This section discusses strategies for using binary classifiers in problems where there are more than two classes.

Observations:
- OVO computes $\frac{N*(N-1)}{2}$ classifiers while OVR computes $N$ classifiers
- OVR is trained on imbalanced data: even if classes are balanced, the number of negative samples is larger than the number of positive samples
- when using OVR SVMs for multiclass classification, tie cases frequently occur

## OVR (One-vs-Rest)

In this strategy we train a classifier for each class, with the samples of the class labeled as positive observations and all other samples labeled as negative observations.

- samples: $X=\{x_1, x_2, \cdots, x_n\}$
- classes: $y = \{y_1, y_2, \cdots, y_k\}$

OVR(model, X, y) \\
- FOREACH $y_i$ in $y$
  - train classifier $f_i$ with $y_i$ as positive class, and all the other classes merged in the negative class
  - predict class
- choose the classifier which has the positive value


## OVO (One-vs-One)

In this strategy we train a classifier for each two classes, with the samples from one class labeled as positive observations and all samples from the other class labeled as negative observations.

- samples: $X=\{x_1, x_2, \cdots, x_n\}$
- classes: $y = \{y_1, y_2, \cdots, y_k\}$

OVO(model, X, y) \\
- FOREACH pair $y_i$, $y_j$ in $y$ with $i < j$ (each unordered pair once, giving $\frac{k*(k-1)}{2}$ classifiers)
  - train classifier $f_{i,j}$ with $y_i$ as positive class, and $y_j$ as negative class

- a voting schema is applied, where each trained classifier $f_{i,j}$ is applied on the new data and the class which has the most votes will win.



# Ex 2. Multiclass Classifier

Train a SVM with both OVO and OVR methods based on the below dataset. Compute confusion matrix for each strategy and check decision boundaries.

In [None]:
# Three-class toy dataset: 300 points drawn from three Gaussian clusters.
X, y = make_blobs(
    n_samples=300,
    centers=3,
    cluster_std=0.8,
    random_state=0,
)

# X.T unpacks into the two feature columns; colour encodes the class label.
plt.scatter(*X.T, c=y, cmap='winter')
plt.show()

In [None]:
# Compare the two multiclass strategies on the three-blob dataset.
#
# SVC's `decision_function_shape` argument only changes the shape of the
# decision_function output; internally SVC always trains one-vs-one, so
# switching that flag gives the same fitted model and the same predictions.
# To really train one-vs-rest and one-vs-one models, the binary SVC is wrapped
# in scikit-learn's explicit multiclass meta-estimators (each wrapper clones
# the base estimator, so sharing `base_svc` is safe).
base_svc = SVC(kernel='rbf', gamma=0.9)
strategies = [
    ('OVR', OneVsRestClassifier(base_svc)),
    ('OVO', OneVsOneClassifier(base_svc)),
]

for strategy_name, svc_model in strategies:
    # train classifier
    svc_model.fit(X, y)

    # Predictions on the training points themselves (no separate test set).
    predicted_results = svc_model.predict(X)

    figure(num=None, figsize=(8, 6))

    # Colour the points by their TRUE labels so misclassified points stand
    # out against the learnt decision regions.
    plot_decision_regions(X, y,
                          clf=svc_model, colors="red,blue,green",
                          legend=3)
    plt.title(strategy_name + ' decision regions')
    plt.show()

    # Confusion matrix for this strategy (true labels vs. predictions).
    data = confusion_matrix(y, predicted_results)
    plot_confusion_matrix(data)
    plt.title(strategy_name + ' confusion matrix')
    plt.show()

# Ex 3. Breast Cancer Dataset

Based on [Breast Cancer Dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html), predict if a tumor is malignant or benign using different markers (attributes) which are mentioned [here](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29). Compare different models:
- [SVM](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html) model  
- [Decision Tree](https://scikit-learn.org/stable/modules/tree.html) model.

Check:
- accuracy on train vs test dataset
- confusion matrix (because classes might have an unbalanced number of samples)
- choose a subset of features/attributes and check if accuracy improves or not

Before you start, please analyze the dataset.



In [None]:
# Load the breast-cancer data as plain (features, target) arrays.
# Per the scikit-learn dataset description, target 0 is malignant and
# target 1 is benign.
X, y = load_breast_cancer(return_X_y=True)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=20)

# ---------------------------------------------------------------- SVM model
svc_model = SVC(kernel='rbf', gamma=0.001)

# train classifier
svc_model.fit(X_train, y_train)

# Evaluate on the held-out test split.
predicted_results = svc_model.predict(X_test)
cm = confusion_matrix(y_test, predicted_results)
plot_confusion_matrix(cm)
plt.show()

# Same matrix as a labelled table (rows = true class, columns = predicted).
confusion = pd.DataFrame(cm, index=['is_cancer', 'is_healthy'], columns=['predicted_cancer', 'predicted_healthy'])
print(confusion)

# Train vs. test accuracy: a large gap between the two indicates overfitting.
print("SVM train accuracy:", metrics.accuracy_score(y_train, svc_model.predict(X_train)))
print("SVM test accuracy:", metrics.accuracy_score(y_test, predicted_results))

# ------------------------------------------------------ Decision tree model
# Seeded so that the fitted tree (and therefore the reported numbers) are
# reproducible across runs.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

data = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(data)
plt.show()

print("Decision tree train accuracy:", metrics.accuracy_score(y_train, clf.predict(X_train)))
# Decision tree accuracy on the held-out test split.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))