# Support Vector Machine - SVM

In [27]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_20newsgroups

# NLP
from sklearn.feature_extraction.text import TfidfVectorizer

# SVM --> Support Vector Classification
from sklearn.svm import SVC

# pipeline
from sklearn.pipeline import Pipeline

# Numpy
import numpy as np

In [3]:


# The four newsgroups used as target classes in this exercise.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Options shared by both splits: same category filter, shuffled with a fixed seed.
newsgroup_options = dict(categories=categories, shuffle=True, random_state=42)

twenty_train = fetch_20newsgroups(subset='train', **newsgroup_options)
twenty_test = fetch_20newsgroups(subset='test', **newsgroup_options)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


### Usando TfidfVectorizer se crea una matriz con la relevancia de las palabras en relación a los diferentes documentos del dataset de entrenamiento

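**use_idf : boolean (default=True): Enable inverse-document-frequency reweighting.** With `use_idf=False` (as in the next cell) the IDF factor is skipped, so the matrix holds only normalized term frequencies.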

In [5]:
# Vectorise the training documents with IDF reweighting switched off,
# then show the (documents, vocabulary terms) size of the resulting matrix.
tf_vectorizer = TfidfVectorizer(use_idf=False)
X_train_tf = tf_vectorizer.fit_transform(twenty_train.data)
X_train_tf.shape

(2257, 35788)

In [21]:
# Inspect the element type stored in the feature matrix.
X_train_tf.dtype

dtype('float64')

**El objetivo de esta clasificación es usar el dataset de noticias de diferentes categorías (['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']),
obtener la relevancia de las palabras y clasificar los documentos por categoría.
Se usa un Pipeline para entrenar el Tfidf (Term Frequency Inverse Document Frequency) y luego realizar la predicción con SVC (Support Vector Classifier).**

### Pipeline: 
**Secuencialmente aplica transformaciones y finalmente un estimador. Los pasos intermedios deben implementar los métodos _fit_ y _transform_, el estimador final solo requiere implementar el método _fit_.**

In [29]:
# Two named steps: TF-IDF text vectorisation feeding a linear-kernel SVC.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('classifier', SVC(kernel='linear')),
]
text_clf = Pipeline(steps=pipeline_steps)

In [30]:
# Training: fit() trains the whole pipeline, not just one step. The 'tfidf' step is
# fitted on the raw training text and the 'classifier' step is then fitted on the
# vectorised output. fit() returns the pipeline itself, so the test documents can be
# labelled in the same expression.
predicted = text_clf.fit(twenty_train.data, twenty_train.target).predict(twenty_test.data)

**Calcular accuracy:**

In [31]:
# Accuracy: fraction of test documents whose predicted label equals the true label.
(predicted == twenty_test.target).mean()

0.9207723035952063

# Implementing SVM - Exercise 2

**Reference:** https://stackabuse.com/implementing-svm-and-kernel-svm-with-pythons-scikit-learn/

**Dataset:** https://drive.google.com/file/d/13nw-uRXPY8XIZQxKRNZ3yYlho-CYm_Qt/view

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

**load dataset**

In [37]:
# Read the bill-authentication CSV from the notebook's working directory
# and preview its first three rows.
bill_csv_path = './bill_authentication.csv'
data = pd.read_csv(bill_csv_path)
data.head(n=3)

Unnamed: 0,Variance,Skewness,Curtosis,Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0


In [36]:
# (rows, columns) of the loaded frame.
data.shape

(1372, 5)

### Data PreProcessing

In [38]:
# Separate the 'Class' label column from the remaining feature columns.
target_column = 'Class'
y = data[target_column]
X = data.drop(columns=[target_column])

In [40]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

### Training the model

In [42]:
# Build and fit a linear-kernel SVC in one expression (fit returns the estimator),
# then echo the fitted estimator so its parameters are displayed.
svc_classifier = SVC(kernel='linear').fit(X_train, y_train)
svc_classifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

### Making Predictions

In [45]:
# Label the held-out rows with the fitted linear SVC.
y_pred = svc_classifier.predict(X_test)

### Evaluating the Algorithm

In [46]:
# Confusion matrix for the linear SVC: rows are the true classes and columns the
# predicted classes, so off-diagonal counts are misclassified test rows.
print(confusion_matrix(y_test, y_pred))

[[146   2]
 [  2 125]]


In [47]:
# Per-class precision, recall, F1 and support for the linear SVC predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       148
           1       0.98      0.98      0.98       127

   micro avg       0.99      0.99      0.99       275
   macro avg       0.99      0.99      0.99       275
weighted avg       0.99      0.99      0.99       275



# Kernel SVM

**use the famous _iris_ dataset to predict the category to which a plant belongs based on four attributes: _sepal-width, sepal-length, petal-width and petal-length_.**

In [48]:
# Load the iris data from the UCI repository. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
colnames = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'Class',
]
iris_data = pd.read_csv(url, header=None, names=colnames)

In [49]:
# (rows, columns) of the iris frame.
iris_data.shape

(150, 5)

In [50]:
# Preview the first three rows to check the column names and values.
iris_data.head(3)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa


### Data PreProcessing

In [51]:
# Separate the 'Class' species label from the four measurement columns.
# Note: this rebinds X and y, which previously held the bill-authentication data.
label_column = 'Class'
y = iris_data[label_column]
X = iris_data.drop(columns=[label_column])

In [52]:
# 80/20 train/test split of the iris data with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

## Training the Algorithm

- In the case of the simple SVM we used the **_linear_ kernel**
- To train **non-linear SVM** we can use:
    - Gaussian
    - Polynomial
    - Sigmoid
    - Computable Kernel

### Polynomial Kernel

In [54]:
# SVC with a degree-8 polynomial kernel.
svc_clf = SVC(kernel='poly', degree=8)

# Train on the training split, then label the held-out samples.
y_predict = svc_clf.fit(X_train, y_train).predict(X_test)

# Evaluate: confusion matrix first, then the per-class report.
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_predict))

[[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      0.89      0.94         9
 Iris-virginica       0.92      1.00      0.96        11

      micro avg       0.97      0.97      0.97        30
      macro avg       0.97      0.96      0.97        30
   weighted avg       0.97      0.97      0.97        30



### Gaussian Kernel

In [56]:
# SVC with the Gaussian (RBF) kernel, fitted on the training split.
svc_clf = SVC(kernel='rbf').fit(X_train, y_train)

# Label the held-out samples.
y_predict = svc_clf.predict(X_test)

# Evaluate: confusion matrix and per-class report, one after the other.
print(
    confusion_matrix(y_test, y_predict),
    classification_report(y_test, y_predict),
    sep='\n',
)

[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00         9
 Iris-virginica       1.00      1.00      1.00        11

      micro avg       1.00      1.00      1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30



### Sigmoid Kernel

In [57]:
# SVC with the sigmoid kernel.
svc_clf = SVC(kernel='sigmoid')

# Train on the training split and label the held-out samples.
svc_clf.fit(X_train, y_train)
y_predict = svc_clf.predict(X_test)

# Evaluate: build both summaries, then print them in order.
evaluation = [
    confusion_matrix(y_test, y_predict),
    classification_report(y_test, y_predict),
]
print(*evaluation, sep='\n')

[[ 0 10  0]
 [ 0  9  0]
 [ 0 11  0]]
                 precision    recall  f1-score   support

    Iris-setosa       0.00      0.00      0.00        10
Iris-versicolor       0.30      1.00      0.46         9
 Iris-virginica       0.00      0.00      0.00        11

      micro avg       0.30      0.30      0.30        30
      macro avg       0.10      0.33      0.15        30
   weighted avg       0.09      0.30      0.14        30



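- **_Sigmoid kernel_** performed the worst: it predicted every test sample as _Iris-versicolor_. The usual explanation is that the **_sigmoid_** function squashes its input towards two extreme values (1 or 0),
which makes it more suitable for binary classification, whereas in this case we had 3 output classes. Note that the features were not scaled here, which probably also hurts this kernel — TODO confirm by standardizing the inputs.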

- **Gaussian** achieved a perfect 100%

- **Polynomial** misclassified one instance (one _Iris-versicolor_ sample was predicted as _Iris-virginica_)

**However, there is no hard and fast rule as to which kernel performs best in every scenario. It is all about testing all the kernels and selecting the one with the best results on your test dataset.**