# SVM Classifier

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

## Reading data

In [2]:
data = pd.read_csv(r".\small_ohe.csv")
data.head()

Unnamed: 0,age,default,housing,loan,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,...,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed
0,0.171429,1,-1,1,0.029412,1.0,0.0,0,0.333333,0.26968,...,0,1,0,0,0,1,0,0,0,0
1,0.3,1,1,1,0.088235,1.0,0.0,0,0.9375,0.698753,...,0,1,0,0,0,1,0,0,0,0
2,0.1,1,-1,1,0.0,1.0,0.0,0,1.0,0.882307,...,0,0,0,0,0,0,0,0,0,1
3,0.285714,1,0,0,0.058824,1.0,0.0,0,1.0,0.882307,...,0,0,0,0,0,1,0,0,0,0
4,0.414286,1,-1,1,0.0,1.0,0.0,0,0.6875,0.389322,...,0,0,1,0,0,0,1,0,0,0


In [3]:
# raw data
X = data.drop('y', axis=1).values
y = data['y'].values

### Test Train Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

## Choosing the best parameters for SVM classifier based on 5-fold Cross Validation score

In [5]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [0.1], 'C': [1]},
                    {'kernel': ['linear'], 'C': [1]}]

In [6]:
clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='precision')
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.1], 'C': [1]}, {'kernel': ['linear'], 'C': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='precision', verbose=0)

In [7]:
print('The best model is: ', clf.best_params_)
print('This model produces a mean cross-validated score (precision) of', clf.best_score_)

The best model is:  {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
This model produces a mean cross-validated score (precision) of 0.8302833451252198


## Testing

In [8]:
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
y_true, y_pred = y_test, clf.predict(X_test)
pre1 = precision_score(y_true, y_pred)
rec1 = recall_score(y_true, y_pred)
acc1 = accuracy_score(y_true, y_pred)
f1_1 = f1_score(y_true, y_pred)
print('precision on the evaluation set: ', pre1)
print('recall on the evaluation set: ', rec1)
print('accuracy on the evaluation set: ', acc1)

precision on the evaluation set:  0.8480603448275862
recall on the evaluation set:  0.7266851338873499
accuracy on the evaluation set:  0.7998167659184608


### -------------------------------------------------------------------------------------------------------------------------------------------------------

## Reducing Features using PCA

In [9]:
from sklearn.decomposition import PCA
# raw data
X = data.drop('y', axis=1).values
y = data['y'].values
# split, random_state is used for repeatable results, you should remove it if you are running your own code.
pca = PCA(n_components=0.9)
x_pca = pca.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(x_pca, y, test_size=0.30, random_state=42)
x_pca.shape

(7276, 24)

## Observation : To capture 90% variance we need only 24 Features out of 56

In [10]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [0.1],
                     'C': [1]},
                    {'kernel': ['linear'], 'C': [1]}]

## Training after appliying PCA

In [11]:
clf = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='precision')
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.1], 'C': [1]}, {'kernel': ['linear'], 'C': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='precision', verbose=0)

In [12]:
print('The best model is: ', clf.best_params_)
print('This model produces a mean cross-validated score (precision) of', clf.best_score_)

The best model is:  {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
This model produces a mean cross-validated score (precision) of 0.7998243179740869


## Testing after appliying PCA

In [13]:
y_true, y_pred = y_test, clf.predict(X_test)
pre2 = precision_score(y_true, y_pred)
rec2 = recall_score(y_true, y_pred)
acc2 = accuracy_score(y_true, y_pred)
f1_2 = f1_score(y_true, y_pred)
print('precision on the evaluation set: ', pre2)
print('recall on the evaluation set: ', rec2)
print('accuracy on the evaluation set: ', acc2)

precision on the evaluation set:  0.8419889502762431
recall on the evaluation set:  0.703601108033241
accuracy on the evaluation set:  0.7874484654145671


### ----------------------------------------------------------------------------------------------------------------------------------------------------

## Comparing the results

In [14]:

table = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Score Without PCA': [acc1, pre1, rec1, f1_1],
    'Score With PCA': [acc2, pre2, rec2, f1_2]
    })
table

Unnamed: 0,Metric,Score Without PCA,Score With PCA
0,Accuracy,0.799817,0.787448
1,Precision,0.84806,0.841989
2,Recall,0.726685,0.703601
3,F1 Score,0.782695,0.7666


## Observation : We can get the almost same accuracy using just 24 Features obtained by using PCA instead of using all the 56 Features