In [None]:
'''This notebook implements the Support Vector Machine classifier on
    1. The linear dataset
    2. The standardized linear dataset
    3. The standardized expanded dataset'''

__author__ = 'Anjana Niranjan'
__email__ = 'anjanani@usc.edu'

**Support vector machine**
Thanks to the sklearn website for examples on the functions used in this code.
Thanks to https://stackoverflow.com/questions/51194627/python-naive-bayes-with-cross-validation-using-gaussiannb-classifier for helping with cross-validation.

In [None]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
#Loading the linear data
# The data directory is declared once so the notebook can be pointed at a
# different location with a single edit (the path looks like a Google Drive
# mount inside Colab).
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/mpr'
train = pd.read_csv(f'{DATA_DIR}/linearTrain.csv')
test = pd.read_csv(f'{DATA_DIR}/linearTest.csv')

In [None]:
# Split each frame into features (columns 1..13) and label (column 14).
# Column 0 is skipped -- presumably an index/ID column; TODO confirm.
X_train, y_train = train.iloc[:, 1:14], train.iloc[:, 14]
X_test, y_test = test.iloc[:, 1:14], test.iloc[:, 14]

Implementing SVM classifier on linear dataset with grid search and cross validation

In [None]:
# Exhaustive search over kernel x decision-function shape (4 x 2 = 8
# candidates), each evaluated with 9 contiguous, unshuffled folds.
# NOTE(review): in the recorded results the 'ovo' and 'ovr' candidates score
# identically for every kernel, so that axis appears to only double the
# search cost -- confirm before dropping it.
clf = SVC()
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'decision_function_shape': ['ovo', 'ovr'],
}
kf = KFold(n_splits=9, shuffle=False, random_state=None)
gs = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=kf,
    return_train_score=True,
)

gs.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=9, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'decision_function_shape': ['ovo', 'ovr'],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [None]:
# Raw grid-search results: a dict of per-candidate arrays (fit/score times,
# mean train/test scores and the parameter values tried), one entry per
# kernel / decision_function_shape combination.
gs.cv_results_

{'mean_fit_time': array([7.54063265, 0.3682033 , 0.94930662, 9.92085062, 7.17450107,
        0.36588979, 0.94860217, 9.93195767]),
 'mean_score_time': array([0.01338257, 0.05031949, 0.15229355, 0.96837923, 0.01335475,
        0.05115059, 0.15113907, 0.96926472]),
 'mean_test_score': array([0.87333333, 0.90733333, 0.91325926, 0.10666667, 0.87333333,
        0.90733333, 0.91325926, 0.10666667]),
 'mean_train_score': array([0.99553704, 0.99067593, 0.98347222, 0.12643519, 0.99553704,
        0.99067593, 0.98347222, 0.12643519]),
 'param_decision_function_shape': masked_array(data=['ovo', 'ovo', 'ovo', 'ovo', 'ovr', 'ovr', 'ovr', 'ovr'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'poly', 'rbf', 'sigmoid', 'linear', 'poly',
                    'rbf', 'sigmoid'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',


In [None]:
# Parameter combination that achieved the highest mean cross-validated score.
gs.best_params_

{'decision_function_shape': 'ovo', 'kernel': 'rbf'}

In [None]:
# Mean cross-validated score of the best candidate (the maximum of
# mean_test_score in cv_results_).
gs.best_score_

0.9132592592592592

In [None]:
#Training the classifier with best parameters
# The hyper-parameters are read from the grid search instead of being
# re-typed, so this cell stays correct if the grid or the data changes.
bestclf = SVC(**gs.best_params_)
bestclf.fit(X_train, y_train)

# Training-set accuracy and confusion matrix; compare against the
# cross-validated score to gauge over-fitting.
tr_p = bestclf.predict(X_train)
print(accuracy_score(y_train, tr_p))
confusion_matrix(y_train, tr_p)

0.9828888888888889


array([[2778,    0,    0,    0,    6],
       [  10, 2455,    1,   26,   90],
       [   1,    0, 2640,    3,    5],
       [   0,   16,   31, 2422,    7],
       [   0,   25,    7,    3, 2974]])

In [None]:
# Held-out evaluation: predict on the test set, print the accuracy and
# display the confusion matrix (scikit-learn convention: rows are true
# classes, columns are predicted classes).
predictions = bestclf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=predictions))
confusion_matrix(y_true=y_test, y_pred=predictions)

0.9133608227878098


array([[4325,   48,   67,    0,   26],
       [  31, 4269,   87,    0,   15],
       [   3,    0, 4586,   82,  108],
       [   0,  170,  262, 2681,  801],
       [   0,   26,   50,   52, 3410]])

Implementing the SVM classifier on the standardized linear dataset

In [None]:
# Standardize the features with StandardScaler. The scaling statistics are
# learned from the training split only and then re-used for the test split,
# so no test information leaks into the fit.
from sklearn import preprocessing
scalar = preprocessing.StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

In [None]:
# Same 8-candidate search as for the raw data (4 kernels x 2
# decision-function shapes, 9 contiguous unshuffled folds), now run on the
# standardized features.
clf = SVC()
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'decision_function_shape': ['ovo', 'ovr'],
}
kf = KFold(n_splits=9, shuffle=False, random_state=None)
gs = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=kf,
    return_train_score=True,
)

gs.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=9, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'decision_function_shape': ['ovo', 'ovr'],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [None]:
# Inspect the raw grid-search results for the standardized data: a dict of
# per-candidate arrays (timings, mean train/test scores, parameter values).
gs.cv_results_

{'mean_fit_time': array([0.43392913, 0.37296375, 0.34329481, 1.88506256, 0.43409387,
        0.36958713, 0.34252932, 1.88494672]),
 'mean_score_time': array([0.01931037, 0.03832311, 0.04151016, 0.30103138, 0.0191767 ,
        0.03842727, 0.04132848, 0.30008592]),
 'mean_test_score': array([0.86644444, 0.87837037, 0.8837037 , 0.698     , 0.86644444,
        0.87837037, 0.8837037 , 0.698     ]),
 'mean_train_score': array([0.99380556, 0.99701852, 0.99881481, 0.73535185, 0.99380556,
        0.99701852, 0.99881481, 0.73535185]),
 'param_decision_function_shape': masked_array(data=['ovo', 'ovo', 'ovo', 'ovo', 'ovr', 'ovr', 'ovr', 'ovr'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'poly', 'rbf', 'sigmoid', 'linear', 'poly',
                    'rbf', 'sigmoid'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',


In [None]:
# Parameter combination that achieved the highest mean cross-validated score
# on the standardized data.
gs.best_params_

{'decision_function_shape': 'ovo', 'kernel': 'rbf'}

In [None]:
# Mean cross-validated score of the best candidate (the maximum of
# mean_test_score in cv_results_).
gs.best_score_

0.8837037037037037

In [None]:
#Training the model with the best parameters
# The hyper-parameters are read from the grid search instead of being
# re-typed, so this cell stays correct if the grid or the data changes.
bestclf = SVC(**gs.best_params_)
bestclf.fit(X_train, y_train)

# Training-set accuracy and confusion matrix; compare against the
# cross-validated score to gauge over-fitting.
tr_p = bestclf.predict(X_train)
print(accuracy_score(y_train, tr_p))
confusion_matrix(y_train, tr_p)

0.9987407407407407


array([[2784,    0,    0,    0,    0],
       [   8, 2569,    2,    3,    0],
       [   1,    0, 2647,    0,    1],
       [   0,    0,    0, 2476,    0],
       [   0,    1,    0,    1, 3007]])

In [None]:
# Held-out evaluation on the standardized test set: print the accuracy and
# display the confusion matrix (scikit-learn convention: rows are true
# classes, columns are predicted classes).
predictions = bestclf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=predictions))
confusion_matrix(y_true=y_test, y_pred=predictions)

0.9434096402673112


array([[4338,   48,    9,    2,   69],
       [  37, 4129,   54,    0,  182],
       [  63,    0, 4215,  266,  235],
       [   0,    0,   56, 3858,    0],
       [   0,   25,    8,  140, 3365]])

Implementing the SVM classifier on the standardized expanded dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import SGDClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

In [None]:
#Loading the expanded dataset
# The data directory is declared once so the notebook can be pointed at a
# different location with a single edit (the path looks like a Google Drive
# mount inside Colab).
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/mpr'
train = pd.read_csv(f'{DATA_DIR}/linearTrainexpanded.csv')
test = pd.read_csv(f'{DATA_DIR}/linearTestexpanded.csv')

In [None]:
# Split each expanded frame into features (columns 1..59) and label
# (column 60). Column 0 is skipped -- presumably an index/ID column; TODO confirm.
X_train, y_train = train.iloc[:, 1:60], train.iloc[:, 60]
X_test, y_test = test.iloc[:, 1:60], test.iloc[:, 60]

In [None]:
# Standardize the expanded features with StandardScaler. The scaling
# statistics are learned from the training split only and re-used for the
# test split; the scaled results are wrapped back into DataFrames.
from sklearn import preprocessing
scalar = preprocessing.StandardScaler()
X_train = pd.DataFrame(scalar.fit_transform(X_train))
X_test = pd.DataFrame(scalar.transform(X_test))

In [None]:
# Same 8-candidate search (4 kernels x 2 decision-function shapes, 9
# contiguous unshuffled folds), now run on the standardized expanded features.
clf = SVC()
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'decision_function_shape': ['ovo', 'ovr'],
}
kf = KFold(n_splits=9, shuffle=False, random_state=None)
gs = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=kf,
    return_train_score=True,
)

gs.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=9, random_state=None, shuffle=False),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'decision_function_shape': ['ovo', 'ovr'],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [None]:
# Inspect the raw grid-search results for the expanded data: a dict of
# per-candidate arrays (timings, mean train/test scores, parameter values).
gs.cv_results_

{'mean_fit_time': array([1.1539509 , 2.66585591, 2.6411287 , 3.02902966, 1.1366854 ,
        2.65769148, 2.6411265 , 3.02940109]),
 'mean_score_time': array([0.05387312, 0.31043707, 0.38738865, 0.53361943, 0.05340711,
        0.31146134, 0.38644756, 0.53394341]),
 'mean_test_score': array([0.86288889, 0.86696296, 0.88874074, 0.70814815, 0.86288889,
        0.86696296, 0.88874074, 0.70814815]),
 'mean_train_score': array([0.998     , 0.99808333, 0.99812037, 0.85249074, 0.998     ,
        0.99808333, 0.99812037, 0.85249074]),
 'param_decision_function_shape': masked_array(data=['ovo', 'ovo', 'ovo', 'ovo', 'ovr', 'ovr', 'ovr', 'ovr'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['linear', 'poly', 'rbf', 'sigmoid', 'linear', 'poly',
                    'rbf', 'sigmoid'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',


In [None]:
# Best estimator found by the search; because the search was built with
# refit=True it has already been refit on the whole training set.
gs.best_estimator_

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Parameter combination that achieved the highest mean cross-validated score
# on the expanded data.
gs.best_params_

{'decision_function_shape': 'ovo', 'kernel': 'rbf'}

In [None]:
# Mean cross-validated score of the best candidate (the maximum of
# mean_test_score in cv_results_).
gs.best_score_

0.8887407407407407

In [None]:
#Training the model with the best parameters
# The hyper-parameters are read from the grid search instead of being
# re-typed, so this cell stays correct if the grid or the data changes.
bestclf = SVC(**gs.best_params_)
bestclf.fit(X_train, y_train)

# Training-set accuracy and confusion matrix; compare against the
# cross-validated score to gauge over-fitting.
tr_p = bestclf.predict(X_train)
print(accuracy_score(y_train, tr_p))
confusion_matrix(y_train, tr_p)

0.9980740740740741


array([[2784,    0,    0,    0,    0],
       [   8, 2567,    4,    2,    1],
       [   1,    0, 2643,    0,    5],
       [   0,    0,    4, 2472,    0],
       [   0,    0,    0,    1, 3008]])

In [None]:
# Held-out evaluation on the standardized expanded test set: print the
# accuracy and display the confusion matrix (scikit-learn convention: rows
# are true classes, columns are predicted classes).
predictions = bestclf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=predictions))
confusion_matrix(y_true=y_test, y_pred=predictions)

0.9062040855016825


array([[4361,   48,   10,    0,   47],
       [  84, 4188,   82,    0,   48],
       [ 348,    0, 3749,   99,  583],
       [   0,    0,    1, 3401,  512],
       [   2,   36,    7,   72, 3421]])