<h1>SVM Classification</h1>
<hr>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=FutureWarning)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

import os
cwd = os.getcwd()

# importing dataset
pdTestFeatures  = pd.read_csv( os.path.join(cwd, "data", "test-features.csv"),  header=None)
pdTestLabels    = pd.read_csv( os.path.join(cwd, "data", "test-labels.csv"),    header=None)
pdTrainFeatures = pd.read_csv( os.path.join(cwd, "data", "train-features.csv"), header=None)
pdTrainLabels   = pd.read_csv( os.path.join(cwd, "data", "train-labels.csv"),   header=None)
    
# labeling as 0 or 1 the dataset
pdTestLabels[ pdTestLabels[0] <  90 ] = 0
pdTestLabels[ pdTestLabels[0] >= 90 ] = 1
pdTrainLabels[ pdTrainLabels[0] <  90 ] = 0
pdTrainLabels[ pdTrainLabels[0] >= 90 ] = 1
    
# dataset 
npTestFeatures  = pdTestFeatures.values
npTestLabels    = pdTestLabels.values
npTrainFeatures = pdTrainFeatures.values
npTrainLabels   = pdTrainLabels.values
    
# properties
testSampleCount  = npTestFeatures.shape[0]
trainSampleCount = npTrainFeatures.shape[0]
featureCount = npTestFeatures.shape[1]

In [2]:
# printing test results for SVM  soft margin.
def printTestResultsForSoftMargin(npTrainFeatures, npTrainLabels, npTestFeatures, npTestLabels, optimalC):
    from sklearn.svm import SVC
    svmSoftMarginModel = SVC(random_state=1, C=optimalC)
    svmSoftMarginModel.fit(trainFeatures, trainLabels)
    predictions = svmSoftMarginModel.predict(npTestFeatures).reshape(-1,1)
 
    truePositives  = np.sum( np.logical_and( npTestLabels, predictions) )
    falsePositives = np.sum( np.logical_and( np.logical_not(npTestLabels), predictions) )
    falseNegatives  = np.sum( np.logical_and( npTestLabels, np.logical_not(predictions)) )
    trueNegatives = np.sum( np.logical_and( np.logical_not(npTestLabels), np.logical_not(predictions)) )

    acurracy = ((truePositives + trueNegatives) / testSampleCount)
    precision = truePositives / (truePositives + falsePositives) 
    recall = truePositives / (truePositives + falseNegatives)
    negativePredictiveValue = trueNegatives / (falseNegatives + trueNegatives)
    falsePositiveRate = falsePositives / (falsePositives + trueNegatives)
    falseDiscoveryRate = falsePositives / (falsePositives + truePositives)
    f1Score = (2 * precision * recall) / (precision + recall)
    f2Score = (5 * precision * recall) / (4 * precision + recall)
    
    print("C-Value " + str(optimalC) + ": ")
    print("-----------------------------------------------------")
    print("True Positives: "  + str(truePositives))
    print("False Positives: " + str(falsePositives))
    print("False Negatives: " + str(falseNegatives))
    print("True Negatives: " + str(trueNegatives))
    print("Acurracy: " + str(acurracy))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("Negative Predictive Value: " + str(negativePredictiveValue))
    print("False Positive Rate: " + str(falsePositiveRate))
    print("False Discovery Rate: " + str(falseDiscoveryRate))
    print("F1 Score: " + str(f1Score))
    print("F2 Score: " + str(f2Score))
    print()

In [3]:
# printing test results for SVM hard margin.
def printTestResultsForHardMargin(npTrainFeatures, npTrainLabels, npTestFeatures, npTestLabels, optimalGamma):
    from sklearn.svm import SVC
    svmHardMarginModel = SVC(random_state=1, gamma=optimalGamma)
    svmHardMarginModel.fit(trainFeatures, trainLabels)
    predictions = svmHardMarginModel.predict(npTestFeatures).reshape(-1,1)
    
    truePositives  = np.sum( np.logical_and( npTestLabels, predictions) )
    falsePositives = np.sum( np.logical_and( np.logical_not(npTestLabels), predictions) )
    falseNegatives  = np.sum( np.logical_and( npTestLabels, np.logical_not(predictions)) )
    trueNegatives = np.sum( np.logical_and( np.logical_not(npTestLabels), np.logical_not(predictions)) )
    
    acurracy = ((truePositives + trueNegatives) / testSampleCount)
    precision = truePositives / (truePositives + falsePositives) 
    recall = truePositives / (truePositives + falseNegatives)
    negativePredictiveValue = trueNegatives / (falseNegatives + trueNegatives)
    falsePositiveRate = falsePositives / (falsePositives + trueNegatives)
    falseDiscoveryRate = falsePositives / (falsePositives + truePositives)
    f1Score = (2 * precision * recall) / (precision + recall)
    f2Score = (5 * precision * recall) / (4 * precision + recall)
    
    print("Gamma-Value " + str(optimalGamma) + ": ")
    print("-----------------------------------------------------")
    print("True Positives: "  + str(truePositives))
    print("False Positives: " + str(falsePositives))
    print("False Negatives: " + str(falseNegatives))
    print("True Negatives: " + str(trueNegatives))
    print("Acurracy: " + str(acurracy))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("Negative Predictive Value: " + str(negativePredictiveValue))
    print("False Positive Rate: " + str(falsePositiveRate))
    print("False Discovery Rate: " + str(falseDiscoveryRate))
    print("F1 Score: " + str(f1Score))
    print("F2 Score: " + str(f2Score))
    print()

In [4]:
# printing results of micro and macro averaging for SVM soft margin.
def printMicroMacroResultsSoft(npTrainFeatures, npTrainLabels, npTestFeatures, npTestLabels):
    
    cValues = np.array([0.001, 0.01, 0.1, 10, 100])
    truePositiveArr  = np.zeros(5)
    falsePositiveArr = np.zeros(5)
    falseNegativeArr = np.zeros(5)
    trueNegativeArr  = np.zeros(5)
    
    for i in range(cValues.shape[0]):
        from sklearn.svm import SVC
        svmSoftMarginModel = SVC(random_state=1, C=cValues[i])
        svmSoftMarginModel.fit(trainFeatures, trainLabels)
        predictions = svmSoftMarginModel.predict(npTestFeatures).reshape(-1,1)
 
        truePositives  = np.sum( np.logical_and( npTestLabels, predictions) )
        falsePositives = np.sum( np.logical_and( np.logical_not(npTestLabels), predictions) )
        falseNegatives  = np.sum( np.logical_and( npTestLabels, np.logical_not(predictions)) )
        trueNegatives = np.sum( np.logical_and( np.logical_not(npTestLabels), np.logical_not(predictions)) )
        
        # adding one to avoid getting nan values 
        truePositiveArr[i] = truePositives + 1
        falsePositiveArr[i] = falsePositives + 1
        falseNegativeArr[i] = falseNegatives + 1
        trueNegativeArr[i] = trueNegatives + 1

    precision = truePositiveArr / (truePositiveArr + falsePositiveArr)
    recall = truePositiveArr / (truePositiveArr + falseNegativeArr)
    
    # MACRO AVERAGING
    macroPrecision = precision.mean() 
    macroRecall = recall.mean()
    macroNegativePredictiveValue = (trueNegativeArr / (falseNegativeArr + trueNegativeArr)).mean()
    macroFalsePositiveRate = (falsePositiveArr / (falsePositiveArr + trueNegativeArr)).mean()
    macroFalseDiscoveryRate = (falsePositiveArr / (falsePositiveArr + truePositiveArr)).mean()
    macroF1Score = ((2 * precision * recall) / (precision + recall)).mean()
    macroF2Score = ((5 * precision * recall) / (4 * precision + recall)).mean()
        
    # MICRO AVERAGING
    microPrecision = truePositiveArr.sum() / ((truePositiveArr + falsePositiveArr).sum())
    microRecall = truePositiveArr.sum() / ((truePositiveArr + falseNegativeArr).sum())
    microNegativePredictiveValue = trueNegatives.sum() / ((falseNegatives + trueNegatives).sum())
    microFalsePositiveRate = falsePositives.sum() / ((falsePositives + trueNegatives).sum())
    microFalseDiscoveryRate = falsePositives.sum() / ((falsePositives + truePositives).sum())
    microF1Score = (2 * precision * recall).sum() / ((precision + recall).sum())
    microF2Score = (5 * precision * recall).sum() / ((4 * precision + recall).sum())
    
    # printing results
    print('-----------------------------------------------------------------')
    print('-------------------------MACRO AVERAGING-------------------------')
    print('-----------------------------------------------------------------')
    print("Macro Precision: " + str(macroPrecision))
    print("Macro Recall: " + str(macroRecall))
    print("Macro Negative Predictive Value: " + str(macroNegativePredictiveValue))
    print("Macro False Positive Rate: " + str(macroFalsePositiveRate))
    print("Macro False Discovery Rate: " + str(macroFalseDiscoveryRate))
    print("Macro F1 Score: " + str(macroF1Score))
    print("Macro F2 Score: " + str(macroF2Score))
    print()
    
    print('-----------------------------------------------------------------')
    print('-------------------------MICRO AVERAGING-------------------------')
    print('-----------------------------------------------------------------')
    print("Micro Precision: " + str(microPrecision))
    print("Micro Recall: " + str(microRecall))
    print("Micro Negative Predictive Value: " + str(microNegativePredictiveValue))
    print("Micro False Positive Rate: " + str(microFalsePositiveRate))
    print("Micro False Discovery Rate: " + str(microFalseDiscoveryRate))
    print("Micro F1 Score: " + str(microF1Score))
    print("Micro F2 Score: " + str(microF2Score))
    print()

In [5]:
# printing results of micro and macro averaging for SVM hard margin.
def printMicroMacroResultsHard(npTrainFeatures, npTrainLabels, npTestFeatures, npTestLabels):
    
    gammaValues = np.array([
        math.pow(2, -4), 
        math.pow(2, -3), 
        math.pow(2, -2), 
        math.pow(2,  0), 
        math.pow(2,  1)
    ])
    
    truePositiveArr  = np.zeros(5)
    falsePositiveArr = np.zeros(5)
    falseNegativeArr = np.zeros(5)
    trueNegativeArr  = np.zeros(5)
    
    for i in range(gammaValues.shape[0]):
        from sklearn.svm import SVC
        svmSoftMarginModel = SVC(random_state=1, gamma=gammaValues[i])
        svmSoftMarginModel.fit(trainFeatures, trainLabels)
        predictions = svmSoftMarginModel.predict(npTestFeatures).reshape(-1,1)
 
        truePositives  = np.sum( np.logical_and( npTestLabels, predictions) )
        falsePositives = np.sum( np.logical_and( np.logical_not(npTestLabels), predictions) )
        falseNegatives  = np.sum( np.logical_and( npTestLabels, np.logical_not(predictions)) )
        trueNegatives = np.sum( np.logical_and( np.logical_not(npTestLabels), np.logical_not(predictions)) )
        
        # adding one to avoid getting nan values 
        truePositiveArr[i] = truePositives + 1
        falsePositiveArr[i] = falsePositives + 1
        falseNegativeArr[i] = falseNegatives + 1
        trueNegativeArr[i] = trueNegatives + 1
        
    precision = truePositiveArr / (truePositiveArr + falsePositiveArr)
    recall = truePositiveArr / (truePositiveArr + falseNegativeArr)
    
    # MACRO AVERAGING
    macroPrecision = precision.mean() 
    macroRecall = recall.mean()
    macroNegativePredictiveValue = (trueNegativeArr / (falseNegativeArr + trueNegativeArr)).mean()
    macroFalsePositiveRate = (falsePositiveArr / (falsePositiveArr + trueNegativeArr)).mean()
    macroFalseDiscoveryRate = (falsePositiveArr / (falsePositiveArr + truePositiveArr)).mean()
    macroF1Score = ((2 * precision * recall) / (precision + recall)).mean()
    macroF2Score = ((5 * precision * recall) / (4 * precision + recall)).mean()
        
    # MICRO AVERAGING
    microPrecision = truePositiveArr.sum() / ((truePositiveArr + falsePositiveArr).sum())
    microRecall = truePositiveArr.sum() / ((truePositiveArr + falseNegativeArr).sum())
    microNegativePredictiveValue = trueNegatives.sum() / ((falseNegatives + trueNegatives).sum())
    microFalsePositiveRate = falsePositives.sum() / ((falsePositives + trueNegatives).sum())
    microFalseDiscoveryRate = falsePositives.sum() / ((falsePositives + truePositives).sum())
    microF1Score = (2 * precision * recall).sum() / ((precision + recall).sum())
    microF2Score = (5 * precision * recall).sum() / ((4 * precision + recall).sum())
    
    # printing results
    print('-----------------------------------------------------------------')
    print('-------------------------MACRO AVERAGING-------------------------')
    print('-----------------------------------------------------------------')
    print("Macro Precision: " + str(macroPrecision))
    print("Macro Recall: " + str(macroRecall))
    print("Macro Negative Predictive Value: " + str(macroNegativePredictiveValue))
    print("Macro False Positive Rate: " + str(macroFalsePositiveRate))
    print("Macro False Discovery Rate: " + str(macroFalseDiscoveryRate))
    print("Macro F1 Score: " + str(macroF1Score))
    print("Macro F2 Score: " + str(macroF2Score))
    print()
    
    print('-----------------------------------------------------------------')
    print('-------------------------MICRO AVERAGING-------------------------')
    print('-----------------------------------------------------------------')
    print("Micro Precision: " + str(microPrecision))
    print("Micro Recall: " + str(microRecall))
    print("Micro Negative Predictive Value: " + str(microNegativePredictiveValue))
    print("Micro False Positive Rate: " + str(microFalsePositiveRate))
    print("Micro False Discovery Rate: " + str(microFalseDiscoveryRate))
    print("Micro F1 Score: " + str(microF1Score))
    print("Micro F2 Score: " + str(microF2Score))
    print()

<strong>Question 4.1: </strong> In this part, you will train a linear SVM model with soft margin without using any kernels. Your model's hyper-parameter is C. Using 10-fold cross validation on your training set, and the optimum C value of your model. Look for the best C value with line search in the following range [10􀀀3; 10􀀀210􀀀1; 10101; 102] and calculate accuracy on the left-out fold. For each value of C, calculate mean cross validation accuracy by changing the left-out fold each time and plot it in a nice form. Report your optimum C value. Then, run your model on the test set with this C value and report test set accuracy with the confusion matrix. Calculate and report micro and macro averages of precision, recall, negative predictive value (NPV), false positive rate (FPR), false discovery rate (FDR), F1 and F2 scores.

In [6]:
# SOFT MARGIN
splitsFeatures = np.split( npTrainFeatures, 10 )
splitsLabels = np.split( npTrainLabels, 10 )
cValues = np.array([0.001, 0.01, 0.1, 10, 100])
acurraciesSoft = np.zeros((5, 10))

for i in range(cValues.shape[0]):
    for j in range(10):
        trainFeatures = np.zeros((1,8))
        trainLabels = np.zeros((1,1))
        validationFeatures = np.zeros((1,8))
        validationLabels = np.zeros((1,1))
    
        for k in range(10):
            if k != j:
                trainFeatures = np.vstack((trainFeatures, splitsFeatures[k]))
                trainLabels = np.vstack((trainLabels, splitsLabels[k]))
            else:
                validationFeatures = np.vstack((validationFeatures, splitsFeatures[k]))
                validationLabels = np.vstack((validationLabels, splitsLabels[k]))
    
        validationFeatures = np.delete(validationFeatures, (0), axis=0)
        validationLabels = np.delete(validationLabels, (0), axis=0)
        trainFeatures = np.delete(trainFeatures, (0), axis=0)
        trainLabels = np.delete(trainLabels, (0), axis=0)
    
        from sklearn.svm import SVC
        svmSoftMargin = SVC(random_state=1, C=cValues[i])
        svmSoftMargin.fit(trainFeatures, trainLabels)
        acc = svmSoftMargin.score(validationFeatures, validationLabels)
        acurraciesSoft[i, j] = acc

In [7]:
accurraciesAveragesSoft = acurraciesSoft.mean(axis=1)
print("Acurracies for Soft Margin")
print("--------------------------")
print(accurraciesAveragesSoft)
# for C = 100, we obtained the best acurrracy value.

Acurracies for Soft Margin
--------------------------
[0.60371429 0.8455     0.85307143 0.85814286 0.85914286]


In [8]:
# printing the test results of C value with highest acurracy (C = 100)
printTestResultsForSoftMargin(npTrainFeatures, npTrainLabels, npTestFeatures, npTestLabels, 100)

C-Value 100: 
-----------------------------------------------------
True Positives: 2099
False Positives: 68
False Negatives: 243
True Negatives: 968
Acurracy: 0.9079336885731202
Precision: 0.9686202122750346
Recall: 0.8962425277540563
Negative Predictive Value: 0.7993393889347646
False Positive Rate: 0.06563706563706563
False Discovery Rate: 0.03137978772496539
F1 Score: 0.9310268352184519
F2 Score: 0.9098396185522324



In [9]:
# printing micro and macro average of Soft Margin SVM
printMicroMacroResultsSoft(npTrainFeatures, npTrainLabels, npTestFeatures, npTestLabels)

-----------------------------------------------------------------
-------------------------MACRO AVERAGING-------------------------
-----------------------------------------------------------------
Macro Precision: 0.8969372261776188
Macro Recall: 0.9318259385665529
Macro Negative Predictive Value: 0.7584702454509007
Macro False Positive Rate: 0.2926782273603083
Macro False Discovery Rate: 0.10306277382238113
Macro F1 Score: 0.9082555621715149
Macro F2 Score: 0.9205012906528423

-----------------------------------------------------------------
-------------------------MICRO AVERAGING-------------------------
-----------------------------------------------------------------
Micro Precision: 0.8778938906752412
Micro Recall: 0.9318259385665529
Micro Negative Predictive Value: 0.7993393889347646
Micro False Positive Rate: 0.06563706563706563
Micro False Discovery Rate: 0.03137978772496539
Micro F1 Score: 0.9099445584925333
Micro F2 Score: 0.9204809900459388



<strong>Question 4.2: </strong> This time, use radial basis function (RBF) kernel to train your hard margin SVM model on the processed (discretized) dataset from Question 3. RBF kernel is dened a sIn RBF kernel formula, = 􀀀 1 22 is a free parameter that can be ne-tuned. This parameter is the inverse of the radius the in uence of samples selected by the model as support vectors. Similar to linear SVM part, train a SVM classier with RBF kernel using same training and test sets you have used in linear SVM model above. In addition to the penalty parameter C, is your new hyper-parameter that needs be optimized. Using 10-fold cross validation and calculating mean cross validation accuracy as described in Question 4.1, nd and report the best within the interval from the logarithmic scale [2􀀀4; 2􀀀3; 2􀀀220; 21]. After tuning on your training set, run your model on the test set and report your accuracy along with the confusion matrix. Calculate and report micro and macro averages of precision, recall, negative predictive value (NPV), false positive rate (FPR), false discovery rate (FDR), F1 and F2 scores.

In [10]:
# GAMMA VALUES
import math
splitsFeatures = np.split( npTrainFeatures, 10 )
splitsLabels = np.split( npTrainLabels, 10 )
gammaValues = np.array([
    math.pow(2, -4), 
    math.pow(2, -3), 
    math.pow(2, -2), 
    math.pow(2,  0), 
    math.pow(2,  1)
])

acurraciesHard = np.zeros((5, 10))

for i in range(gammaValues.shape[0]):
    for j in range(10):
        trainFeatures = np.zeros((1,8))
        trainLabels = np.zeros((1,1))
        validationFeatures = np.zeros((1,8))
        validationLabels = np.zeros((1,1))
    
        for k in range(10):
            if k != j:
                trainFeatures = np.vstack((trainFeatures, splitsFeatures[k]))
                trainLabels = np.vstack((trainLabels, splitsLabels[k]))
            else:
                validationFeatures = np.vstack((validationFeatures, splitsFeatures[k]))
                validationLabels = np.vstack((validationLabels, splitsLabels[k]))
    
        validationFeatures = np.delete(validationFeatures, (0), axis=0)
        validationLabels = np.delete(validationLabels, (0), axis=0)
        trainFeatures = np.delete(trainFeatures, (0), axis=0)
        trainLabels = np.delete(trainLabels, (0), axis=0)
    
        from sklearn.svm import SVC
        svmSoftMargin = SVC(random_state=1, gamma=gammaValues[i])
        svmSoftMargin.fit(trainFeatures, trainLabels)
        acc = svmSoftMargin.score(validationFeatures, validationLabels)
        acurraciesHard[i, j] = acc

In [11]:
accurraciesAveragesHard = acurraciesHard.mean(axis=1)
print("Acurracies for Hard Margin")
print("--------------------------")
print(accurraciesAveragesHard)
# for gamma = 2^-5, we obtained the best acurrracy value.

Acurracies for Hard Margin
--------------------------
[0.85864286 0.8565     0.85307143 0.8415     0.8325    ]


In [12]:
# printing the test results of gamma value with highest acurracy (gamma = 100)
printTestResultsForHardMargin(npTrainFeatures, npTrainLabels, npTestFeatures, npTestLabels, gammaValues[0])

Gamma-Value 0.0625: 
-----------------------------------------------------
True Positives: 2127
False Positives: 79
False Negatives: 215
True Negatives: 957
Acurracy: 0.9129662522202486
Precision: 0.9641885766092475
Recall: 0.908198121263877
Negative Predictive Value: 0.8165529010238908
False Positive Rate: 0.07625482625482626
False Discovery Rate: 0.035811423390752495
F1 Score: 0.9353562005277045
F2 Score: 0.9188698807672369



In [13]:
# printing micro and macro average of Hard Margin SVM
printMicroMacroResultsHard(npTrainFeatures, npTrainLabels, npTestFeatures, npTestLabels)

-----------------------------------------------------------------
-------------------------MACRO AVERAGING-------------------------
-----------------------------------------------------------------
Macro Precision: 0.9616350966102394
Macro Recall: 0.9088737201365188
Macro Negative Predictive Value: 0.8169058999375503
Macro False Positive Rate: 0.08188824662813102
Macro False Discovery Rate: 0.038364903389760716
Macro F1 Score: 0.9345092501269663
Macro F2 Score: 0.9189570526732729

-----------------------------------------------------------------
-------------------------MICRO AVERAGING-------------------------
-----------------------------------------------------------------
Micro Precision: 0.9616322108874243
Micro Recall: 0.9088737201365188
Micro Negative Predictive Value: 0.8169257340241797
Micro False Positive Rate: 0.08687258687258688
Micro False Discovery Rate: 0.04054054054054054
Micro F1 Score: 0.9345103005107307
Micro F2 Score: 0.9189576960438709

