# Support Vector Classifier Testing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import NearMiss 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
xTestData = np.genfromtxt("data/X_test.txt", delimiter = None, skip_header=1)
xTrainData = np.genfromtxt("data/X_train.txt", delimiter = None, skip_header=1)
yTrainData = np.genfromtxt("data/Y_train.txt", delimiter = None, skip_header=1)

In [3]:
xTrain, xVal, yTrain, yVal = train_test_split(xTrainData, yTrainData, test_size=0.25, shuffle = True)

In [4]:
yTrainDist = yTrain[:, 1]
yTrainZ = yTrain[:, 0]
yValDist = yVal[:,1]
yValZ = yVal[:,0]

xTestIds = xTestData[:,0]

#OverSampler
sm = SMOTE(random_state = 2)

#UnderSampler
nr = NearMiss()

**Helper method for printing Score for KNN(1-50 neighbors)**

In [5]:
def printScores(xTrain, yTrain, xVal):
    scale = StandardScaler().fit(xTrain)
    xTrainScaled = scale.transform(xTrain)
    xValScaled = scale.transform(xVal)

    kernelTypes = ["rbf", "linear", "poly", "sigmoid"]
    
    for kType in kernelTypes:
        svc = svm.SVC(kernel=kType, probability=True)
        svc.fit(xTrainScaled, yTrain.ravel())
        perc = "{:.2%}".format(svc.score(xValScaled, yValDist))
        yPred = svc.predict(xValScaled)
        print(kType + ": " + str(perc) + "\n" + str(classification_report(yValDist, yPred)) + "\n" + str(confusion_matrix(yValDist, yPred)) + "\n")

# All Features

**Features**

In [6]:
xTrainAll = xTrain[:, 1:35]
xValAll = xVal[:, 1:35]
xTestAll = xTestData[:, 1:35]

**Unbalanced**

In [7]:
printScores(xTrainAll, yTrainDist, xValAll)

rbf: 96.98%
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      1132
         1.0       0.91      0.47      0.62        62

    accuracy                           0.97      1194
   macro avg       0.94      0.73      0.80      1194
weighted avg       0.97      0.97      0.97      1194

[[1129    3]
 [  33   29]]

linear: 96.65%
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      1132
         1.0       0.81      0.47      0.59        62

    accuracy                           0.97      1194
   macro avg       0.89      0.73      0.79      1194
weighted avg       0.96      0.97      0.96      1194

[[1125    7]
 [  33   29]]

poly: 95.98%
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98      1132
         1.0       0.77      0.32      0.45        62

    accuracy                           0.96      1194
   macro avg       0.87      0.6

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

# All first Features of each Category

**Features**

In [None]:
xTrainFirsts = xTrain[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xValFirsts = xVal[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xTestFirsts = xTestData[:,[1,2,3, 4, 10, 20, 23, 27,30]]

**Unbalanced Data**

In [None]:
printScores(xTrainFirsts, yTrainDist, xValFirsts)

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

# All Firsts plus one

In [None]:
xTrainFirstsP = xTrain[:,[1,2,3,8,11,18,22,26,27,32]]
xValFirstsP = xVal[:,[1,2,3,8,11,18,22,26,27,32]]
xTestFirstsP = xTestData[:,[1,2,3,8,11,18,22,26,27,32]]

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirstsP, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirstsP)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainFirstsP, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirstsP)

# All second Features of each Category

**Features**

In [None]:
xTrainSeconds = xTrain[:,[1,2,3, 5, 11, 21, 24, 28,31]]
xValSeconds = xVal[:,[1,2,3, 5, 11, 21, 24, 28,31]]

**Unbalanced**

In [None]:
printScores(xTrainSeconds, yTrainDist, xValSeconds)

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

# All third features of each Category

**Features**

In [None]:
xTrainThirds = xTrain[:,[1,2,3, 6, 12, 22, 25, 29,32]]
xValThirds = xVal[:,[1,2,3, 6, 12, 22, 25, 29,32]]

**Unbalanced**

In [None]:
printScores(xTrainThirds, yTrainDist, xValThirds)

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

# Controls and Liquidity Ratios

**Features**

In [None]:
xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9]]
xValLR = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9]]

**Unbalanced**

In [None]:
printScores(xTrainLR, yTrainDist, xValLR)

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

# Controls and Profitability Ratios

**Features**

In [None]:
xTrainPR = xTrain[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
xValPR = xVal[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

**Unbalanced**

In [None]:
printScores(xTrainPR, yTrainDist, xValPR)

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

# Controls and Profitability Ratios Booleans

In [None]:
xTrainPRB = xTrain[:,[1, 2, 3, 13, 14, 15, 16, 19]]
xValPRB = xVal[:,[1, 2, 3, 13, 14, 15, 16, 19]]

**Unbalanced**

In [None]:
printScores(xTrainPRB, yTrainDist, xValPRB)

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

# Controls and Profitability Ratios Non-Boolean

In [None]:
xTrainPRNB = xTrain[:,[1, 2, 3, 10, 11, 12, 17, 18]]
xValPRNB = xVal[:,[1, 2, 3, 10, 11, 12, 17, 18]]

**Unbalanced**

In [None]:
printScores(xTrainPRB, yTrainDist, xValPRB)

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

# Constants and Capital Structure

**Features**

In [None]:
xTrainCS = xTrain[:,[1, 2, 3, 20, 21, 22]]
xValCS = xVal[:,[1, 2, 3, 20, 21, 22]]

**Unbalanced**

In [None]:
printScores(xTrainCS, yTrainDist, xValCS)

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

# New Tests 4/20/2020

**Features**

In [None]:
xTrainNT = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xValNT = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xTestNT = xTestData[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]

**Unbalanced**

In [None]:
printScores(xTrainNT, yTrainDist, xValNT)

**OverSampling**

In [None]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

**UnderSampling**

In [None]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

# Best UnderSampling Performer

In [None]:
# xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="poly", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake1.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [None]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="poly", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake2.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [None]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="linear", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake2Extra.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [None]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainFirstsP, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestFirstsP)

# svc = svm.SVC(kernel="linear", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake3.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()