# Support Vector Classifier Testing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import NearMiss 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [2]:
xTestData = np.genfromtxt("data/X_test.txt", delimiter = None, skip_header=1)
xTrainData = np.genfromtxt("data/X_train.txt", delimiter = None, skip_header=1)
yTrainData = np.genfromtxt("data/Y_train.txt", delimiter = None, skip_header=1)

In [3]:
xTrain, xVal, yTrain, yVal = train_test_split(xTrainData, yTrainData, test_size=0.25, shuffle = True)

In [4]:
yTrainDist = yTrain[:, 1]
yTrainZ = yTrain[:, 0]
yValDist = yVal[:,1]
yValZ = yVal[:,0]

xTestIds = xTestData[:,0]

#OverSampler
sm = SMOTE(random_state = 2)

#UnderSampler
nr = NearMiss()

**Helper method for printing Score for KNN(1-50 neighbors)**

In [5]:
def printScores(xTrain, yTrain, xVal):
    scale = StandardScaler().fit(xTrain)
    xTrainScaled = scale.transform(xTrain)
    xValScaled = scale.transform(xVal)

    kernelTypes = ["rbf", "linear", "poly", "sigmoid"]
    
    for kType in kernelTypes:
        svc = svm.SVC(kernel=kType, probability=True)
        svc.fit(xTrainScaled, yTrain.ravel())
#         perc = "{:.2%}".format(svc.score(xValScaled, yValDist))
#         print( kType + " score: " + str(perc))
        yPred = svc.predict(xVal)
        print(kType + ":\n" + str(confusion_matrix(yValDist, yPred)))

# All Features

**Features**

In [6]:
xTrainAll = xTrain[:, 1:35]
xValAll = xVal[:, 1:35]
xTestAll = xTestData[:, 1:35]

**Unbalanced**

In [7]:
printScores(xTrainAll, yTrainDist, xValAll)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1062   71]
 [  33   28]]
poly:
[[695 438]
 [ 52   9]]
sigmoid:
[[968 165]
 [ 31  30]]


**OverSampling**

In [8]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[931 202]
 [ 17  44]]
poly:
[[1047   86]
 [  28   33]]
sigmoid:
[[  12 1121]
 [   3   58]]


**UnderSampling**

In [9]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[   0 1133]
 [   1   60]]
poly:
[[  15 1118]
 [   3   58]]
sigmoid:
[[   2 1131]
 [   1   60]]


# All first Features of each Category

**Features**

In [10]:
xTrainFirsts = xTrain[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xValFirsts = xVal[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xTestFirsts = xTestData[:,[1,2,3, 4, 10, 20, 23, 27,30]]

**Unbalanced Data**

In [11]:
printScores(xTrainFirsts, yTrainDist, xValFirsts)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  56    5]]
poly:
[[1112   21]
 [  45   16]]
sigmoid:
[[1130    3]
 [  61    0]]


**OverSampling**

In [12]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1097   36]
 [  10   51]]
poly:
[[1132    1]
 [  57    4]]
sigmoid:
[[ 111 1022]
 [   2   59]]


**UnderSampling**

In [13]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[  34 1099]
 [   0   61]]
poly:
[[ 124 1009]
 [   0   61]]
sigmoid:
[[1133    0]
 [  61    0]]


# All second Features of each Category

**Features**

In [14]:
xTrainSeconds = xTrain[:,[1,2,3, 5, 11, 21, 24, 28,31]]
xValSeconds = xVal[:,[1,2,3, 5, 11, 21, 24, 28,31]]

**Unbalanced**

In [15]:
printScores(xTrainSeconds, yTrainDist, xValSeconds)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[794 339]
 [ 50  11]]
poly:
[[1059   74]
 [  47   14]]
sigmoid:
[[440 693]
 [ 27  34]]


**OverSampling**

In [16]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[  83 1050]
 [  11   50]]
poly:
[[1052   81]
 [  50   11]]
sigmoid:
[[  52 1081]
 [   9   52]]


**UnderSampling**

In [17]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[  35 1098]
 [   7   54]]
poly:
[[  23 1110]
 [   0   61]]
sigmoid:
[[1133    0]
 [  61    0]]


# All third features of each Category

**Features**

In [18]:
xTrainThirds = xTrain[:,[1,2,3, 6, 12, 22, 25, 29,32]]
xValThirds = xVal[:,[1,2,3, 6, 12, 22, 25, 29,32]]

**Unbalanced**

In [19]:
printScores(xTrainThirds, yTrainDist, xValThirds)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[  58 1075]
 [  33   28]]
sigmoid:
[[1132    1]
 [  61    0]]


**OverSampling**

In [20]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[1133    0]
 [  60    1]]
sigmoid:
[[1130    3]
 [  55    6]]


**UnderSampling**

In [21]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[   0 1133]
 [   2   59]]
poly:
[[1126    7]
 [  55    6]]
sigmoid:
[[   1 1132]
 [   1   60]]


# Controls and Liquidity Ratios

**Features**

In [22]:
xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9]]
xValLR = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9]]

**Unbalanced**

In [23]:
printScores(xTrainLR, yTrainDist, xValLR)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[413 720]
 [  5  56]]
sigmoid:
[[1132    1]
 [  60    1]]


**OverSampling**

In [24]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[917 216]
 [ 19  42]]
sigmoid:
[[1121   12]
 [  61    0]]


**UnderSampling**

In [25]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[   2 1131]
 [   0   61]]
sigmoid:
[[1133    0]
 [  61    0]]


# Controls and Profitability Ratios

**Features**

In [26]:
xTrainPR = xTrain[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
xValPR = xVal[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

**Unbalanced**

In [27]:
printScores(xTrainPR, yTrainDist, xValPR)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[  33 1100]
 [   4   57]]
sigmoid:
[[1133    0]
 [  59    2]]


**OverSampling**

In [28]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  59    2]]
poly:
[[  16 1117]
 [   1   60]]
sigmoid:
[[  87 1046]
 [  10   51]]


**UnderSampling**

In [29]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[  23 1110]
 [   0   61]]
poly:
[[   0 1133]
 [   1   60]]
sigmoid:
[[  23 1110]
 [   0   61]]


# Controls and Profitability Ratios Booleans

In [30]:
xTrainPRB = xTrain[:,[1, 2, 3, 13, 14, 15, 16, 19]]
xValPRB = xVal[:,[1, 2, 3, 13, 14, 15, 16, 19]]

**Unbalanced**

In [31]:
printScores(xTrainPRB, yTrainDist, xValPRB)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[ 130 1003]
 [  14   47]]
sigmoid:
[[1133    0]
 [  59    2]]


**OverSampling**

In [32]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  57    4]]
poly:
[[   0 1133]
 [   1   60]]
sigmoid:
[[   3 1130]
 [   4   57]]


**UnderSampling**

In [33]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[   7 1126]
 [   0   61]]
poly:
[[   0 1133]
 [   1   60]]
sigmoid:
[[778 355]
 [ 36  25]]


# Controls and Profitability Ratios Non-Boolean

In [34]:
xTrainPRNB = xTrain[:,[1, 2, 3, 10, 11, 12, 17, 18]]
xValPRNB = xVal[:,[1, 2, 3, 10, 11, 12, 17, 18]]

**Unbalanced**

In [35]:
printScores(xTrainPRB, yTrainDist, xValPRB)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[ 130 1003]
 [  14   47]]
sigmoid:
[[1133    0]
 [  59    2]]


**OverSampling**

In [36]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[   6 1127]
 [   0   61]]
sigmoid:
[[  25 1108]
 [  11   50]]


**UnderSampling**

In [37]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[   0 1133]
 [   0   61]]
poly:
[[   0 1133]
 [   0   61]]
sigmoid:
[[1060   73]
 [  37   24]]


# Constants and Capital Structure

**Features**

In [38]:
xTrainCS = xTrain[:,[1, 2, 3, 20, 21, 22]]
xValCS = xVal[:,[1, 2, 3, 20, 21, 22]]

**Unbalanced**

In [39]:
printScores(xTrainCS, yTrainDist, xValCS)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  60    1]]
poly:
[[170 963]
 [ 30  31]]
sigmoid:
[[1127    6]
 [  52    9]]


**OverSampling**

In [40]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1131    2]
 [  55    6]]
poly:
[[  17 1116]
 [   6   55]]
sigmoid:
[[   1 1132]
 [   7   54]]


**UnderSampling**

In [41]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[1104   29]
 [  33   28]]
poly:
[[   8 1125]
 [   2   59]]
sigmoid:
[[1130    3]
 [  58    3]]


# New Tests 4/20/2020

**Features**

In [42]:
xTrainNT = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xValNT = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xTestNT = xTestData[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]

**Unbalanced**

In [43]:
printScores(xTrainNT, yTrainDist, xValNT)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1133    0]
 [  61    0]]
poly:
[[626 507]
 [ 10  51]]
sigmoid:
[[1060   73]
 [  60    1]]


**OverSampling**

In [44]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

rbf:
[[1133    0]
 [  61    0]]
linear:
[[1131    2]
 [  59    2]]
poly:
[[1133    0]
 [  61    0]]
sigmoid:
[[1118   15]
 [  61    0]]


**UnderSampling**

In [45]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

rbf:
[[   0 1133]
 [   0   61]]
linear:
[[1085   48]
 [  38   23]]
poly:
[[  17 1116]
 [   0   61]]
sigmoid:
[[1131    2]
 [  60    1]]


# Best UnderSampling Performer

In [46]:
# xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="poly", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake1.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [47]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="poly", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake2.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [48]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="linear", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake2Extra.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

NameError: name 'f' is not defined