# Support Vector Classifier Testing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import NearMiss 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
xTestData = np.genfromtxt("data/X_test.txt", delimiter = None, skip_header=1)
xTrainData = np.genfromtxt("data/X_train.txt", delimiter = None, skip_header=1)
yTrainData = np.genfromtxt("data/Y_train.txt", delimiter = None, skip_header=1)

In [67]:
xTrain, xVal, yTrain, yVal = train_test_split(xTrainData, yTrainData, test_size=0.25, shuffle = True)

In [76]:
yTrainDist = yTrain[:, 1]
yTrainZ = yTrain[:, 0]
yValDist = yVal[:,1]
yValZ = yVal[:,0]

xTestIds = xTestData[:,0]

#OverSampler
sm = SMOTE(random_state = 2)

#UnderSampler
nr = NearMiss()

(3582,)


**Helper method for printing Score for KNN(1-50 neighbors)**

In [69]:
def printScores(xTrain, yTrain, xVal):
    scale = StandardScaler().fit(xTrain)
    xTrainScaled = scale.transform(xTrain)
    xValScaled = scale.transform(xVal)

    kernelTypes = ["rbf", "linear", "poly", "sigmoid"]
    
    for kType in kernelTypes:
        svc = svm.SVC(kernel=kType, probability=True)
        svc.fit(xTrainScaled, yTrain.ravel())
        perc = "{:.2%}".format(svc.score(xValScaled, yValDist))
        yPred = svc.predict(xValScaled)
        print(kType + ": " + str(perc) + "\n" + str(classification_report(yValDist, yPred)) + "\n" + str(confusion_matrix(yValDist, yPred)) + "\n")

# All Features

**Features**

In [6]:
xTrainAll = xTrain[:, 1:35]
xValAll = xVal[:, 1:35]
xTestAll = xTestData[:, 1:35]

**Unbalanced**

In [7]:
printScores(xTrainAll, yTrainDist, xValAll)

rbf: 96.98%
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98      1132
         1.0       0.91      0.47      0.62        62

    accuracy                           0.97      1194
   macro avg       0.94      0.73      0.80      1194
weighted avg       0.97      0.97      0.97      1194

[[1129    3]
 [  33   29]]

linear: 96.65%
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      1132
         1.0       0.81      0.47      0.59        62

    accuracy                           0.97      1194
   macro avg       0.89      0.73      0.79      1194
weighted avg       0.96      0.97      0.96      1194

[[1125    7]
 [  33   29]]

poly: 95.98%
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98      1132
         1.0       0.77      0.32      0.45        62

    accuracy                           0.96      1194
   macro avg       0.87      0.6

**OverSampling**

In [8]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

rbf: 94.81%
              precision    recall  f1-score   support

         0.0       0.99      0.95      0.97      1132
         1.0       0.50      0.87      0.64        62

    accuracy                           0.95      1194
   macro avg       0.75      0.91      0.80      1194
weighted avg       0.97      0.95      0.95      1194

[[1078   54]
 [   8   54]]

linear: 94.97%
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97      1132
         1.0       0.51      0.97      0.67        62

    accuracy                           0.95      1194
   macro avg       0.75      0.96      0.82      1194
weighted avg       0.97      0.95      0.96      1194

[[1074   58]
 [   2   60]]

poly: 93.47%
              precision    recall  f1-score   support

         0.0       0.99      0.94      0.96      1132
         1.0       0.43      0.85      0.58        62

    accuracy                           0.93      1194
   macro avg       0.71      0.9

**UnderSampling**

In [9]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

rbf: 54.86%
              precision    recall  f1-score   support

         0.0       1.00      0.52      0.69      1132
         1.0       0.10      0.98      0.18        62

    accuracy                           0.55      1194
   macro avg       0.55      0.75      0.44      1194
weighted avg       0.95      0.55      0.66      1194

[[594 538]
 [  1  61]]

linear: 93.63%
              precision    recall  f1-score   support

         0.0       1.00      0.93      0.97      1132
         1.0       0.45      0.97      0.61        62

    accuracy                           0.94      1194
   macro avg       0.72      0.95      0.79      1194
weighted avg       0.97      0.94      0.95      1194

[[1058   74]
 [   2   60]]

poly: 90.12%
              precision    recall  f1-score   support

         0.0       0.98      0.91      0.95      1132
         1.0       0.31      0.74      0.44        62

    accuracy                           0.90      1194
   macro avg       0.65      0.83   

# All first Features of each Category

**Features**

In [70]:
xTrainFirsts = xTrain[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xValFirsts = xVal[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xTestFirsts = xTestData[:,[1,2,3, 4, 10, 20, 23, 27,30]]

**Unbalanced Data**

In [11]:
printScores(xTrainFirsts, yTrainDist, xValFirsts)

rbf: 95.23%
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      1132
         1.0       0.67      0.16      0.26        62

    accuracy                           0.95      1194
   macro avg       0.81      0.58      0.62      1194
weighted avg       0.94      0.95      0.94      1194

[[1127    5]
 [  52   10]]

linear: 94.89%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       1.00      0.02      0.03        62

    accuracy                           0.95      1194
   macro avg       0.97      0.51      0.50      1194
weighted avg       0.95      0.95      0.92      1194

[[1132    0]
 [  61    1]]

poly: 95.06%
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97      1132
         1.0       0.59      0.16      0.25        62

    accuracy                           0.95      1194
   macro avg       0.77      0.5

**OverSampling**

In [71]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

rbf: 91.79%
              precision    recall  f1-score   support

         0.0       0.99      0.92      0.96      1140
         1.0       0.34      0.85      0.48        54

    accuracy                           0.92      1194
   macro avg       0.67      0.89      0.72      1194
weighted avg       0.96      0.92      0.93      1194

[[1050   90]
 [   8   46]]

linear: 93.63%
              precision    recall  f1-score   support

         0.0       1.00      0.94      0.97      1140
         1.0       0.41      0.96      0.58        54

    accuracy                           0.94      1194
   macro avg       0.71      0.95      0.77      1194
weighted avg       0.97      0.94      0.95      1194

[[1066   74]
 [   2   52]]

poly: 87.35%
              precision    recall  f1-score   support

         0.0       0.99      0.88      0.93      1140
         1.0       0.24      0.83      0.37        54

    accuracy                           0.87      1194
   macro avg       0.62      0.8

**UnderSampling**

In [13]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

rbf: 75.04%
              precision    recall  f1-score   support

         0.0       1.00      0.74      0.85      1132
         1.0       0.17      0.95      0.28        62

    accuracy                           0.75      1194
   macro avg       0.58      0.85      0.57      1194
weighted avg       0.95      0.75      0.82      1194

[[837 295]
 [  3  59]]

linear: 94.39%
              precision    recall  f1-score   support

         0.0       0.99      0.95      0.97      1132
         1.0       0.48      0.89      0.62        62

    accuracy                           0.94      1194
   macro avg       0.74      0.92      0.80      1194
weighted avg       0.97      0.94      0.95      1194

[[1072   60]
 [   7   55]]

poly: 95.48%
              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98      1132
         1.0       0.54      0.84      0.66        62

    accuracy                           0.95      1194
   macro avg       0.77      0.90   

# All Firsts plus one

In [72]:
xTrainFirstsP = xTrain[:,[1,2,3,8,11,18,22,26,27,32]]
xValFirstsP = xVal[:,[1,2,3,8,11,18,22,26,27,32]]
xTestFirstsP = xTestData[:,[1,2,3,8,11,18,22,26,27,32]]

**OverSampling**

In [73]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirstsP, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirstsP)

rbf: 94.47%
              precision    recall  f1-score   support

         0.0       0.99      0.95      0.97      1140
         1.0       0.44      0.80      0.57        54

    accuracy                           0.94      1194
   macro avg       0.71      0.87      0.77      1194
weighted avg       0.97      0.94      0.95      1194

[[1085   55]
 [  11   43]]

linear: 89.70%
              precision    recall  f1-score   support

         0.0       0.99      0.90      0.94      1140
         1.0       0.29      0.87      0.43        54

    accuracy                           0.90      1194
   macro avg       0.64      0.88      0.69      1194
weighted avg       0.96      0.90      0.92      1194

[[1024  116]
 [   7   47]]

poly: 92.55%
              precision    recall  f1-score   support

         0.0       0.99      0.93      0.96      1140
         1.0       0.35      0.76      0.48        54

    accuracy                           0.93      1194
   macro avg       0.67      0.8

**UnderSampling**

In [16]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainFirstsP, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirstsP)

rbf: 68.76%
              precision    recall  f1-score   support

         0.0       1.00      0.67      0.80      1132
         1.0       0.14      1.00      0.25        62

    accuracy                           0.69      1194
   macro avg       0.57      0.84      0.53      1194
weighted avg       0.96      0.69      0.77      1194

[[759 373]
 [  0  62]]

linear: 96.90%
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.98      1132
         1.0       0.63      0.98      0.77        62

    accuracy                           0.97      1194
   macro avg       0.81      0.98      0.88      1194
weighted avg       0.98      0.97      0.97      1194

[[1096   36]
 [   1   61]]

poly: 96.23%
              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98      1132
         1.0       0.60      0.82      0.69        62

    accuracy                           0.96      1194
   macro avg       0.80      0.90   

# All second Features of each Category

**Features**

In [17]:
xTrainSeconds = xTrain[:,[1,2,3, 5, 11, 21, 24, 28,31]]
xValSeconds = xVal[:,[1,2,3, 5, 11, 21, 24, 28,31]]

**Unbalanced**

In [18]:
printScores(xTrainSeconds, yTrainDist, xValSeconds)

rbf: 95.98%
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      1132
         1.0       0.82      0.29      0.43        62

    accuracy                           0.96      1194
   macro avg       0.89      0.64      0.70      1194
weighted avg       0.95      0.96      0.95      1194

[[1128    4]
 [  44   18]]

linear: 95.64%
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98      1132
         1.0       0.78      0.23      0.35        62

    accuracy                           0.96      1194
   macro avg       0.87      0.61      0.66      1194
weighted avg       0.95      0.96      0.94      1194

[[1128    4]
 [  48   14]]

poly: 96.23%
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      1132
         1.0       0.77      0.39      0.52        62

    accuracy                           0.96      1194
   macro avg       0.87      0.6

**OverSampling**

In [19]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

rbf: 93.38%
              precision    recall  f1-score   support

         0.0       0.99      0.94      0.96      1132
         1.0       0.43      0.82      0.56        62

    accuracy                           0.93      1194
   macro avg       0.71      0.88      0.76      1194
weighted avg       0.96      0.93      0.94      1194

[[1064   68]
 [  11   51]]

linear: 89.87%
              precision    recall  f1-score   support

         0.0       0.99      0.90      0.94      1132
         1.0       0.33      0.89      0.48        62

    accuracy                           0.90      1194
   macro avg       0.66      0.89      0.71      1194
weighted avg       0.96      0.90      0.92      1194

[[1018  114]
 [   7   55]]

poly: 88.53%
              precision    recall  f1-score   support

         0.0       0.99      0.89      0.94      1132
         1.0       0.29      0.85      0.44        62

    accuracy                           0.89      1194
   macro avg       0.64      0.8

**UnderSampling**

In [20]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

rbf: 58.12%
              precision    recall  f1-score   support

         0.0       0.99      0.56      0.72      1132
         1.0       0.10      0.92      0.19        62

    accuracy                           0.58      1194
   macro avg       0.55      0.74      0.45      1194
weighted avg       0.95      0.58      0.69      1194

[[637 495]
 [  5  57]]

linear: 88.11%
              precision    recall  f1-score   support

         0.0       0.99      0.88      0.93      1132
         1.0       0.29      0.87      0.43        62

    accuracy                           0.88      1194
   macro avg       0.64      0.88      0.68      1194
weighted avg       0.96      0.88      0.91      1194

[[998 134]
 [  8  54]]

poly: 83.33%
              precision    recall  f1-score   support

         0.0       0.98      0.84      0.91      1132
         1.0       0.19      0.69      0.30        62

    accuracy                           0.83      1194
   macro avg       0.59      0.77      0

# All third features of each Category

**Features**

In [21]:
xTrainThirds = xTrain[:,[1,2,3, 6, 12, 22, 25, 29,32]]
xValThirds = xVal[:,[1,2,3, 6, 12, 22, 25, 29,32]]

**Unbalanced**

In [22]:
printScores(xTrainThirds, yTrainDist, xValThirds)

rbf: 95.14%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.83      0.08      0.15        62

    accuracy                           0.95      1194
   macro avg       0.89      0.54      0.56      1194
weighted avg       0.95      0.95      0.93      1194

[[1131    1]
 [  57    5]]

linear: 94.89%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       1.00      0.02      0.03        62

    accuracy                           0.95      1194
   macro avg       0.97      0.51      0.50      1194
weighted avg       0.95      0.95      0.92      1194

[[1132    0]
 [  61    1]]

poly: 95.06%
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97      1132
         1.0       0.58      0.18      0.27        62

    accuracy                           0.95      1194
   macro avg       0.77      0.5

**OverSampling**

In [23]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

rbf: 93.30%
              precision    recall  f1-score   support

         0.0       0.99      0.94      0.96      1132
         1.0       0.42      0.76      0.54        62

    accuracy                           0.93      1194
   macro avg       0.70      0.85      0.75      1194
weighted avg       0.96      0.93      0.94      1194

[[1067   65]
 [  15   47]]

linear: 83.17%
              precision    recall  f1-score   support

         0.0       0.99      0.83      0.90      1132
         1.0       0.22      0.89      0.35        62

    accuracy                           0.83      1194
   macro avg       0.61      0.86      0.63      1194
weighted avg       0.95      0.83      0.87      1194

[[938 194]
 [  7  55]]

poly: 88.94%
              precision    recall  f1-score   support

         0.0       0.99      0.89      0.94      1132
         1.0       0.29      0.79      0.43        62

    accuracy                           0.89      1194
   macro avg       0.64      0.84   

**UnderSampling**

In [24]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

rbf: 56.37%
              precision    recall  f1-score   support

         0.0       0.98      0.55      0.71      1132
         1.0       0.09      0.79      0.16        62

    accuracy                           0.56      1194
   macro avg       0.53      0.67      0.43      1194
weighted avg       0.93      0.56      0.68      1194

[[624 508]
 [ 13  49]]

linear: 72.61%
              precision    recall  f1-score   support

         0.0       0.97      0.73      0.84      1132
         1.0       0.11      0.61      0.19        62

    accuracy                           0.73      1194
   macro avg       0.54      0.67      0.51      1194
weighted avg       0.93      0.73      0.80      1194

[[829 303]
 [ 24  38]]

poly: 71.52%
              precision    recall  f1-score   support

         0.0       0.97      0.72      0.83      1132
         1.0       0.10      0.58      0.17        62

    accuracy                           0.72      1194
   macro avg       0.54      0.65      0

# Controls and Liquidity Ratios

**Features**

In [25]:
xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9]]
xValLR = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9]]

**Unbalanced**

In [26]:
printScores(xTrainLR, yTrainDist, xValLR)

rbf: 95.39%
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98      1132
         1.0       0.62      0.29      0.40        62

    accuracy                           0.95      1194
   macro avg       0.79      0.64      0.69      1194
weighted avg       0.94      0.95      0.95      1194

[[1121   11]
 [  44   18]]

linear: 94.81%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.00      0.00      0.00        62

    accuracy                           0.95      1194
   macro avg       0.47      0.50      0.49      1194
weighted avg       0.90      0.95      0.92      1194

[[1132    0]
 [  62    0]]



  'precision', 'predicted', average, warn_for)


poly: 95.06%
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97      1132
         1.0       0.56      0.24      0.34        62

    accuracy                           0.95      1194
   macro avg       0.76      0.62      0.66      1194
weighted avg       0.94      0.95      0.94      1194

[[1120   12]
 [  47   15]]

sigmoid: 92.63%
              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96      1132
         1.0       0.19      0.13      0.15        62

    accuracy                           0.93      1194
   macro avg       0.57      0.55      0.56      1194
weighted avg       0.91      0.93      0.92      1194

[[1098   34]
 [  54    8]]



**OverSampling**

In [27]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

rbf: 89.70%
              precision    recall  f1-score   support

         0.0       0.99      0.90      0.94      1132
         1.0       0.32      0.90      0.48        62

    accuracy                           0.90      1194
   macro avg       0.66      0.90      0.71      1194
weighted avg       0.96      0.90      0.92      1194

[[1015  117]
 [   6   56]]

linear: 85.93%
              precision    recall  f1-score   support

         0.0       0.99      0.86      0.92      1132
         1.0       0.26      0.90      0.40        62

    accuracy                           0.86      1194
   macro avg       0.63      0.88      0.66      1194
weighted avg       0.96      0.86      0.89      1194

[[970 162]
 [  6  56]]

poly: 88.02%
              precision    recall  f1-score   support

         0.0       0.99      0.88      0.93      1132
         1.0       0.29      0.92      0.44        62

    accuracy                           0.88      1194
   macro avg       0.64      0.90   

**UnderSampling**

In [28]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

rbf: 34.51%
              precision    recall  f1-score   support

         0.0       0.95      0.33      0.48      1132
         1.0       0.05      0.71      0.10        62

    accuracy                           0.35      1194
   macro avg       0.50      0.52      0.29      1194
weighted avg       0.91      0.35      0.46      1194

[[368 764]
 [ 18  44]]

linear: 72.70%
              precision    recall  f1-score   support

         0.0       0.97      0.74      0.84      1132
         1.0       0.10      0.53      0.17        62

    accuracy                           0.73      1194
   macro avg       0.53      0.63      0.50      1194
weighted avg       0.92      0.73      0.80      1194

[[835 297]
 [ 29  33]]

poly: 68.68%
              precision    recall  f1-score   support

         0.0       0.97      0.69      0.81      1132
         1.0       0.09      0.58      0.16        62

    accuracy                           0.69      1194
   macro avg       0.53      0.64      0

# Controls and Profitability Ratios

**Features**

In [29]:
xTrainPR = xTrain[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
xValPR = xVal[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

**Unbalanced**

In [30]:
printScores(xTrainPR, yTrainDist, xValPR)

rbf: 94.89%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.60      0.05      0.09        62

    accuracy                           0.95      1194
   macro avg       0.78      0.52      0.53      1194
weighted avg       0.93      0.95      0.93      1194

[[1130    2]
 [  59    3]]



  'precision', 'predicted', average, warn_for)


linear: 94.81%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.00      0.00      0.00        62

    accuracy                           0.95      1194
   macro avg       0.47      0.50      0.49      1194
weighted avg       0.90      0.95      0.92      1194

[[1132    0]
 [  62    0]]

poly: 95.06%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.67      0.10      0.17        62

    accuracy                           0.95      1194
   macro avg       0.81      0.55      0.57      1194
weighted avg       0.94      0.95      0.93      1194

[[1129    3]
 [  56    6]]

sigmoid: 91.62%
              precision    recall  f1-score   support

         0.0       0.95      0.96      0.96      1132
         1.0       0.17      0.16      0.17        62

    accuracy                           0.92      1194
   macro avg       0.56     

**OverSampling**

In [31]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

rbf: 86.52%
              precision    recall  f1-score   support

         0.0       0.98      0.87      0.92      1132
         1.0       0.24      0.71      0.35        62

    accuracy                           0.87      1194
   macro avg       0.61      0.79      0.64      1194
weighted avg       0.94      0.87      0.90      1194

[[989 143]
 [ 18  44]]

linear: 84.34%
              precision    recall  f1-score   support

         0.0       0.99      0.84      0.91      1132
         1.0       0.22      0.82      0.35        62

    accuracy                           0.84      1194
   macro avg       0.61      0.83      0.63      1194
weighted avg       0.95      0.84      0.88      1194

[[956 176]
 [ 11  51]]

poly: 83.84%
              precision    recall  f1-score   support

         0.0       0.99      0.84      0.91      1132
         1.0       0.21      0.77      0.33        62

    accuracy                           0.84      1194
   macro avg       0.60      0.81      0

**UnderSampling**

In [32]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

rbf: 27.72%
              precision    recall  f1-score   support

         0.0       0.93      0.26      0.40      1132
         1.0       0.05      0.65      0.08        62

    accuracy                           0.28      1194
   macro avg       0.49      0.45      0.24      1194
weighted avg       0.88      0.28      0.39      1194

[[291 841]
 [ 22  40]]

linear: 79.73%
              precision    recall  f1-score   support

         0.0       0.99      0.80      0.88      1132
         1.0       0.18      0.81      0.29        62

    accuracy                           0.80      1194
   macro avg       0.58      0.80      0.59      1194
weighted avg       0.94      0.80      0.85      1194

[[902 230]
 [ 12  50]]

poly: 67.92%
              precision    recall  f1-score   support

         0.0       0.95      0.70      0.81      1132
         1.0       0.06      0.32      0.09        62

    accuracy                           0.68      1194
   macro avg       0.50      0.51      0

# Controls and Profitability Ratios Booleans

In [33]:
xTrainPRB = xTrain[:,[1, 2, 3, 13, 14, 15, 16, 19]]
xValPRB = xVal[:,[1, 2, 3, 13, 14, 15, 16, 19]]

**Unbalanced**

In [34]:
printScores(xTrainPRB, yTrainDist, xValPRB)

  'precision', 'predicted', average, warn_for)


rbf: 94.81%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.00      0.00      0.00        62

    accuracy                           0.95      1194
   macro avg       0.47      0.50      0.49      1194
weighted avg       0.90      0.95      0.92      1194

[[1132    0]
 [  62    0]]



  'precision', 'predicted', average, warn_for)


linear: 94.81%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.00      0.00      0.00        62

    accuracy                           0.95      1194
   macro avg       0.47      0.50      0.49      1194
weighted avg       0.90      0.95      0.92      1194

[[1132    0]
 [  62    0]]

poly: 94.72%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.00      0.00      0.00        62

    accuracy                           0.95      1194
   macro avg       0.47      0.50      0.49      1194
weighted avg       0.90      0.95      0.92      1194

[[1131    1]
 [  62    0]]

sigmoid: 93.89%
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97      1132
         1.0       0.24      0.08      0.12        62

    accuracy                           0.94      1194
   macro avg       0.59     

**OverSampling**

In [35]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

rbf: 87.35%
              precision    recall  f1-score   support

         0.0       0.98      0.88      0.93      1132
         1.0       0.25      0.69      0.36        62

    accuracy                           0.87      1194
   macro avg       0.61      0.79      0.65      1194
weighted avg       0.94      0.87      0.90      1194

[[1000  132]
 [  19   43]]

linear: 83.17%
              precision    recall  f1-score   support

         0.0       0.98      0.84      0.90      1132
         1.0       0.20      0.76      0.32        62

    accuracy                           0.83      1194
   macro avg       0.59      0.80      0.61      1194
weighted avg       0.94      0.83      0.87      1194

[[946 186]
 [ 15  47]]

poly: 79.56%
              precision    recall  f1-score   support

         0.0       0.98      0.80      0.88      1132
         1.0       0.16      0.68      0.26        62

    accuracy                           0.80      1194
   macro avg       0.57      0.74   

**UnderSampling**

In [36]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

rbf: 22.61%
              precision    recall  f1-score   support

         0.0       0.93      0.20      0.33      1132
         1.0       0.05      0.71      0.09        62

    accuracy                           0.23      1194
   macro avg       0.49      0.45      0.21      1194
weighted avg       0.88      0.23      0.32      1194

[[226 906]
 [ 18  44]]

linear: 32.41%
              precision    recall  f1-score   support

         0.0       0.97      0.30      0.46      1132
         1.0       0.06      0.81      0.11        62

    accuracy                           0.32      1194
   macro avg       0.51      0.55      0.28      1194
weighted avg       0.92      0.32      0.44      1194

[[337 795]
 [ 12  50]]

poly: 50.67%
              precision    recall  f1-score   support

         0.0       0.93      0.52      0.67      1132
         1.0       0.03      0.29      0.06        62

    accuracy                           0.51      1194
   macro avg       0.48      0.40      0

# Controls and Profitability Ratios Non-Boolean

In [37]:
xTrainPRNB = xTrain[:,[1, 2, 3, 10, 11, 12, 17, 18]]
xValPRNB = xVal[:,[1, 2, 3, 10, 11, 12, 17, 18]]

**Unbalanced**

In [38]:
printScores(xTrainPRB, yTrainDist, xValPRB)

  'precision', 'predicted', average, warn_for)


rbf: 94.81%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.00      0.00      0.00        62

    accuracy                           0.95      1194
   macro avg       0.47      0.50      0.49      1194
weighted avg       0.90      0.95      0.92      1194

[[1132    0]
 [  62    0]]



  'precision', 'predicted', average, warn_for)


linear: 94.81%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.00      0.00      0.00        62

    accuracy                           0.95      1194
   macro avg       0.47      0.50      0.49      1194
weighted avg       0.90      0.95      0.92      1194

[[1132    0]
 [  62    0]]

poly: 94.72%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.00      0.00      0.00        62

    accuracy                           0.95      1194
   macro avg       0.47      0.50      0.49      1194
weighted avg       0.90      0.95      0.92      1194

[[1131    1]
 [  62    0]]

sigmoid: 93.89%
              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97      1132
         1.0       0.24      0.08      0.12        62

    accuracy                           0.94      1194
   macro avg       0.59     

**OverSampling**

In [39]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

rbf: 83.17%
              precision    recall  f1-score   support

         0.0       0.98      0.84      0.90      1132
         1.0       0.20      0.74      0.31        62

    accuracy                           0.83      1194
   macro avg       0.59      0.79      0.61      1194
weighted avg       0.94      0.83      0.87      1194

[[947 185]
 [ 16  46]]

linear: 83.84%
              precision    recall  f1-score   support

         0.0       0.99      0.84      0.91      1132
         1.0       0.22      0.85      0.35        62

    accuracy                           0.84      1194
   macro avg       0.61      0.85      0.63      1194
weighted avg       0.95      0.84      0.88      1194

[[948 184]
 [  9  53]]

poly: 84.76%
              precision    recall  f1-score   support

         0.0       0.99      0.85      0.91      1132
         1.0       0.22      0.79      0.35        62

    accuracy                           0.85      1194
   macro avg       0.61      0.82      0

**UnderSampling**

In [40]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

rbf: 49.33%
              precision    recall  f1-score   support

         0.0       0.97      0.48      0.64      1132
         1.0       0.07      0.73      0.13        62

    accuracy                           0.49      1194
   macro avg       0.52      0.60      0.39      1194
weighted avg       0.92      0.49      0.62      1194

[[544 588]
 [ 17  45]]

linear: 78.98%
              precision    recall  f1-score   support

         0.0       0.98      0.79      0.88      1132
         1.0       0.17      0.77      0.28        62

    accuracy                           0.79      1194
   macro avg       0.58      0.78      0.58      1194
weighted avg       0.94      0.79      0.85      1194

[[895 237]
 [ 14  48]]

poly: 83.25%
              precision    recall  f1-score   support

         0.0       0.97      0.85      0.91      1132
         1.0       0.15      0.48      0.23        62

    accuracy                           0.83      1194
   macro avg       0.56      0.67      0

# Constants and Capital Structure

**Features**

In [41]:
xTrainCS = xTrain[:,[1, 2, 3, 20, 21, 22]]
xValCS = xVal[:,[1, 2, 3, 20, 21, 22]]

**Unbalanced**

In [42]:
printScores(xTrainCS, yTrainDist, xValCS)

rbf: 95.39%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98      1132
         1.0       0.89      0.13      0.23        62

    accuracy                           0.95      1194
   macro avg       0.92      0.56      0.60      1194
weighted avg       0.95      0.95      0.94      1194

[[1131    1]
 [  54    8]]

linear: 94.89%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       1.00      0.02      0.03        62

    accuracy                           0.95      1194
   macro avg       0.97      0.51      0.50      1194
weighted avg       0.95      0.95      0.92      1194

[[1132    0]
 [  61    1]]

poly: 94.97%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       1.00      0.03      0.06        62

    accuracy                           0.95      1194
   macro avg       0.97      0.5

**OverSampling**

In [43]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

rbf: 90.79%
              precision    recall  f1-score   support

         0.0       0.97      0.93      0.95      1132
         1.0       0.29      0.53      0.38        62

    accuracy                           0.91      1194
   macro avg       0.63      0.73      0.66      1194
weighted avg       0.94      0.91      0.92      1194

[[1051   81]
 [  29   33]]

linear: 88.61%
              precision    recall  f1-score   support

         0.0       0.99      0.89      0.94      1132
         1.0       0.29      0.84      0.43        62

    accuracy                           0.89      1194
   macro avg       0.64      0.86      0.69      1194
weighted avg       0.95      0.89      0.91      1194

[[1006  126]
 [  10   52]]

poly: 82.08%
              precision    recall  f1-score   support

         0.0       0.97      0.84      0.90      1132
         1.0       0.14      0.47      0.21        62

    accuracy                           0.82      1194
   macro avg       0.55      0.6

**UnderSampling**

In [44]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

rbf: 39.70%
              precision    recall  f1-score   support

         0.0       0.99      0.37      0.54      1132
         1.0       0.07      0.90      0.13        62

    accuracy                           0.40      1194
   macro avg       0.53      0.64      0.34      1194
weighted avg       0.94      0.40      0.52      1194

[[418 714]
 [  6  56]]

linear: 84.34%
              precision    recall  f1-score   support

         0.0       0.99      0.84      0.91      1132
         1.0       0.24      0.90      0.37        62

    accuracy                           0.84      1194
   macro avg       0.62      0.87      0.64      1194
weighted avg       0.95      0.84      0.88      1194

[[951 181]
 [  6  56]]

poly: 82.33%
              precision    recall  f1-score   support

         0.0       0.98      0.83      0.90      1132
         1.0       0.19      0.76      0.31        62

    accuracy                           0.82      1194
   macro avg       0.59      0.79      0

# New Tests 4/20/2020

**Features**

In [45]:
xTrainNT = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xValNT = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xTestNT = xTestData[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]

**Unbalanced**

In [46]:
printScores(xTrainNT, yTrainDist, xValNT)

rbf: 95.31%
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98      1132
         1.0       0.65      0.21      0.32        62

    accuracy                           0.95      1194
   macro avg       0.80      0.60      0.65      1194
weighted avg       0.94      0.95      0.94      1194

[[1125    7]
 [  49   13]]

linear: 94.81%
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97      1132
         1.0       0.00      0.00      0.00        62

    accuracy                           0.95      1194
   macro avg       0.47      0.50      0.49      1194
weighted avg       0.90      0.95      0.92      1194

[[1132    0]
 [  62    0]]



  'precision', 'predicted', average, warn_for)


poly: 95.06%
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.97      1132
         1.0       0.56      0.24      0.34        62

    accuracy                           0.95      1194
   macro avg       0.76      0.62      0.66      1194
weighted avg       0.94      0.95      0.94      1194

[[1120   12]
 [  47   15]]

sigmoid: 92.88%
              precision    recall  f1-score   support

         0.0       0.96      0.97      0.96      1132
         1.0       0.26      0.19      0.22        62

    accuracy                           0.93      1194
   macro avg       0.61      0.58      0.59      1194
weighted avg       0.92      0.93      0.92      1194

[[1097   35]
 [  50   12]]



**OverSampling**

In [47]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

rbf: 92.21%
              precision    recall  f1-score   support

         0.0       0.99      0.92      0.96      1132
         1.0       0.39      0.87      0.54        62

    accuracy                           0.92      1194
   macro avg       0.69      0.90      0.75      1194
weighted avg       0.96      0.92      0.94      1194

[[1047   85]
 [   8   54]]

linear: 86.35%
              precision    recall  f1-score   support

         0.0       1.00      0.86      0.92      1132
         1.0       0.27      0.94      0.42        62

    accuracy                           0.86      1194
   macro avg       0.63      0.90      0.67      1194
weighted avg       0.96      0.86      0.90      1194

[[973 159]
 [  4  58]]

poly: 90.87%
              precision    recall  f1-score   support

         0.0       0.99      0.91      0.95      1132
         1.0       0.34      0.84      0.49        62

    accuracy                           0.91      1194
   macro avg       0.67      0.88   

**UnderSampling**

In [48]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

rbf: 52.93%
              precision    recall  f1-score   support

         0.0       0.96      0.52      0.68      1132
         1.0       0.07      0.65      0.12        62

    accuracy                           0.53      1194
   macro avg       0.52      0.58      0.40      1194
weighted avg       0.92      0.53      0.65      1194

[[592 540]
 [ 22  40]]

linear: 81.99%
              precision    recall  f1-score   support

         0.0       0.98      0.83      0.90      1132
         1.0       0.17      0.66      0.28        62

    accuracy                           0.82      1194
   macro avg       0.58      0.74      0.59      1194
weighted avg       0.94      0.82      0.86      1194

[[938 194]
 [ 21  41]]

poly: 79.15%
              precision    recall  f1-score   support

         0.0       0.98      0.80      0.88      1132
         1.0       0.15      0.63      0.24        62

    accuracy                           0.79      1194
   macro avg       0.56      0.71      0

# Best UnderSampling Performer

In [49]:
# xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="poly", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake1.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [50]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="poly", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake2.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [51]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="linear", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake2Extra.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [74]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainFirstsP, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestFirstsP)

# svc = svm.SVC(kernel="rbf", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake3.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [75]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirsts, yTrainDist.ravel())

scale = StandardScaler().fit(xTrainBal)
xTrainScaled = scale.transform(xTrainBal)
xTestScaled = scale.transform(xTestFirsts)

svc = svm.SVC(kernel="linear", probability=True)
svc.fit(xTrainScaled, yTrainBal)
probs = svc.predict_proba(xTestScaled)
ids = xTestIds.tolist()
probs = probs.tolist()

f = open("SVCTake3.txt", "w+")
f.write("Unique Id,DIST\n")

for i in range(len(ids)):
    f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
f.close()