# Neural Network Classifier Testing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import NearMiss 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
xTestData = np.genfromtxt("data/X_test.txt", delimiter = None, skip_header=1)
xTrainData = np.genfromtxt("data/X_train.txt", delimiter = None, skip_header=1)
yTrainData = np.genfromtxt("data/Y_train.txt", delimiter = None, skip_header=1)

In [3]:
xTrain, xVal, yTrain, yVal = train_test_split(xTrainData, yTrainData, test_size=0.25, shuffle = True)

In [4]:
yTrainDist = yTrain[:, 1]
yTrainZ = yTrain[:, 0]
yValDist = yVal[:,1]
yValZ = yVal[:,0]

xTestIds = xTestData[:,0]

#OverSampler
sm = SMOTE(random_state = 2)

#UnderSampler
nr = NearMiss()

**Helper method for printing Score for KNN(1-50 neighbors)**

In [108]:
def printScores(xTrain, yTrain, xVal):
    scale = StandardScaler().fit(xTrain)
    xTrainScaled = scale.transform(xTrain)
    xValScaled = scale.transform(xVal)

    activationTypes = ["identity", "logistic", "tanh", "relu"] # Logistic is sigmoid
    
    for activate in activationTypes:
        clf = MLPClassifier(activation=activate,hidden_layer_sizes=(2,2), random_state=1, max_iter = 2000)
        clf.fit(xTrainScaled, yTrain.ravel())
        perc = "{:.2%}".format(clf.score(xValScaled, yValDist))
        yPred = clf.predict(xValScaled)
        print(activate + ": " + str(perc) + "\n" + str(classification_report(yValDist, yPred)) + "\n" + str(confusion_matrix(yValDist, yPred)) + "\n")

# Important Features

In [109]:
xTrainImp = xTrain[:,[1,4,5,11,13,8,14,22,20,27]]
xValImp = xVal[:,[1,4,5,11,13,8,14,22,20,27]]
xTestImp = xTestData[:,[1,4,5,11,13,8,14,22,20,27]]

In [110]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainImp, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValImp)

identity: 95.14%
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.97      1130
         1.0       0.53      0.94      0.67        64

    accuracy                           0.95      1194
   macro avg       0.76      0.94      0.82      1194
weighted avg       0.97      0.95      0.96      1194

[[1076   54]
 [   4   60]]

logistic: 95.48%
              precision    recall  f1-score   support

         0.0       1.00      0.95      0.98      1130
         1.0       0.54      0.98      0.70        64

    accuracy                           0.95      1194
   macro avg       0.77      0.97      0.84      1194
weighted avg       0.97      0.95      0.96      1194

[[1077   53]
 [   1   63]]

tanh: 95.64%
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98      1130
         1.0       0.56      0.94      0.70        64

    accuracy                           0.96      1194
   macro avg       0.78  

# Testing Features for Overfit

In [77]:
xTrainFeat = xTrain[:,[1,4,5,11,8,14,22]]
xValFeat = xVal[:,[1,4,5,11,8,14,22]]
xTestFeat = xTestData[:,[1,4,5,11,8,14,22]]

In [78]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFeat, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFeat)

identity: 86.43%
              precision    recall  f1-score   support

         0.0       1.00      0.86      0.92      1130
         1.0       0.28      0.94      0.43        64

    accuracy                           0.86      1194
   macro avg       0.64      0.90      0.67      1194
weighted avg       0.96      0.86      0.90      1194

[[972 158]
 [  4  60]]



  _warn_prf(average, modifier, msg_start, len(result))


logistic: 5.36%
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00      1130
         1.0       0.05      1.00      0.10        64

    accuracy                           0.05      1194
   macro avg       0.03      0.50      0.05      1194
weighted avg       0.00      0.05      0.01      1194

[[   0 1130]
 [   0   64]]

tanh: 97.24%
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99      1130
         1.0       0.67      0.95      0.79        64

    accuracy                           0.97      1194
   macro avg       0.83      0.96      0.89      1194
weighted avg       0.98      0.97      0.97      1194

[[1100   30]
 [   3   61]]

relu: 97.82%
              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99      1130
         1.0       0.74      0.91      0.82        64

    accuracy                           0.98      1194
   macro avg       0.87      0

# All Features

**Features**

In [6]:
xTrainAll = xTrain[:, 1:35]
xValAll = xVal[:, 1:35]
xTestAll = xTestData[:, 1:35]

**Unbalanced**

In [7]:
printScores(xTrainAll, yTrainDist, xValAll)

rbf score: 97.32%
linear score: 96.73%
poly score: 97.07%
sigmoid score: 93.97%


**OverSampling**

In [8]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

rbf score: 95.31%
linear score: 94.97%
poly score: 93.30%
sigmoid score: 86.10%


**UnderSampling**

In [9]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

rbf score: 61.98%
linear score: 96.31%
poly score: 92.46%
sigmoid score: 92.13%


# All first Features of each Category

**Features**

In [10]:
xTrainFirsts = xTrain[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xValFirsts = xVal[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xTestFirsts = xTestData[:,[1,2,3, 4, 10, 20, 23, 27,30]]

**Unbalanced Data**

In [11]:
printScores(xTrainFirsts, yTrainDist, xValFirsts)

rbf score: 96.31%
linear score: 96.06%
poly score: 96.31%
sigmoid score: 93.80%


**OverSampling**

In [12]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

rbf score: 92.38%
linear score: 93.47%
poly score: 88.86%
sigmoid score: 77.47%


**UnderSampling**

In [13]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

rbf score: 76.55%
linear score: 93.89%
poly score: 93.80%
sigmoid score: 93.55%


# All second Features of each Category

**Features**

In [14]:
xTrainSeconds = xTrain[:,[1,2,3, 5, 11, 21, 24, 28,31]]
xValSeconds = xVal[:,[1,2,3, 5, 11, 21, 24, 28,31]]

**Unbalanced**

In [15]:
printScores(xTrainSeconds, yTrainDist, xValSeconds)

rbf score: 96.73%
linear score: 96.48%
poly score: 96.57%
sigmoid score: 93.22%


**OverSampling**

In [16]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

rbf score: 93.55%
linear score: 89.78%
poly score: 89.87%
sigmoid score: 82.83%


**UnderSampling**

In [17]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

rbf score: 59.80%
linear score: 84.67%
poly score: 81.66%
sigmoid score: 90.54%


# All third features of each Category

**Features**

In [18]:
xTrainThirds = xTrain[:,[1,2,3, 6, 12, 22, 25, 29,32]]
xValThirds = xVal[:,[1,2,3, 6, 12, 22, 25, 29,32]]

**Unbalanced**

In [19]:
printScores(xTrainThirds, yTrainDist, xValThirds)

rbf score: 96.57%
linear score: 95.81%
poly score: 96.82%
sigmoid score: 93.38%


**OverSampling**

In [20]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

rbf score: 91.54%
linear score: 83.00%
poly score: 87.19%
sigmoid score: 72.78%


**UnderSampling**

In [21]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

rbf score: 57.29%
linear score: 77.14%
poly score: 72.78%
sigmoid score: 74.87%


# Controls and Liquidity Ratios

**Features**

In [22]:
xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9]]
xValLR = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9]]

**Unbalanced**

In [23]:
printScores(xTrainLR, yTrainDist, xValLR)

rbf score: 96.31%
linear score: 95.90%
poly score: 96.40%
sigmoid score: 93.80%


**OverSampling**

In [24]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

rbf score: 89.87%
linear score: 85.09%
poly score: 86.60%
sigmoid score: 79.40%


**UnderSampling**

In [25]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

rbf score: 31.66%
linear score: 73.20%
poly score: 71.61%
sigmoid score: 86.35%


# Controls and Profitability Ratios

**Features**

In [26]:
xTrainPR = xTrain[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
xValPR = xVal[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

**Unbalanced**

In [27]:
printScores(xTrainPR, yTrainDist, xValPR)

rbf score: 95.98%
linear score: 95.90%
poly score: 95.98%
sigmoid score: 93.55%


**OverSampling**

In [28]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

rbf score: 86.93%
linear score: 84.25%
poly score: 85.51%
sigmoid score: 78.56%


**UnderSampling**

In [29]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

rbf score: 31.16%
linear score: 83.67%
poly score: 79.98%
sigmoid score: 73.45%


# Controls and Profitability Ratios Booleans

In [30]:
xTrainPRB = xTrain[:,[1, 2, 3, 13, 14, 15, 16, 19]]
xValPRB = xVal[:,[1, 2, 3, 13, 14, 15, 16, 19]]

**Unbalanced**

In [31]:
printScores(xTrainPRB, yTrainDist, xValPRB)

rbf score: 95.90%
linear score: 95.90%
poly score: 95.81%
sigmoid score: 94.64%


**OverSampling**

In [32]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

rbf score: 87.86%
linear score: 83.67%
poly score: 81.16%
sigmoid score: 64.66%


**UnderSampling**

In [33]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

rbf score: 29.90%
linear score: 35.68%
poly score: 72.95%
sigmoid score: 65.91%


# Controls and Profitability Ratios Non-Boolean

In [34]:
xTrainPRNB = xTrain[:,[1, 2, 3, 10, 11, 12, 17, 18]]
xValPRNB = xVal[:,[1, 2, 3, 10, 11, 12, 17, 18]]

**Unbalanced**

In [35]:
printScores(xTrainPRB, yTrainDist, xValPRB)

rbf score: 95.90%
linear score: 95.90%
poly score: 95.81%
sigmoid score: 94.64%


**OverSampling**

In [36]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

rbf score: 86.60%
linear score: 83.84%
poly score: 85.93%
sigmoid score: 74.46%


**UnderSampling**

In [37]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

rbf score: 36.10%
linear score: 62.40%
poly score: 55.44%
sigmoid score: 40.12%


# Constants and Capital Structure

**Features**

In [38]:
xTrainCS = xTrain[:,[1, 2, 3, 20, 21, 22]]
xValCS = xVal[:,[1, 2, 3, 20, 21, 22]]

**Unbalanced**

In [39]:
printScores(xTrainCS, yTrainDist, xValCS)

rbf score: 96.65%
linear score: 95.98%
poly score: 96.57%
sigmoid score: 93.47%


**OverSampling**

In [40]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

rbf score: 90.62%
linear score: 89.95%
poly score: 82.75%
sigmoid score: 57.62%


**UnderSampling**

In [41]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

rbf score: 64.91%
linear score: 90.62%
poly score: 86.93%
sigmoid score: 87.44%


# New Tests 4/20/2020

**Features**

In [82]:
xTrainNT = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xValNT = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xTestNT = xTestData[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]

**Unbalanced**

In [43]:
printScores(xTrainNT, yTrainDist, xValNT)

rbf score: 96.23%
linear score: 95.90%
poly score: 96.40%
sigmoid score: 93.38%


**OverSampling**

In [83]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

rbf score: 90.95%
linear score: 85.59%
poly score: 87.94%
sigmoid score: 80.32%


**UnderSampling**

In [45]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

rbf score: 53.94%
linear score: 86.26%
poly score: 85.26%
sigmoid score: 82.83%


# Best UnderSampling Performer

In [46]:
# xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="poly", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake1.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [85]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="poly", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake2.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [87]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestNT)

# svc = svm.SVC(kernel="linear", probability=True)
# svc.fit(xTrainScaled, yTrainBal)
# probs = svc.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("SVCTake2Extra.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [71]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainImp, yTrainDist.ravel())
# # Countdown for hidden layers and perceptrons
# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestImp)

# clf = MLPClassifier(activation='relu',hidden_layer_sizes=(10,9,8,7,6,5,4,3,2,1), random_state=1, max_iter=2000)
# clf.fit(xTrainScaled, yTrainBal)
# probs = clf.predict_proba(xTestScaled)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("NeuralNetworkTake1.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()
# print("Converged")

Converged


In [86]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainImp, yTrainDist.ravel())
# This is for...
scale = StandardScaler().fit(xTrainBal)
xTrainScaled = scale.transform(xTrainBal)
xTestScaled = scale.transform(xTestImp)

clf = MLPClassifier(activation='relu',hidden_layer_sizes=(28,25,22,19,16,13,10,7,4,1), random_state=1, max_iter=4000)
clf.fit(xTrainScaled, yTrainBal)
probs = clf.predict_proba(xTestScaled)
ids = xTestIds.tolist()
probs = probs.tolist()

f = open("NeuralNetworkTake2.txt", "w+")
f.write("Unique Id,DIST\n")

for i in range(len(ids)):
    f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
f.close()
print("Converged")

Converged
