# KNN Testing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import NearMiss 
from sklearn.preprocessing import StandardScaler 

In [2]:
xTestData = np.genfromtxt("data/X_test.txt", delimiter = None, skip_header=1)
xTrainData = np.genfromtxt("data/X_train.txt", delimiter = None, skip_header=1)
yTrainData = np.genfromtxt("data/Y_train.txt", delimiter = None, skip_header=1)

In [3]:
xTrain, xVal, yTrain, yVal = train_test_split(xTrainData, yTrainData, test_size=0.25, shuffle = True)

In [4]:
yTrainDist = yTrain[:, 1]
yTrainZ = yTrain[:, 0]
yValDist = yVal[:,1]
yValZ = yVal[:,0]

xTestIds = xTestData[:,0]

#OverSampler
sm = SMOTE(random_state = 2)

#UnderSampler
nr = NearMiss()

**Helper method for printing Score for KNN(1-50 neighbors)**

In [5]:
def printScores(xTrain, yTrain, xVal):
    scale = StandardScaler().fit(xTrain)
    xTrainScaled = scale.transform(xTrain)
    xValScaled = scale.transform(xVal)

    knn = KNeighborsClassifier(n_neighbors=1)

    for i in range(0, 55, 5):
        if i == 0:
            i = 1
        knn.n_neighbors = i
        knn.fit(xTrainScaled, yTrain.ravel())
        perc = "{:.2%}".format(knn.score(xValScaled, yValDist))
        print( str(i) + " neighbor(s) score: " + str(perc))

# All Features

**Features**

In [57]:
xTrainAll = xTrain[:, 1:35]
xValAll = xVal[:, 1:35]
xTestAll = xTestData[:, 1:35]

**Unbalanced**

In [54]:
printScores(xTrainAll, yTrainDist, xValAll)

1 neighbor(s) score: 96.06%
5 neighbor(s) score: 96.48%
10 neighbor(s) score: 96.15%
15 neighbor(s) score: 96.06%
20 neighbor(s) score: 95.90%
25 neighbor(s) score: 95.90%
30 neighbor(s) score: 95.64%
35 neighbor(s) score: 95.73%
40 neighbor(s) score: 95.48%
45 neighbor(s) score: 95.39%
50 neighbor(s) score: 95.48%


**OverSampling**

In [55]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

1 neighbor(s) score: 94.56%
5 neighbor(s) score: 93.97%
10 neighbor(s) score: 93.13%
15 neighbor(s) score: 91.46%
20 neighbor(s) score: 90.87%
25 neighbor(s) score: 90.87%
30 neighbor(s) score: 90.54%
35 neighbor(s) score: 90.28%
40 neighbor(s) score: 90.45%
45 neighbor(s) score: 89.70%
50 neighbor(s) score: 89.95%


**UnderSampling**

In [56]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainAll, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValAll)

1 neighbor(s) score: 68.17%
5 neighbor(s) score: 71.44%
10 neighbor(s) score: 77.05%
15 neighbor(s) score: 76.13%
20 neighbor(s) score: 79.90%
25 neighbor(s) score: 81.07%
30 neighbor(s) score: 88.69%
35 neighbor(s) score: 88.61%
40 neighbor(s) score: 92.29%
45 neighbor(s) score: 91.12%
50 neighbor(s) score: 93.38%


# All first Features of each Category

**Features**

In [6]:
xTrainFirsts = xTrain[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xValFirsts = xVal[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xTestFirsts = xTestData[:,[1,2,3, 4, 10, 20, 23, 27,30]]

**Unbalanced Data**

In [7]:
printScores(xTrainFirsts, yTrainDist, xValFirsts)

1 neighbor(s) score: 96.82%
5 neighbor(s) score: 96.31%
10 neighbor(s) score: 95.90%
15 neighbor(s) score: 95.90%
20 neighbor(s) score: 95.39%
25 neighbor(s) score: 95.23%
30 neighbor(s) score: 95.06%
35 neighbor(s) score: 95.23%
40 neighbor(s) score: 95.23%
45 neighbor(s) score: 95.31%
50 neighbor(s) score: 95.56%


**OverSampling**

In [8]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

1 neighbor(s) score: 95.64%
5 neighbor(s) score: 93.55%
10 neighbor(s) score: 92.29%
15 neighbor(s) score: 90.62%
20 neighbor(s) score: 90.12%
25 neighbor(s) score: 88.86%
30 neighbor(s) score: 88.94%
35 neighbor(s) score: 88.02%
40 neighbor(s) score: 88.11%
45 neighbor(s) score: 87.27%
50 neighbor(s) score: 87.52%


**UnderSampling**

In [9]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

1 neighbor(s) score: 84.25%
5 neighbor(s) score: 92.04%
10 neighbor(s) score: 95.06%
15 neighbor(s) score: 94.14%
20 neighbor(s) score: 94.89%
25 neighbor(s) score: 94.89%
30 neighbor(s) score: 95.14%
35 neighbor(s) score: 95.23%
40 neighbor(s) score: 95.64%
45 neighbor(s) score: 95.14%
50 neighbor(s) score: 94.72%


# All second Features of each Category

**Features**

In [10]:
xTrainSeconds = xTrain[:,[1,2,3, 5, 11, 21, 24, 28,31]]
xValSeconds = xVal[:,[1,2,3, 5, 11, 21, 24, 28,31]]

**Unbalanced**

In [11]:
printScores(xTrainSeconds, yTrainDist, xValSeconds)

1 neighbor(s) score: 95.98%
5 neighbor(s) score: 97.24%
10 neighbor(s) score: 96.15%
15 neighbor(s) score: 96.57%
20 neighbor(s) score: 96.23%
25 neighbor(s) score: 96.40%
30 neighbor(s) score: 96.31%
35 neighbor(s) score: 96.06%
40 neighbor(s) score: 96.06%
45 neighbor(s) score: 95.98%
50 neighbor(s) score: 95.98%


**OverSampling**

In [12]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

1 neighbor(s) score: 95.39%
5 neighbor(s) score: 92.88%
10 neighbor(s) score: 92.55%
15 neighbor(s) score: 91.71%
20 neighbor(s) score: 92.13%
25 neighbor(s) score: 91.29%
30 neighbor(s) score: 91.71%
35 neighbor(s) score: 91.37%
40 neighbor(s) score: 91.29%
45 neighbor(s) score: 91.04%
50 neighbor(s) score: 90.70%


**UnderSampling**

In [13]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

1 neighbor(s) score: 68.51%
5 neighbor(s) score: 75.96%
10 neighbor(s) score: 83.50%
15 neighbor(s) score: 82.66%
20 neighbor(s) score: 85.93%
25 neighbor(s) score: 84.09%
30 neighbor(s) score: 86.43%
35 neighbor(s) score: 86.60%
40 neighbor(s) score: 90.37%
45 neighbor(s) score: 90.79%
50 neighbor(s) score: 92.80%


# All third features of each Category

**Features**

In [14]:
xTrainThirds = xTrain[:,[1,2,3, 6, 12, 22, 25, 29,32]]
xValThirds = xVal[:,[1,2,3, 6, 12, 22, 25, 29,32]]

**Unbalanced**

In [15]:
printScores(xTrainThirds, yTrainDist, xValThirds)

1 neighbor(s) score: 95.31%
5 neighbor(s) score: 96.57%
10 neighbor(s) score: 96.15%
15 neighbor(s) score: 95.73%
20 neighbor(s) score: 95.31%
25 neighbor(s) score: 95.31%
30 neighbor(s) score: 95.23%
35 neighbor(s) score: 95.14%
40 neighbor(s) score: 95.06%
45 neighbor(s) score: 94.97%
50 neighbor(s) score: 95.06%


**OverSampling**

In [16]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

1 neighbor(s) score: 93.05%
5 neighbor(s) score: 89.78%
10 neighbor(s) score: 90.20%
15 neighbor(s) score: 88.61%
20 neighbor(s) score: 89.45%
25 neighbor(s) score: 88.19%
30 neighbor(s) score: 88.53%
35 neighbor(s) score: 87.52%
40 neighbor(s) score: 87.44%
45 neighbor(s) score: 87.02%
50 neighbor(s) score: 87.77%


**UnderSampling**

In [17]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

1 neighbor(s) score: 67.50%
5 neighbor(s) score: 74.37%
10 neighbor(s) score: 80.49%
15 neighbor(s) score: 80.32%
20 neighbor(s) score: 83.58%
25 neighbor(s) score: 83.75%
30 neighbor(s) score: 85.26%
35 neighbor(s) score: 85.68%
40 neighbor(s) score: 86.60%
45 neighbor(s) score: 86.52%
50 neighbor(s) score: 86.77%


# Controls and Liquidity Ratios

**Features**

In [18]:
xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9]]
xValLR = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9]]

**Unbalanced**

In [19]:
printScores(xTrainLR, yTrainDist, xValLR)

1 neighbor(s) score: 95.14%
5 neighbor(s) score: 95.39%
10 neighbor(s) score: 95.98%
15 neighbor(s) score: 95.81%
20 neighbor(s) score: 95.56%
25 neighbor(s) score: 95.90%
30 neighbor(s) score: 95.64%
35 neighbor(s) score: 95.64%
40 neighbor(s) score: 95.90%
45 neighbor(s) score: 95.98%
50 neighbor(s) score: 96.15%


**OverSampling**

In [20]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

1 neighbor(s) score: 93.22%
5 neighbor(s) score: 91.04%
10 neighbor(s) score: 90.20%
15 neighbor(s) score: 88.02%
20 neighbor(s) score: 87.52%
25 neighbor(s) score: 86.85%
30 neighbor(s) score: 86.01%
35 neighbor(s) score: 85.59%
40 neighbor(s) score: 85.68%
45 neighbor(s) score: 85.43%
50 neighbor(s) score: 85.68%


**UnderSampling**

In [21]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

1 neighbor(s) score: 36.68%
5 neighbor(s) score: 37.35%
10 neighbor(s) score: 40.62%
15 neighbor(s) score: 39.45%
20 neighbor(s) score: 41.12%
25 neighbor(s) score: 42.13%
30 neighbor(s) score: 45.31%
35 neighbor(s) score: 46.82%
40 neighbor(s) score: 50.50%
45 neighbor(s) score: 51.76%
50 neighbor(s) score: 54.52%


# Controls and Profitability Ratios

**Features**

In [22]:
xTrainPR = xTrain[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
xValPR = xVal[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

**Unbalanced**

In [23]:
printScores(xTrainPR, yTrainDist, xValPR)

1 neighbor(s) score: 94.64%
5 neighbor(s) score: 95.39%
10 neighbor(s) score: 95.48%
15 neighbor(s) score: 95.48%
20 neighbor(s) score: 95.56%
25 neighbor(s) score: 95.56%
30 neighbor(s) score: 95.39%
35 neighbor(s) score: 95.48%
40 neighbor(s) score: 95.48%
45 neighbor(s) score: 95.48%
50 neighbor(s) score: 95.31%


**OverSampling**

In [24]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

1 neighbor(s) score: 91.46%
5 neighbor(s) score: 88.86%
10 neighbor(s) score: 87.86%
15 neighbor(s) score: 85.43%
20 neighbor(s) score: 86.01%
25 neighbor(s) score: 85.18%
30 neighbor(s) score: 85.09%
35 neighbor(s) score: 84.25%
40 neighbor(s) score: 85.43%
45 neighbor(s) score: 84.76%
50 neighbor(s) score: 84.84%


**UnderSampling**

In [25]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

1 neighbor(s) score: 33.42%
5 neighbor(s) score: 35.26%
10 neighbor(s) score: 39.87%
15 neighbor(s) score: 36.35%
20 neighbor(s) score: 39.36%
25 neighbor(s) score: 38.94%
30 neighbor(s) score: 43.38%
35 neighbor(s) score: 41.62%
40 neighbor(s) score: 46.06%
45 neighbor(s) score: 47.40%
50 neighbor(s) score: 50.84%


# Controls and Profitability Ratios Booleans

In [26]:
xTrainPRB = xTrain[:,[1, 2, 3, 13, 14, 15, 16, 19]]
xValPRB = xVal[:,[1, 2, 3, 13, 14, 15, 16, 19]]

**Unbalanced**

In [27]:
printScores(xTrainPRB, yTrainDist, xValPRB)

1 neighbor(s) score: 94.97%
5 neighbor(s) score: 96.06%
10 neighbor(s) score: 95.39%
15 neighbor(s) score: 95.39%
20 neighbor(s) score: 95.14%
25 neighbor(s) score: 95.23%
30 neighbor(s) score: 95.06%
35 neighbor(s) score: 95.06%
40 neighbor(s) score: 94.97%
45 neighbor(s) score: 95.06%
50 neighbor(s) score: 94.97%


**OverSampling**

In [28]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

1 neighbor(s) score: 90.87%
5 neighbor(s) score: 90.62%
10 neighbor(s) score: 90.62%
15 neighbor(s) score: 88.19%
20 neighbor(s) score: 88.44%
25 neighbor(s) score: 87.86%
30 neighbor(s) score: 88.53%
35 neighbor(s) score: 88.02%
40 neighbor(s) score: 88.78%
45 neighbor(s) score: 88.27%
50 neighbor(s) score: 88.27%


**UnderSampling**

In [29]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

1 neighbor(s) score: 25.04%
5 neighbor(s) score: 27.72%
10 neighbor(s) score: 28.56%
15 neighbor(s) score: 24.62%
20 neighbor(s) score: 26.21%
25 neighbor(s) score: 24.96%
30 neighbor(s) score: 26.05%
35 neighbor(s) score: 26.13%
40 neighbor(s) score: 28.48%
45 neighbor(s) score: 28.81%
50 neighbor(s) score: 34.00%


# Controls and Profitability Ratios Non-Boolean

In [30]:
xTrainPRNB = xTrain[:,[1, 2, 3, 10, 11, 12, 17, 18]]
xValPRNB = xVal[:,[1, 2, 3, 10, 11, 12, 17, 18]]

**Unbalanced**

In [31]:
printScores(xTrainPRB, yTrainDist, xValPRB)

1 neighbor(s) score: 94.97%
5 neighbor(s) score: 96.06%
10 neighbor(s) score: 95.39%
15 neighbor(s) score: 95.39%
20 neighbor(s) score: 95.14%
25 neighbor(s) score: 95.23%
30 neighbor(s) score: 95.06%
35 neighbor(s) score: 95.06%
40 neighbor(s) score: 94.97%
45 neighbor(s) score: 95.06%
50 neighbor(s) score: 94.97%


**OverSampling**

In [32]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

1 neighbor(s) score: 90.03%
5 neighbor(s) score: 86.68%
10 neighbor(s) score: 85.18%
15 neighbor(s) score: 82.08%
20 neighbor(s) score: 82.16%
25 neighbor(s) score: 80.49%
30 neighbor(s) score: 81.24%
35 neighbor(s) score: 80.49%
40 neighbor(s) score: 80.40%
45 neighbor(s) score: 79.06%
50 neighbor(s) score: 79.31%


**UnderSampling**

In [33]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

1 neighbor(s) score: 56.53%
5 neighbor(s) score: 68.84%
10 neighbor(s) score: 79.31%
15 neighbor(s) score: 79.31%
20 neighbor(s) score: 82.75%
25 neighbor(s) score: 82.41%
30 neighbor(s) score: 85.01%
35 neighbor(s) score: 84.92%
40 neighbor(s) score: 85.18%
45 neighbor(s) score: 84.92%
50 neighbor(s) score: 86.01%


# Constants and Capital Structure

**Features**

In [34]:
xTrainCS = xTrain[:,[1, 2, 3, 20, 21, 22]]
xValCS = xVal[:,[1, 2, 3, 20, 21, 22]]

**Unbalanced**

In [35]:
printScores(xTrainCS, yTrainDist, xValCS)

1 neighbor(s) score: 96.65%
5 neighbor(s) score: 96.90%
10 neighbor(s) score: 96.57%
15 neighbor(s) score: 96.23%
20 neighbor(s) score: 95.64%
25 neighbor(s) score: 95.64%
30 neighbor(s) score: 95.73%
35 neighbor(s) score: 95.64%
40 neighbor(s) score: 95.31%
45 neighbor(s) score: 95.39%
50 neighbor(s) score: 95.23%


**OverSampling**

In [36]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

1 neighbor(s) score: 95.06%
5 neighbor(s) score: 92.71%
10 neighbor(s) score: 93.13%
15 neighbor(s) score: 91.88%
20 neighbor(s) score: 91.96%
25 neighbor(s) score: 91.54%
30 neighbor(s) score: 90.87%
35 neighbor(s) score: 90.28%
40 neighbor(s) score: 90.62%
45 neighbor(s) score: 90.20%
50 neighbor(s) score: 90.70%


**UnderSampling**

In [37]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

1 neighbor(s) score: 43.55%
5 neighbor(s) score: 54.27%
10 neighbor(s) score: 67.42%
15 neighbor(s) score: 73.20%
20 neighbor(s) score: 63.23%
25 neighbor(s) score: 64.41%
30 neighbor(s) score: 70.02%
35 neighbor(s) score: 68.17%
40 neighbor(s) score: 66.42%
45 neighbor(s) score: 68.26%
50 neighbor(s) score: 71.86%


# New Tests 4/20/2020

**Features**

In [38]:
xTrainNT = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xValNT = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]

**Unbalanced**

In [39]:
printScores(xTrainNT, yTrainDist, xValNT)

1 neighbor(s) score: 94.97%
5 neighbor(s) score: 95.64%
10 neighbor(s) score: 95.48%
15 neighbor(s) score: 95.64%
20 neighbor(s) score: 95.48%
25 neighbor(s) score: 95.64%
30 neighbor(s) score: 95.56%
35 neighbor(s) score: 95.56%
40 neighbor(s) score: 95.39%
45 neighbor(s) score: 95.39%
50 neighbor(s) score: 95.14%


**OverSampling**

In [40]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

1 neighbor(s) score: 93.72%
5 neighbor(s) score: 91.46%
10 neighbor(s) score: 90.62%
15 neighbor(s) score: 89.78%
20 neighbor(s) score: 89.61%
25 neighbor(s) score: 89.45%
30 neighbor(s) score: 89.36%
35 neighbor(s) score: 88.78%
40 neighbor(s) score: 89.03%
45 neighbor(s) score: 88.53%
50 neighbor(s) score: 88.53%


**UnderSampling**

In [41]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

1 neighbor(s) score: 52.51%
5 neighbor(s) score: 56.62%
10 neighbor(s) score: 63.82%
15 neighbor(s) score: 63.15%
20 neighbor(s) score: 69.85%
25 neighbor(s) score: 70.44%
30 neighbor(s) score: 85.93%
35 neighbor(s) score: 85.26%
40 neighbor(s) score: 90.87%
45 neighbor(s) score: 90.95%
50 neighbor(s) score: 92.71%


# Best Performer Unbalanced

In [42]:
# xTrainFirsts = xTrain[:,[1,2,3, 4, 10, 20, 23, 27,30]]
# xTestFirsts = xTest[:,[1,2,3, 4, 10, 20, 23, 27,30]]

# knn = KNeighborsClassifier(n_neighbors=10)
# knn.fit(xTrainFirsts, yTrainDist)
# probs = knn.predict_proba(xTestFirsts)
# ids = ySampleIds.tolist()
# probs = probs.tolist()

# f = open("knnTesting.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()




In [43]:
# xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9]]
# xTestLR = xTest[:,[1,2,3, 4, 5, 6, 7, 8,9]]

# knn = KNeighborsClassifier(n_neighbors=30)
# knn.fit(xTrainLR, yTrainDist)
# probs = knn.predict_proba(xTestLR)
# ids = ySampleIds.tolist()
# probs = probs.tolist()

# f = open("knnTestingSub2.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [44]:
# xTrainLR = xTrain[:,[1, 2, 3, 4, 18]]
# xTestLR = xTest[:,[1, 2, 3, 4, 18]]

# knn = KNeighborsClassifier(n_neighbors=20)
# knn.fit(xTrainLR, yTrainDist)
# probs = knn.predict_proba(xTestLR)
# ids = ySampleIds.tolist()
# probs = probs.tolist()

# f = open("knnTestingTake4.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [45]:
# xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8, 9, 26]]
# xTestLR = xTest[:,[1,2,3, 4, 5, 6, 7, 8, 9, 26]]

# knn = KNeighborsClassifier(n_neighbors=30)
# knn.fit(xTrainLR, yTrainDist)
# probs = knn.predict_proba(xTestLR)
# ids = ySampleIds.tolist()
# probs = probs.tolist()

# f = open("knnTestingTake5.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

# Best OverSampling Performer

In [46]:
# xTrainBal, yTrainBal = sm.fit_sample(xTrainFirsts, yTrainDist.ravel())

# scale = StandardScaler().fit(xTrainBal)
# xTrainScaled = scale.transform(xTrainBal)
# xTestScaled = scale.transform(xTestFirsts)

# knn = KNeighborsClassifier(n_neighbors=20)
# knn.fit(xTrainScaled, yTrainBal)
# probs = knn.predict_proba(xTestFirsts)
# ids = xTestIds.tolist()
# probs = probs.tolist()

# f = open("knnTestingTake6.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [62]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainAll, yTrainDist.ravel())

scale = StandardScaler().fit(xTrainBal)
xTrainScaled = scale.transform(xTrainBal)
xTestScaled = scale.transform(xTestAll)

knn = KNeighborsClassifier(n_neighbors=30)
knn.fit(xTrainScaled, yTrainBal)
probs = knn.predict_proba(xTestScaled)
ids = xTestIds.tolist()
probs = probs.tolist()

f = open("knnTestingTake7.txt", "w+")
f.write("Unique Id,DIST\n")

for i in range(len(ids)):
    f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
f.close()