# KNN Testing

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import NearMiss 
from sklearn.preprocessing import StandardScaler 

In [2]:
xTestData = np.genfromtxt("data/X_test.txt", delimiter = None, skip_header=1)
xTrainData = np.genfromtxt("data/X_train.txt", delimiter = None, skip_header=1)
yTrainData = np.genfromtxt("data/Y_train.txt", delimiter = None, skip_header=1)

In [3]:
xTrain, xVal, yTrain, yVal = train_test_split(xTrainData, yTrainData, test_size=0.25, shuffle = True)

In [4]:
yTrainDist = yTrain[:, 1]
yTrainZ = yTrain[:, 0]
yValDist = yVal[:,1]
yValZ = yVal[:,0]

xTestIds = xTestData[:,0]

#OverSampler
sm = SMOTE(random_state = 2)

#UnderSampler
nr = NearMiss()

**Helper method for printing Score for KNN(1-50 neighbors)**

In [5]:
def printScores(xTrain, yTrain, xVal):
    scale = StandardScaler().fit(xTrain)
    xTrainScaled = scale.transform(xTrain)
    xValScaled = scale.transform(xVal)

    knn = KNeighborsClassifier(n_neighbors=1)

    for i in range(0, 55, 5):
        if i == 0:
            i = 1
        knn.n_neighbors = i
        knn.fit(xTrainScaled, yTrain.ravel())
        print( str(i) + " neighbor(s) score: " + str(knn.score(xValScaled, yValDist)))

# All first Features of each Category

**Features**

In [6]:
xTrainFirsts = xTrain[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xValFirsts = xVal[:,[1,2,3, 4, 10, 20, 23, 27,30]]
xTestFirsts = xTestData[:,[1,2,3, 4, 10, 20, 23, 27,30]]

**Unbalanced Data**

In [7]:
printScores(xTrainFirsts, yTrainDist, xValFirsts)

1 neighbor(s) score: 0.9614740368509213
5 neighbor(s) score: 0.9681742043551089
10 neighbor(s) score: 0.9681742043551089
15 neighbor(s) score: 0.9623115577889447
20 neighbor(s) score: 0.9614740368509213
25 neighbor(s) score: 0.9606365159128978
30 neighbor(s) score: 0.9597989949748744
35 neighbor(s) score: 0.9597989949748744
40 neighbor(s) score: 0.9606365159128978
45 neighbor(s) score: 0.9614740368509213
50 neighbor(s) score: 0.9597989949748744


**OverSampling**

In [8]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

1 neighbor(s) score: 0.9581239530988275
5 neighbor(s) score: 0.9321608040201005
10 neighbor(s) score: 0.9246231155778895
15 neighbor(s) score: 0.8969849246231156
20 neighbor(s) score: 0.9020100502512562
25 neighbor(s) score: 0.890284757118928
30 neighbor(s) score: 0.8936348408710217
35 neighbor(s) score: 0.8835845896147404
40 neighbor(s) score: 0.8835845896147404
45 neighbor(s) score: 0.8785594639865997
50 neighbor(s) score: 0.8802345058626466


**UnderSampling**

In [9]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainFirsts, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValFirsts)

1 neighbor(s) score: 0.8107202680067002
5 neighbor(s) score: 0.8726968174204355
10 neighbor(s) score: 0.9413735343383585
15 neighbor(s) score: 0.958961474036851
20 neighbor(s) score: 0.958961474036851
25 neighbor(s) score: 0.9581239530988275
30 neighbor(s) score: 0.957286432160804
35 neighbor(s) score: 0.9581239530988275
40 neighbor(s) score: 0.9581239530988275
45 neighbor(s) score: 0.958961474036851
50 neighbor(s) score: 0.9556113902847572


# All second Features of each Category

**Features**

In [10]:
xTrainSeconds = xTrain[:,[1,2,3, 5, 11, 21, 24, 28,31]]
xValSeconds = xVal[:,[1,2,3, 5, 11, 21, 24, 28,31]]

**Unbalanced**

In [11]:
printScores(xTrainSeconds, yTrainDist, xValSeconds)

1 neighbor(s) score: 0.9715242881072027
5 neighbor(s) score: 0.9715242881072027
10 neighbor(s) score: 0.9656616415410385
15 neighbor(s) score: 0.9656616415410385
20 neighbor(s) score: 0.9623115577889447
25 neighbor(s) score: 0.958961474036851
30 neighbor(s) score: 0.9597989949748744
35 neighbor(s) score: 0.9614740368509213
40 neighbor(s) score: 0.958961474036851
45 neighbor(s) score: 0.958961474036851
50 neighbor(s) score: 0.957286432160804


**OverSampling**

In [12]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

1 neighbor(s) score: 0.9681742043551089
5 neighbor(s) score: 0.9522613065326633
10 neighbor(s) score: 0.9430485762144054
15 neighbor(s) score: 0.932998324958124
20 neighbor(s) score: 0.9346733668341709
25 neighbor(s) score: 0.9296482412060302
30 neighbor(s) score: 0.9304857621440537
35 neighbor(s) score: 0.9262981574539364
40 neighbor(s) score: 0.923785594639866
45 neighbor(s) score: 0.916247906197655
50 neighbor(s) score: 0.9154103852596315


**UnderSampling**

In [13]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainSeconds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValSeconds)

1 neighbor(s) score: 0.6608040201005025
5 neighbor(s) score: 0.6976549413735343
10 neighbor(s) score: 0.7512562814070352
15 neighbor(s) score: 0.7420435510887772
20 neighbor(s) score: 0.7956448911222781
25 neighbor(s) score: 0.8140703517587939
30 neighbor(s) score: 0.8634840871021775
35 neighbor(s) score: 0.8676716917922948
40 neighbor(s) score: 0.9179229480737019
45 neighbor(s) score: 0.9170854271356784
50 neighbor(s) score: 0.9380234505862647


# All third features of each Category

**Features**

In [14]:
xTrainThirds = xTrain[:,[1,2,3, 6, 12, 22, 25, 29,32]]
xValThirds = xVal[:,[1,2,3, 6, 12, 22, 25, 29,32]]

**Unbalanced**

In [15]:
printScores(xTrainThirds, yTrainDist, xValThirds)

1 neighbor(s) score: 0.9614740368509213
5 neighbor(s) score: 0.9631490787269682
10 neighbor(s) score: 0.9564489112227805
15 neighbor(s) score: 0.9581239530988275
20 neighbor(s) score: 0.957286432160804
25 neighbor(s) score: 0.9564489112227805
30 neighbor(s) score: 0.9564489112227805
35 neighbor(s) score: 0.957286432160804
40 neighbor(s) score: 0.9556113902847572
45 neighbor(s) score: 0.9556113902847572
50 neighbor(s) score: 0.9539363484087102


**OverSampling**

In [16]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

1 neighbor(s) score: 0.9413735343383585
5 neighbor(s) score: 0.9095477386934674
10 neighbor(s) score: 0.9020100502512562
15 neighbor(s) score: 0.8886097152428811
20 neighbor(s) score: 0.8927973199329984
25 neighbor(s) score: 0.8877721943048577
30 neighbor(s) score: 0.8869346733668342
35 neighbor(s) score: 0.8785594639865997
40 neighbor(s) score: 0.8793969849246231
45 neighbor(s) score: 0.8701842546063652
50 neighbor(s) score: 0.8752093802345059


**UnderSampling**

In [17]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainThirds, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValThirds)

1 neighbor(s) score: 0.6063651591289783
5 neighbor(s) score: 0.6976549413735343
10 neighbor(s) score: 0.7839195979899497
15 neighbor(s) score: 0.7855946398659966
20 neighbor(s) score: 0.8534338358458962
25 neighbor(s) score: 0.8592964824120602
30 neighbor(s) score: 0.8869346733668342
35 neighbor(s) score: 0.8777219430485762
40 neighbor(s) score: 0.8927973199329984
45 neighbor(s) score: 0.8927973199329984
50 neighbor(s) score: 0.897822445561139


# Controls and Liquidity Ratios

**Features**

In [18]:
xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9]]
xValLR = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9]]

**Unbalanced**

In [19]:
printScores(xTrainLR, yTrainDist, xValLR)

1 neighbor(s) score: 0.9606365159128978
5 neighbor(s) score: 0.9597989949748744
10 neighbor(s) score: 0.9623115577889447
15 neighbor(s) score: 0.9581239530988275
20 neighbor(s) score: 0.9606365159128978
25 neighbor(s) score: 0.9606365159128978
30 neighbor(s) score: 0.9597989949748744
35 neighbor(s) score: 0.9597989949748744
40 neighbor(s) score: 0.9597989949748744
45 neighbor(s) score: 0.9606365159128978
50 neighbor(s) score: 0.9597989949748744


**OverSampling**

In [20]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

1 neighbor(s) score: 0.9463986599664992
5 neighbor(s) score: 0.9137353433835846
10 neighbor(s) score: 0.9003350083752094
15 neighbor(s) score: 0.8802345058626466
20 neighbor(s) score: 0.8793969849246231
25 neighbor(s) score: 0.871859296482412
30 neighbor(s) score: 0.8693467336683417
35 neighbor(s) score: 0.8609715242881072
40 neighbor(s) score: 0.864321608040201
45 neighbor(s) score: 0.8567839195979899
50 neighbor(s) score: 0.8559463986599665


**UnderSampling**

In [21]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainLR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValLR)

1 neighbor(s) score: 0.36515912897822445
5 neighbor(s) score: 0.39112227805695143
10 neighbor(s) score: 0.42797319932998323
15 neighbor(s) score: 0.43132328308207707
20 neighbor(s) score: 0.46901172529313234
25 neighbor(s) score: 0.4564489112227806
30 neighbor(s) score: 0.4966499162479062
35 neighbor(s) score: 0.4899497487437186
40 neighbor(s) score: 0.5100502512562815
45 neighbor(s) score: 0.518425460636516
50 neighbor(s) score: 0.5485762144053601


# Controls and Profitability Ratios

**Features**

In [22]:
xTrainPR = xTrain[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]
xValPR = xVal[:,[1, 2, 3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]]

**Unbalanced**

In [23]:
printScores(xTrainPR, yTrainDist, xValPR)

1 neighbor(s) score: 0.9380234505862647
5 neighbor(s) score: 0.9581239530988275
10 neighbor(s) score: 0.9564489112227805
15 neighbor(s) score: 0.9514237855946399
20 neighbor(s) score: 0.9556113902847572
25 neighbor(s) score: 0.9556113902847572
30 neighbor(s) score: 0.9556113902847572
35 neighbor(s) score: 0.9547738693467337
40 neighbor(s) score: 0.9539363484087102
45 neighbor(s) score: 0.9539363484087102
50 neighbor(s) score: 0.9539363484087102


**OverSampling**

In [24]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

1 neighbor(s) score: 0.9187604690117253
5 neighbor(s) score: 0.8877721943048577
10 neighbor(s) score: 0.890284757118928
15 neighbor(s) score: 0.873534338358459
20 neighbor(s) score: 0.8835845896147404
25 neighbor(s) score: 0.8710217755443886
30 neighbor(s) score: 0.871859296482412
35 neighbor(s) score: 0.8626465661641541
40 neighbor(s) score: 0.8685092127303182
45 neighbor(s) score: 0.8584589614740369
50 neighbor(s) score: 0.8659966499162479


**UnderSampling**

In [25]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPR, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPR)

1 neighbor(s) score: 0.3927973199329983
5 neighbor(s) score: 0.4204355108877722
10 neighbor(s) score: 0.550251256281407
15 neighbor(s) score: 0.5460636515912898
20 neighbor(s) score: 0.6256281407035176
25 neighbor(s) score: 0.6088777219430486
30 neighbor(s) score: 0.6323283082077052
35 neighbor(s) score: 0.6231155778894473
40 neighbor(s) score: 0.6566164154103853
45 neighbor(s) score: 0.6381909547738693
50 neighbor(s) score: 0.6649916247906198


# Controls and Profitability Ratios Booleans

In [26]:
xTrainPRB = xTrain[:,[1, 2, 3, 13, 14, 15, 16, 19]]
xValPRB = xVal[:,[1, 2, 3, 13, 14, 15, 16, 19]]

**Unbalanced**

In [27]:
printScores(xTrainPRB, yTrainDist, xValPRB)

1 neighbor(s) score: 0.9472361809045227
5 neighbor(s) score: 0.957286432160804
10 neighbor(s) score: 0.9581239530988275
15 neighbor(s) score: 0.9581239530988275
20 neighbor(s) score: 0.9564489112227805
25 neighbor(s) score: 0.9556113902847572
30 neighbor(s) score: 0.9547738693467337
35 neighbor(s) score: 0.9539363484087102
40 neighbor(s) score: 0.9539363484087102
45 neighbor(s) score: 0.9530988274706867
50 neighbor(s) score: 0.9522613065326633


**OverSampling**

In [28]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

1 neighbor(s) score: 0.9053601340033501
5 neighbor(s) score: 0.9103852596314908
10 neighbor(s) score: 0.9095477386934674
15 neighbor(s) score: 0.9020100502512562
20 neighbor(s) score: 0.9036850921273032
25 neighbor(s) score: 0.890284757118928
30 neighbor(s) score: 0.8936348408710217
35 neighbor(s) score: 0.8886097152428811
40 neighbor(s) score: 0.8852596314907872
45 neighbor(s) score: 0.8802345058626466
50 neighbor(s) score: 0.8802345058626466


**UnderSampling**

In [29]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRB)

1 neighbor(s) score: 0.29229480737018426
5 neighbor(s) score: 0.35678391959798994
10 neighbor(s) score: 0.4765494137353434
15 neighbor(s) score: 0.5125628140703518
20 neighbor(s) score: 0.5711892797319933
25 neighbor(s) score: 0.5820770519262981
30 neighbor(s) score: 0.5912897822445561
35 neighbor(s) score: 0.585427135678392
40 neighbor(s) score: 0.5946398659966499
45 neighbor(s) score: 0.5921273031825796
50 neighbor(s) score: 0.6063651591289783


# Controls and Profitability Ratios Non-Boolean

In [30]:
xTrainPRNB = xTrain[:,[1, 2, 3, 10, 11, 12, 17, 18]]
xValPRNB = xVal[:,[1, 2, 3, 10, 11, 12, 17, 18]]

**Unbalanced**

In [31]:
printScores(xTrainPRB, yTrainDist, xValPRB)

1 neighbor(s) score: 0.9472361809045227
5 neighbor(s) score: 0.957286432160804
10 neighbor(s) score: 0.9581239530988275
15 neighbor(s) score: 0.9581239530988275
20 neighbor(s) score: 0.9564489112227805
25 neighbor(s) score: 0.9556113902847572
30 neighbor(s) score: 0.9547738693467337
35 neighbor(s) score: 0.9539363484087102
40 neighbor(s) score: 0.9539363484087102
45 neighbor(s) score: 0.9530988274706867
50 neighbor(s) score: 0.9522613065326633


**OverSampling**

In [32]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

1 neighbor(s) score: 0.9045226130653267
5 neighbor(s) score: 0.8685092127303182
10 neighbor(s) score: 0.8651591289782244
15 neighbor(s) score: 0.847571189279732
20 neighbor(s) score: 0.8433835845896147
25 neighbor(s) score: 0.8274706867671692
30 neighbor(s) score: 0.8241206030150754
35 neighbor(s) score: 0.8107202680067002
40 neighbor(s) score: 0.8115577889447236
45 neighbor(s) score: 0.8023450586264657
50 neighbor(s) score: 0.8031825795644891


**UnderSampling**

In [33]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainPRNB, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValPRNB)

1 neighbor(s) score: 0.4556113902847571
5 neighbor(s) score: 0.5301507537688442
10 neighbor(s) score: 0.6365159128978225
15 neighbor(s) score: 0.7085427135678392
20 neighbor(s) score: 0.7788944723618091
25 neighbor(s) score: 0.7964824120603015
30 neighbor(s) score: 0.8467336683417085
35 neighbor(s) score: 0.8484087102177554
40 neighbor(s) score: 0.8760469011725294
45 neighbor(s) score: 0.8752093802345059
50 neighbor(s) score: 0.8886097152428811


# Constants and Capital Structure

**Features**

In [34]:
xTrainCS = xTrain[:,[1, 2, 3, 20, 21, 22]]
xValCS = xVal[:,[1, 2, 3, 20, 21, 22]]

**Unbalanced**

In [35]:
printScores(xTrainCS, yTrainDist, xValCS)

1 neighbor(s) score: 0.9681742043551089
5 neighbor(s) score: 0.9698492462311558
10 neighbor(s) score: 0.9631490787269682
15 neighbor(s) score: 0.9564489112227805
20 neighbor(s) score: 0.9564489112227805
25 neighbor(s) score: 0.9581239530988275
30 neighbor(s) score: 0.957286432160804
35 neighbor(s) score: 0.9564489112227805
40 neighbor(s) score: 0.9556113902847572
45 neighbor(s) score: 0.9547738693467337
50 neighbor(s) score: 0.9539363484087102


**OverSampling**

In [36]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

1 neighbor(s) score: 0.9371859296482412
5 neighbor(s) score: 0.9304857621440537
10 neighbor(s) score: 0.9338358458961474
15 neighbor(s) score: 0.9179229480737019
20 neighbor(s) score: 0.9154103852596315
25 neighbor(s) score: 0.9003350083752094
30 neighbor(s) score: 0.8969849246231156
35 neighbor(s) score: 0.890284757118928
40 neighbor(s) score: 0.890284757118928
45 neighbor(s) score: 0.8894472361809045
50 neighbor(s) score: 0.8911222780569514


**UnderSampling**

In [37]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainCS, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValCS)

1 neighbor(s) score: 0.5644891122278057
5 neighbor(s) score: 0.7713567839195979
10 neighbor(s) score: 0.8165829145728644
15 neighbor(s) score: 0.7814070351758794
20 neighbor(s) score: 0.8132328308207705
25 neighbor(s) score: 0.8207705192629816
30 neighbor(s) score: 0.7981574539363484
35 neighbor(s) score: 0.7663316582914573
40 neighbor(s) score: 0.7638190954773869
45 neighbor(s) score: 0.7546063651591289
50 neighbor(s) score: 0.7495812395309883


# New Tests 4/20/2020

**Features**

In [38]:
xTrainNT = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]
xValNT = xVal[:,[1,2,3, 4, 5, 6, 7, 8,9,26]]

**Unbalanced**

In [39]:
printScores(xTrainNT, yTrainDist, xValNT)

1 neighbor(s) score: 0.9530988274706867
5 neighbor(s) score: 0.9597989949748744
10 neighbor(s) score: 0.9639865996649917
15 neighbor(s) score: 0.964824120603015
20 neighbor(s) score: 0.9623115577889447
25 neighbor(s) score: 0.9631490787269682
30 neighbor(s) score: 0.9631490787269682
35 neighbor(s) score: 0.9639865996649917
40 neighbor(s) score: 0.9623115577889447
45 neighbor(s) score: 0.9623115577889447
50 neighbor(s) score: 0.9606365159128978


**OverSampling**

In [40]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

1 neighbor(s) score: 0.9380234505862647
5 neighbor(s) score: 0.9103852596314908
10 neighbor(s) score: 0.9061976549413735
15 neighbor(s) score: 0.8844221105527639
20 neighbor(s) score: 0.8819095477386935
25 neighbor(s) score: 0.8785594639865997
30 neighbor(s) score: 0.8743718592964824
35 neighbor(s) score: 0.8710217755443886
40 neighbor(s) score: 0.871859296482412
45 neighbor(s) score: 0.8659966499162479
50 neighbor(s) score: 0.8668341708542714


**UnderSampling**

In [41]:
xTrainBal, yTrainBal = nr.fit_sample(xTrainNT, yTrainDist.ravel())

printScores(xTrainBal, yTrainBal, xValNT)

1 neighbor(s) score: 0.5050251256281407
5 neighbor(s) score: 0.5703517587939698
10 neighbor(s) score: 0.6256281407035176
15 neighbor(s) score: 0.6113902847571189
20 neighbor(s) score: 0.6440536013400335
25 neighbor(s) score: 0.6381909547738693
30 neighbor(s) score: 0.695142378559464
35 neighbor(s) score: 0.6842546063651591
40 neighbor(s) score: 0.7537688442211056
45 neighbor(s) score: 0.7554438860971524
50 neighbor(s) score: 0.8618090452261307


# Best Performer Unbalanced

In [42]:
# xTrainFirsts = xTrain[:,[1,2,3, 4, 10, 20, 23, 27,30]]
# xTestFirsts = xTest[:,[1,2,3, 4, 10, 20, 23, 27,30]]

# knn = KNeighborsClassifier(n_neighbors=10)
# knn.fit(xTrainFirsts, yTrainDist)
# probs = knn.predict_proba(xTestFirsts)
# ids = ySampleIds.tolist()
# probs = probs.tolist()

# f = open("knnTesting.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()




In [43]:
# xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8,9]]
# xTestLR = xTest[:,[1,2,3, 4, 5, 6, 7, 8,9]]

# knn = KNeighborsClassifier(n_neighbors=30)
# knn.fit(xTrainLR, yTrainDist)
# probs = knn.predict_proba(xTestLR)
# ids = ySampleIds.tolist()
# probs = probs.tolist()

# f = open("knnTestingSub2.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [44]:
# xTrainLR = xTrain[:,[1, 2, 3, 4, 18]]
# xTestLR = xTest[:,[1, 2, 3, 4, 18]]

# knn = KNeighborsClassifier(n_neighbors=20)
# knn.fit(xTrainLR, yTrainDist)
# probs = knn.predict_proba(xTestLR)
# ids = ySampleIds.tolist()
# probs = probs.tolist()

# f = open("knnTestingTake4.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

In [45]:
# xTrainLR = xTrain[:,[1,2,3, 4, 5, 6, 7, 8, 9, 26]]
# xTestLR = xTest[:,[1,2,3, 4, 5, 6, 7, 8, 9, 26]]

# knn = KNeighborsClassifier(n_neighbors=30)
# knn.fit(xTrainLR, yTrainDist)
# probs = knn.predict_proba(xTestLR)
# ids = ySampleIds.tolist()
# probs = probs.tolist()

# f = open("knnTestingTake5.txt", "w+")
# f.write("Unique Id,DIST\n")

# for i in range(len(ids)):
#     f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
# f.close()

# Best OverSampling Performer

In [46]:
xTrainBal, yTrainBal = sm.fit_sample(xTrainFirsts, yTrainDist.ravel())

scale = StandardScaler().fit(xTrainBal)
xTrainScaled = scale.transform(xTrainBal)
xTestScaled = scale.transform(xTestFirsts)

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(xTrainScaled, yTrainBal)
probs = knn.predict_proba(xTestFirsts)
ids = xTestIds.tolist()
probs = probs.tolist()

f = open("knnTestingTake6.txt", "w+")
f.write("Unique Id,DIST\n")

for i in range(len(ids)):
    f.write(str(int(ids[i])) + "," + str(probs[i][1]) + "\n")
    
f.close()

(1194, 9)
