In [1]:
# load data
import pandas as pd
train_data = pd.read_csv("../data/train.csv")
test_data = pd.read_csv("../data/test.csv")

In [2]:
# take target out of training set
Y = train_data['type']
train_data = train_data.drop(['type', 'id', 'color'], axis=1)
test_data = test_data.drop(['id', 'color'], axis=1)

In [3]:
#standardize values
train_data = (train_data - train_data.mean()) / train_data.std()
test_data = (test_data - test_data.mean()) / test_data.std()

In [9]:
from sklearn import linear_model
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

alphas = [0.1, 0.5, 0.7, 1.0, 1.2, 2, 5, 10, 20, 100]
regs = ["l1", "l2"]
#scores and param for OneVsRest
scores = []
param = []
#scores and param for OneVsOne
scores2 = []
param2 = []
for alpha in alphas:
    for reg in regs:
        #tuning for OneVsRest
        lm = OneVsRestClassifier(linear_model.LogisticRegression(penalty = reg, C = alpha, solver = 'liblinear'))
        scores.append(cross_val_score(lm, train_data, Y, scoring="accuracy", cv = 2).mean())
        param.append([alpha, reg])
        #tuning for OneVsOne
        lm2 = OneVsOneClassifier(linear_model.LogisticRegression(penalty = reg, C = alpha, solver = 'liblinear'))
        scores2.append(cross_val_score(lm2, train_data, Y, scoring="accuracy", cv = 2).mean())
        param2.append([alpha, reg])
OvRscores = pd.DataFrame({'parameter': param, 'score': scores})
OvOscores = pd.DataFrame({'parameter2': param2, 'score':scores2})

print('One vs Rest\n', OvRscores.sort_values(by = 'score', ascending = False))
print('One vs One\n', OvOscores.sort_values(by = 'score', ascending = False))

One vs Rest
     parameter     score
19  [100, l2]  0.741136
18  [100, l1]  0.741136
17   [20, l2]  0.738418
16   [20, l1]  0.738418
15   [10, l2]  0.738418
14   [10, l1]  0.738418
13    [5, l2]  0.738418
12    [5, l1]  0.738418
6   [1.0, l1]  0.735657
11    [2, l2]  0.732984
10    [2, l1]  0.732984
9   [1.2, l2]  0.732984
8   [1.2, l1]  0.732984
7   [1.0, l2]  0.730310
4   [0.7, l1]  0.730310
3   [0.5, l2]  0.730310
5   [0.7, l2]  0.727592
2   [0.5, l1]  0.727592
1   [0.1, l2]  0.716941
0   [0.1, l1]  0.711462
One vs One
    parameter2     score
7   [1.0, l2]  0.759983
1   [0.1, l2]  0.757309
9   [1.2, l2]  0.757266
5   [0.7, l2]  0.754636
11    [2, l2]  0.754592
3   [0.5, l2]  0.751962
10    [2, l1]  0.751918
6   [1.0, l1]  0.751918
13    [5, l2]  0.751918
15   [10, l2]  0.749244
8   [1.2, l1]  0.749201
18  [100, l1]  0.746571
17   [20, l2]  0.746571
16   [20, l1]  0.746571
19  [100, l2]  0.746571
14   [10, l1]  0.746571
12    [5, l1]  0.746571
4   [0.7, l1]  0.746571
2   [0.5, l1]  

In [10]:
ovr = OneVsRestClassifier(linear_model.LogisticRegression(penalty = "l2", C = 100, solver = 'liblinear'))
ovr_fitted = ovr.fit(train_data, Y)
res1 = ovr_fitted.predict(test_data)

In [12]:
ovo = OneVsOneClassifier(linear_model.LogisticRegression(penalty = "l2", C = 1, solver = 'liblinear'))
ovo_fitted = ovo.fit(train_data, Y)
res2 = ovo_fitted.predict(test_data)

In [27]:
# cross validation for PCA KNN
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

neigh = [1, 3, 7, 10]
Ks = [1, 2, 3]
scores = []
param = []
for K in Ks: 
    pca = PCA(n_components = K, svd_solver='arpack')
    pca.fit(train_data)
    pca_train = pca.transform(train_data)
    print("processed... K:", K)
    for n in neigh:
        knn = KNeighborsClassifier(n_neighbors = n)
        scores.append(cross_val_score(knn, train_data, Y, scoring="accuracy", cv = 10).mean())
        param.append([n, K])
        print("processed... n:", n)
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

processed... K: 1
processed... n: 1
processed... n: 3
processed... n: 7
processed... n: 10
processed... K: 2
processed... n: 1
processed... n: 3
processed... n: 7
processed... n: 10
processed... K: 3
processed... n: 1
processed... n: 3
processed... n: 7
processed... n: 10
   parameter     score
2     [7, 1]  0.733034
6     [7, 2]  0.733034
10    [7, 3]  0.733034
3    [10, 1]  0.723648
7    [10, 2]  0.723648
11   [10, 3]  0.723648
1     [3, 1]  0.692203
5     [3, 2]  0.692203
9     [3, 3]  0.692203
0     [1, 1]  0.654767
4     [1, 2]  0.654767
8     [1, 3]  0.654767


In [28]:
pca = PCA(n_components = 1, svd_solver='arpack')
pca.fit(train_data)
pca_train = pca.transform(train_data)
pca_test = pca.transform(test_data)
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(pca_train, Y)
res3 = knn.predict(pca_test)

In [23]:
import math

# RBF function
def rbf(data, centers, sigma):
    res = np.ndarray(shape = (len(data), len(centers)))
    i = 0
    for row in data:
        tmp = []
        for center in centers:
            tmp.append(math.exp(-1.0 * sum(np.square(row - center)) / (2 * sigma * sigma)))
        res[i, :] = tmp
        i = i + 1
    return res
# select k centers from data
def selectCenters(data, k):
    tmp = np.random.choice(len(data), k)
    return data[tmp,:]

In [25]:
# cross validation on RBF KNN
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

neigh = [1, 10, 20]
sigmas = [1.0, 2.0, 4.0]
Ks = [100, 400, 800, 1600]
scores = []
param = []
for K in Ks:
    centers = selectCenters(train_data.as_matrix(), K)
    print("processed... k:", K)
    for sigma in sigmas:
        rbfX = rbf(train_data.as_matrix(), centers, sigma)
        print("processed... sigma:", sigma)
        for n in neigh:
            knn = KNeighborsClassifier(n_neighbors = n)
            scores.append(cross_val_score(knn, train_data, Y, scoring="accuracy", cv = 10).mean())
            param.append([n, sigma, K])
            print("processed... n:", n)
scores = pd.DataFrame({'parameter': param, 'score': scores})
print(scores.sort_values(by = 'score', ascending = False))

  # This is added back by InteractiveShellApp.init_path()
  


processed... k: 100
processed... sigma: 1.0
processed... n: 1
processed... n: 10
processed... n: 20


  


processed... sigma: 2.0
processed... n: 1
processed... n: 10
processed... n: 20


  


processed... sigma: 4.0
processed... n: 1
processed... n: 10
processed... n: 20


  # This is added back by InteractiveShellApp.init_path()
  


processed... k: 400
processed... sigma: 1.0
processed... n: 1
processed... n: 10
processed... n: 20


  


processed... sigma: 2.0
processed... n: 1
processed... n: 10
processed... n: 20


  


processed... sigma: 4.0
processed... n: 1
processed... n: 10
processed... n: 20
processed... k: 800


  # This is added back by InteractiveShellApp.init_path()
  


processed... sigma: 1.0
processed... n: 1
processed... n: 10
processed... n: 20


  


processed... sigma: 2.0
processed... n: 1
processed... n: 10
processed... n: 20


  


processed... sigma: 4.0
processed... n: 1
processed... n: 10
processed... n: 20
processed... k: 1600


  # This is added back by InteractiveShellApp.init_path()
  


processed... sigma: 1.0
processed... n: 1
processed... n: 10
processed... n: 20


  


processed... sigma: 2.0
processed... n: 1
processed... n: 10
processed... n: 20


  


processed... sigma: 4.0
processed... n: 1
processed... n: 10
processed... n: 20
          parameter     score
13   [10, 2.0, 400]  0.723648
16   [10, 4.0, 400]  0.723648
25   [10, 4.0, 800]  0.723648
34  [10, 4.0, 1600]  0.723648
4    [10, 2.0, 100]  0.723648
28  [10, 1.0, 1600]  0.723648
7    [10, 4.0, 100]  0.723648
22   [10, 2.0, 800]  0.723648
10   [10, 1.0, 400]  0.723648
31  [10, 2.0, 1600]  0.723648
19   [10, 1.0, 800]  0.723648
1    [10, 1.0, 100]  0.723648
23   [20, 2.0, 800]  0.713640
29  [20, 1.0, 1600]  0.713640
20   [20, 1.0, 800]  0.713640
17   [20, 4.0, 400]  0.713640
35  [20, 4.0, 1600]  0.713640
14   [20, 2.0, 400]  0.713640
11   [20, 1.0, 400]  0.713640
32  [20, 2.0, 1600]  0.713640
8    [20, 4.0, 100]  0.713640
5    [20, 2.0, 100]  0.713640
2    [20, 1.0, 100]  0.713640
26   [20, 4.0, 800]  0.713640
30   [1, 2.0, 1600]  0.654767
27   [1, 1.0, 1600]  0.654767
33   [1, 4.0, 1600]  0.654767
0     [1, 1.0, 100]  0.654767
24    [1, 4.0, 800]  0.654767
21    [1, 2.0, 800] 

In [30]:
centers = selectCenters(train_data.as_matrix(), 400)
rbfX = rbf(train_data.as_matrix(), centers, 2.0)
rbfTest = rbf(test_data.as_matrix(), centers, 2.0)
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(rbfX, Y)
res4 = knn.predict(rbfTest)

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [31]:
# save predictions
sample_data = pd.read_csv("../data/sample_submission.csv")
sample_data['type'] = res4
sample_data.to_csv('../prediction4.csv', index = False)