## Imports and loading data

In [1]:
from collections import Counter

import numpy as np
import pandas as pd

from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [2]:
# Show NumPy floats with 4 decimal places in the cell outputs below.
np.set_printoptions(precision=4)

In [3]:
# Load the comma-separated inputs: training features, training labels,
# and the features of the samples to predict at the end of the notebook.
# NOTE(review): the training-features path had stray mis-encoded characters
# in it; it is restored to 'traindata.csv' to match the naming of the two
# sibling files -- confirm against the contents of the data/ directory.
X = np.genfromtxt('data/traindata.csv', delimiter=',')
y = np.genfromtxt('data/trainlabel.csv', delimiter=',')
X_val = np.genfromtxt('data/testdata.csv', delimiter=',')

## Standardizing

In [4]:
# Fit the standardizer on the training features only.
# NOTE(review): the scaler is fitted on all of X before nested_CV splits it,
# so each CV fold's held-out part has influenced the scaling statistics.
# Wrapping the scaler and the model in a Pipeline would avoid that -- confirm
# whether this matters for the reported scores.
scaler = preprocessing.StandardScaler().fit(X)

In [5]:
# Apply the scaling fitted on X to both the training and the test features.
X_std = scaler.transform(X)
X_val_std = scaler.transform(X_val)

## Nested cross validation for hyperparameter tuning and performance measurement

In [6]:
def nested_CV(model, model_param_grid, X, y):
    """Run nested stratified cross-validation for one estimator.

    The outer loop splits ``X``/``y`` into 5 stratified, shuffled folds
    (``random_state=0``). For each outer training part, a ``GridSearchCV``
    with its own 5-fold stratified, shuffled splitter (``random_state=0``)
    picks the parameter combination with the best accuracy. The tuned model
    is then scored on the outer held-out part.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``GridSearchCV`` as ``estimator``.
    model_param_grid : dict or list of dict
        Grid passed to ``GridSearchCV`` as ``param_grid``.
    X, y : array-like
        Features and labels; both are indexed with the integer index arrays
        produced by ``StratifiedKFold.split``.

    Returns
    -------
    scores : list of float
        Accuracy of each outer fold's tuned model on that fold's held-out
        part.
    test_scores : list of float
        ``GridSearchCV.best_score_`` for each outer fold. Despite the name,
        these are inner cross-validation scores, not held-out test scores.
    model_list : list
        ``GridSearchCV.best_estimator_`` for each outer fold.
    """
    outer_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    inner_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores, test_scores, model_list = [], [], []

    for train, test in outer_kfold.split(X, y):
        gs = GridSearchCV(
            estimator=model,
            param_grid=model_param_grid,
            scoring='accuracy',
            cv=inner_kfold,
            n_jobs=-1,
        )
        gs.fit(X[train], y[train])

        # With the default refit=True, GridSearchCV has already retrained
        # best_estimator_ on the whole of X[train] using the winning
        # parameters, so fitting it again here would only repeat that work.
        best_model = gs.best_estimator_
        test_scores.append(gs.best_score_)
        model_list.append(best_model)

        y_pred = best_model.predict(X[test])
        scores.append(accuracy_score(y_true=y[test], y_pred=y_pred))

    return scores, test_scores, model_list

### Logistic Regression

In [7]:
# Base logistic-regression estimator and the grid of C values handed to
# nested_CV for tuning.
lr_model = LogisticRegression(random_state=0)
lr_param_grid=[{'C': [0.1, 1, 10, 50, 100, 500]}]

In [8]:
# Outer-fold accuracies, best inner-CV accuracies, and the tuned model of
# each outer fold for logistic regression.
lr_scores, lr_test_scores, lr_models = nested_CV(lr_model, lr_param_grid, X_std, y)

In [9]:
# Mean and standard deviation of the outer-fold (held-out) accuracies.
np.array([np.mean(lr_scores), np.std(lr_scores)])

array([ 0.9223,  0.0136])

In [10]:
# Mean and standard deviation of the best inner-CV accuracies, one per outer fold.
np.array([np.mean(lr_test_scores), np.std(lr_test_scores)])

array([ 0.9235,  0.0034])

In [11]:
# Print the C value selected by the grid search in each outer fold.
for model in lr_models:
    print(model.C)

50
500
500
10
100


### Support Vector Machine

In [12]:
# Base SVM estimator and the grid over C and kernel type handed to nested_CV.
svm_model = SVC(random_state=0)
svm_param_grid=[{'C': [0.01, 0.1, 1, 10, 50, 100], 'kernel':['linear', 'rbf']}]

In [13]:
# Outer-fold accuracies, best inner-CV accuracies, and tuned models for the SVM.
# NOTE: the 'smv_' prefix is a misspelling of 'svm_'; it is kept because the
# following cells refer to these exact names.
svm_scores, smv_test_scores, smv_models = nested_CV(svm_model, svm_param_grid, X_std, y)

In [14]:
# Mean and standard deviation of the outer-fold (held-out) accuracies.
np.array([np.mean(svm_scores), np.std(svm_scores)])

array([ 0.9258,  0.011 ])

In [15]:
# Mean and standard deviation of the best inner-CV accuracies, one per outer fold.
np.array([np.mean(smv_test_scores), np.std(smv_test_scores)])

array([ 0.9276,  0.0029])

In [16]:
# Print the C value and kernel selected by the grid search in each outer fold.
for model in smv_models:
    print(model.C, model.kernel)

1 rbf
10 rbf
1 rbf
1 linear
1 rbf


### K-Nearest Neighbours

In [17]:
# Base k-nearest-neighbours estimator (Minkowski metric) and the grid over the
# neighbour count and the Minkowski exponent p handed to nested_CV.
knn_model = KNeighborsClassifier(metric='minkowski')
knn_param_grid= [{'n_neighbors': [1, 2, 3, 4, 5, 6], 'p': [1, 2]}]

In [18]:
# Outer-fold accuracies, best inner-CV accuracies, and tuned models for KNN.
knn_scores, knn_test_scores, knn_models = nested_CV(knn_model, knn_param_grid, X_std, y)

In [19]:
# Mean and standard deviation of the outer-fold (held-out) accuracies.
np.array([np.mean(knn_scores), np.std(knn_scores)])

array([ 0.9078,  0.01  ])

In [20]:
# Mean and standard deviation of the best inner-CV accuracies, one per outer fold.
np.array([np.mean(knn_test_scores), np.std(knn_test_scores)])

array([ 0.9026,  0.0047])

In [21]:
# Print the neighbour count and Minkowski exponent selected in each outer fold.
for model in knn_models:
    print(model.n_neighbors, model.p)

5 1
5 1
3 1
5 1
5 1


### Random Forest

In [22]:
# Base random-forest estimator and the grid over tree count and split
# criterion handed to nested_CV.
# NOTE(review): the forest uses n_jobs=-1 and nested_CV's GridSearchCV also
# uses n_jobs=-1, so both levels request all cores -- confirm that this
# nesting does not oversubscribe the machine.
rf_model = RandomForestClassifier(random_state=0, n_jobs=-1)
rf_param_grid = [{'n_estimators': [50, 150, 300, 500], 'criterion': ['gini', 'entropy']}]

In [23]:
# Outer-fold accuracies, best inner-CV accuracies, and tuned models for the
# random forest.
rf_scores, rf_test_scores, rf_models = nested_CV(rf_model, rf_param_grid, X_std, y)

In [24]:
# Mean and standard deviation of the outer-fold (held-out) accuracies.
np.array([np.mean(rf_scores), np.std(rf_scores)])

array([ 0.9484,  0.0065])

In [25]:
# Mean and standard deviation of the best inner-CV accuracies, one per outer fold.
np.array([np.mean(rf_test_scores), np.std(rf_test_scores)])

array([ 0.9484,  0.0023])

In [26]:
# Print the tree count and split criterion selected in each outer fold.
for model in rf_models:
    print(model.n_estimators, model.criterion)

50 gini
500 entropy
150 gini
50 entropy
300 entropy


### Naive Bayes

In [27]:
# Gaussian naive Bayes with an empty parameter grid: nothing is tuned, so the
# grid search inside nested_CV evaluates just the default configuration.
nb_model = GaussianNB()
nb_param_grid = [{}]

In [28]:
# Outer-fold accuracies, inner-CV accuracies, and fitted models for naive Bayes.
nb_scores, nb_test_scores, nb_models = nested_CV(nb_model, nb_param_grid, X_std, y)

In [29]:
# Mean and standard deviation of the outer-fold (held-out) accuracies.
np.array([np.mean(nb_scores), np.std(nb_scores)])

array([ 0.8335,  0.0104])

In [30]:
# Mean and standard deviation of the inner-CV accuracies, one per outer fold.
np.array([np.mean(nb_test_scores), np.std(nb_test_scores)])

array([ 0.8358,  0.0043])

## Predicting on test set

In [31]:
# Final estimators with fixed hyperparameters.
# NOTE(review): presumably picked from the per-fold winners printed in the
# nested-CV cells above -- confirm the selection rule.
chosen_lr = LogisticRegression(C=500, random_state=0)
chosen_svm = SVC(C=1, kernel='rbf', random_state=0)
chosen_knn = KNeighborsClassifier(n_neighbors=5, p=1, metric='minkowski')
chosen_rf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    random_state=0,
    n_jobs=-1,
)
chosen_nb = GaussianNB()

# Running tally with one slot per test sample; each estimator's predicted
# labels are added to it.
predictions = np.zeros(X_val_std.shape[0])

for chosen_model in (chosen_lr, chosen_svm, chosen_knn, chosen_rf, chosen_nb):
    # Train on the full standardized training set, then predict the test set.
    chosen_model.fit(X_std, y)
    y_test_pred = chosen_model.predict(X_val_std)
    predictions += y_test_pred

In [32]:
# Turn the summed predictions of the five estimators into a final label:
# sums below 3 become 0, sums of 3 or more become 1. The order matters --
# the low sums must be zeroed before the high sums are set to 1.
# NOTE: this cell is not idempotent. After one run every value is 0 or 1,
# so running it a second time would map everything to 0.
np.putmask(predictions, predictions < 3, 0)
np.putmask(predictions, predictions >= 3, 1)

In [33]:
# Count how many test samples received each final label.
Counter(predictions)

Counter({0.0: 863, 1.0: 517})

In [46]:
# Write the final labels to a text file, one integer-formatted value per line.
np.savetxt(fname='project1_20483569.csv', X=predictions, fmt='%d')