In [2]:
import numpy as np
import csv

In [3]:
def load_data():
    with open('./data/data_no_pandas.csv', 'r') as data_file:
        reader = csv.reader(data_file)
        matrix = [row for row in reader]
    return matrix
    
def load_features():
    with open('./data/mock_timings.csv', 'r') as data_file:
        reader = csv.reader(data_file)
        matrix = [row for row in reader]
    return matrix

def standardize(matrix):
    return (matrix - np.mean(matrix, axis=0)) / np.std(matrix, axis=0)

In [7]:
features_list = load_features()
#print(features_list[1:])
features = np.array(features_list[1:])
floats = features.astype(float)
training_features = floats[:, 0:9]

features_to_standardize = features[:, :4]

floats = features_to_standardize.astype(float)
standardized = standardize(floats)

print("Standard Deviation of each Row should be 1: ", np.std(standardized, axis=0))
print("Mean of each Row should be 0: ", np.round(np.mean(standardized, axis=0)))

print(np.shape(standardized))
timing_labels = np.argmin(training_features[:, 4:], axis=1)
print(np.shape(timing_labels))

X = standardized
y = timing_labels


Standard Deviation of each Row should be 1:  [1. 1. 1. 1.]
Mean of each Row should be 0:  [-0. -0. -0.  0.]
(1000, 4)
(1000,)


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2)

svc = svm.SVC(kernel="rbf", C=0.5, gamma=2)
forest = RandomForestClassifier(n_estimators=100)

svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)
print('SVM accuracy: ', np.mean(y_test == y_pred_svc))

forest.fit(X_train, y_train)
y_pred_forest = svc.predict(X_test)
print('Forest accuracy: ', np.mean(y_test == y_pred_forest))

cv = StratifiedKFold(n_splits=5, random_state=5, shuffle=True)

svc_accuracy = cross_val_score(svc, standardized, timing_labels, cv=cv, scoring='accuracy')
forest_accuracy = cross_val_score(forest, standardized, timing_labels, cv=cv, scoring='accuracy')

print(svc_accuracy)
print(forest_accuracy)


SVM accuracy:  0.7933333333333333
Forest accuracy:  0.7933333333333333
[0.795 0.76  0.77  0.8   0.795]
[0.775 0.755 0.76  0.795 0.775]


In [55]:
from sklearn.model_selection import GridSearchCV

param_grid = [
  {'C': [0.1, 0.5, 1, 3, 5, 7, 10, 25, 50, 100], 
   'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 1.5, 2, 3], 
   'kernel': ['rbf']},
]

classifier = GridSearchCV(svm.SVC(), param_grid)
classifier.fit(standardized, timing_labels)
print(classifier.best_estimator_)

SVC(C=0.5, gamma=2)
