In [1]:
import numpy
from sklearn import svm
import gzip


def generate_features(data_ist):
    """Build one six-value feature row per record.

    Args:
        data_ist: iterable of mappings, each exposing 'speed' and
            'heart_rate' entries that numpy can reduce.

    Returns:
        A list with one inner list per record, ordered as
        [min speed, mean speed, max speed,
         min heart_rate, mean heart_rate, max heart_rate].
    """
    channels = ('speed', 'heart_rate')
    reducers = (numpy.amin, numpy.mean, numpy.amax)

    # Outer loop walks the records; the inner comprehension applies all
    # reducers to one channel before moving on to the next channel.
    return [
        [reducer(record[channel]) for channel in channels for reducer in reducers]
        for record in data_ist
    ]

def correctness(pred, real):
    """Return the fraction of positions where `pred` and `real` agree.

    Args:
        pred: indexable sequence of predicted labels; its length sets how
            many positions are compared.
        real: indexable sequence of reference labels, read at the same
            indices as `pred`.

    Returns:
        1 minus (number of mismatches / len(pred)).

    Raises:
        ZeroDivisionError: if `pred` is empty.
    """
    count = len(pred)
    # Index-based on purpose: only positions 0..len(pred)-1 are inspected.
    mismatches = sum(1 for idx in range(count) if pred[idx] != real[idx])
    return 1 - mismatches / count

In [10]:
print("Reading data...")

train_list = []
valid_list = []

# Each line of an archive is decoded as ASCII and evaluated as a Python
# expression; the result is stored as one record.
# SECURITY: eval() runs whatever code the file contains. Only use this on
# archives from a trusted source (ast.literal_eval is the safer choice if the
# records are plain literals).
# The `with` blocks close each archive even if a line fails to decode/parse.
with gzip.open('../RunningBikingTrainValidTest/training_set_running_and_biking.json.gz', 'rb') as tin:
    for l in tin:
        train_list.append(eval(l.decode('ascii')))

with gzip.open('../RunningBikingTrainValidTest/validation_set_running_and_biking.json.gz', 'rb') as vin:
    for l in vin:
        valid_list.append(eval(l.decode('ascii')))

print("done")

Reading data...
done


In [14]:
# Feature rows for the training and validation records.
X_train_list = generate_features(train_list)
X_valid_list = generate_features(valid_list)

# Boolean labels: True when the record's 'sport' equals 'run', False for any
# other value.
# Use == rather than `is`: `is` checks object identity, which only matches
# equal strings when the interpreter happens to intern them, and comparing
# against a literal with `is` raises a SyntaxWarning on Python 3.8+.
y_train_list = [b['sport'] == 'run' for b in train_list]
y_valid_list = [b['sport'] == 'run' for b in valid_list]

KeyboardInterrupt: 

In [12]:
print("Starting")
print("kernel is linear")
print("Features are Min, Avg and Max of speed and HR")

# Values of the SVC `C` parameter to try, one fit per value.
c_list = [
    0.001, 0.01, 0.1,
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
    100, 500, 1000, 1500, 2000,
]

# NOTE: despite the names, these lists collect the value returned by
# `correctness` (fraction of matching predictions), one entry per C.
train_errors = []
valid_errors = []
for c in c_list:
    print("Running for c = " + str(c))

    # Fit a fresh linear-kernel classifier on the training features.
    clf = svm.SVC(kernel='linear', C=c)
    clf.fit(X_train_list, y_train_list)

    # Score the same model on the data it was fitted on and on the
    # validation set.
    train_predictions = clf.predict(X_train_list)
    train_errors.append(correctness(train_predictions, y_train_list))

    valid_predictions = clf.predict(X_valid_list)
    valid_errors.append(correctness(valid_predictions, y_valid_list))

    # Report the scores just appended (train first, then validation).
    print("{:.6}\t{:.6}".format(train_errors[-1], valid_errors[-1]))

Starting
kernel is linear
Features are Min, Avg and Max of speed and HR
Running for c = 0.001
0.975048	0.974144
Running for c = 0.01
0.975333	0.974109
Running for c = 0.1
0.975378	0.974109
Running for c = 1
0.975258	0.974179
Running for c = 2
0.975243	0.974109
Running for c = 3
0.975078	0.974109
Running for c = 4
0.975183	0.974039
Running for c = 5
0.975183	0.974144
Running for c = 6
0.975168	0.974109
Running for c = 7
0.975108	0.974144
Running for c = 8
0.975048	0.974424
Running for c = 9
0.974868	0.974424
Running for c = 10
0.975183	0.974389
Running for c = 100
0.974673	0.974319
Running for c = 500
0.975033	0.974459
Running for c = 1000
0.974868	0.974459
Running for c = 1500
0.974898	0.974319
Running for c = 2000
0.974673	0.974354


In [13]:
print("Reading data...")
test_list = []
# Each line of the archive is decoded as ASCII and evaluated as a Python
# expression; the result is stored as one record.
# SECURITY: eval() runs whatever code the file contains. Only use this on
# archives from a trusted source (ast.literal_eval is the safer choice if the
# records are plain literals).
# The `with` block closes the archive even if a line fails to decode/parse.
with gzip.open('../RunningBikingTrainValidTest/test_set_running_and_biking.json.gz', 'rb') as testin:
    for l in testin:
        test_list.append(eval(l.decode('ascii')))
print("done")

Reading data...
done


In [15]:
X_test_list = generate_features(test_list)
# Boolean labels: True when the record's 'sport' equals 'run'.
# Use == rather than `is`: `is` checks object identity, which only matches
# equal strings when the interpreter happens to intern them, and comparing
# against a literal with `is` raises a SyntaxWarning on Python 3.8+.
y_test_list = [b['sport'] == 'run' for b in test_list]

print("Testing the running-biking classifier")
print("kernel is linear, C is 1000")
print("Features are Min, Avg and Max of speed and HR")
# Keep in sync with the "C is 1000" message printed above.
C_val = 1000

# Build the classifier and fit it on the training features/labels.
clf = svm.SVC(kernel='linear', C = C_val)
clf.fit(X_train_list, y_train_list)

# Predict on the test features and score the predictions. Despite its name,
# `test_errors` holds the value returned by `correctness` (fraction of
# predictions that match the labels).
test_predictions = clf.predict(X_test_list)
test_errors = correctness(test_predictions, y_test_list)
print(str(test_errors))

Testing the running-biking classifier
kernel is linear, C is 1000
Features are Min, Avg and Max of speed and HR
0.9733129703687377
