In [2]:
import pandas as pd
import torch
import onlinehd
from time import time
import sklearn.preprocessing

In [3]:
# Read the feature names (second whitespace-separated token of every line) and
# make them unique so they can be used as distinct column names when the
# feature matrices are loaded with read_csv(names=features).
with open('UCI HAR Dataset/features.txt') as names_file:
    # Blank lines are skipped instead of raising IndexError on split()[1].
    raw_names = [line.split()[1] for line in names_file if line.strip()]

features = []
seen = set()  # set membership keeps the de-duplication linear
for name in raw_names:
    # Append underscores until the name no longer collides with an earlier one.
    while name in seen:
        name += '_'
    seen.add(name)
    features.append(name)

In [4]:
# Training split: whitespace-separated feature matrix plus a label file that is
# expected to contain a single column.
# sep=r'\s+' replaces the deprecated delim_whitespace=True, and
# .squeeze('columns') replaces the read_csv(squeeze=True) keyword that was
# removed in pandas 2.0; both spellings also work on older pandas.
# NOTE(review): the stock UCI HAR archive appears to ship this label file as
# 'y_train.txt' (lowercase y) -- confirm the name on case-sensitive filesystems.
X_train = pd.read_csv('UCI HAR Dataset/train/X_train.txt', sep=r'\s+', header=None, names=features)
Y_train = pd.read_csv('UCI HAR Dataset/train/Y_train.txt', header=None).squeeze('columns')

In [5]:
# Test split: whitespace-separated feature matrix plus a label file that is
# expected to contain a single column.
# sep=r'\s+' replaces the deprecated delim_whitespace=True, and
# .squeeze('columns') replaces the read_csv(squeeze=True) keyword that was
# removed in pandas 2.0; both spellings also work on older pandas.
# NOTE(review): the stock UCI HAR archive appears to ship this label file as
# 'y_test.txt' (lowercase y) -- confirm the name on case-sensitive filesystems.
X_test = pd.read_csv('UCI HAR Dataset/test/X_test.txt', sep=r'\s+', header=None, names=features)
Y_test = pd.read_csv('UCI HAR Dataset/test/Y_test.txt', header=None).squeeze('columns')

In [6]:
# Apply scikit-learn's Normalizer to both splits. Per the scikit-learn docs the
# Normalizer rescales each sample (row) on its own, so fitting on the training
# data does not leak or learn any statistics; the fit call only sets it up.
scaler = sklearn.preprocessing.Normalizer()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# Feature matrices (numpy arrays at this point) become float tensors.
X_train, X_test = (
    torch.from_numpy(matrix).float() for matrix in (X_train, X_test)
)
# Labels become long tensors shifted down by one -- presumably the label files
# are 1-based and the model expects ids starting at 0; verify against the data.
Y_train, Y_test = (
    torch.from_numpy(labels.to_numpy()).long() - 1 for labels in (Y_train, Y_test)
)

In [8]:
# Number of distinct label values present in the training labels.
classes = len(torch.unique(Y_train))
# NOTE: this rebinds `features` (previously the list of column names) to the
# number of feature columns; learn_and_test reads this integer as a global.
features = X_train.shape[1]

In [9]:
# Quick sanity summary of the tensors produced above.
for label, value in (
    ('Training Dataset Shape =', tuple(X_train.shape)),
    ('Testing Dataset Shape =', tuple(X_test.shape)),
    ('Number of Classes =', classes),
):
    print(label, value)

Training Dataset Shape = (7352, 561)
Testing Dataset Shape = (2947, 561)
Number of Classes = 6


In [10]:
# Results table: learn_and_test adds one row per hyper-parameter combination.
out = pd.DataFrame(
    columns=[
        'learning rate',
        'epochs',
        'dimension',
        'bootstrap',
        'testing accuracy',
        'training time',
    ]
)

In [11]:
def learn_and_test(X_train, Y_train, X_test, Y_test, dim, lr, epoch, bs):
    """Train one OnlineHD model, evaluate it, and record the result.

    Reads the module-level ``classes`` and ``features`` to construct the model
    and adds one row to the module-level ``out`` DataFrame. Runs on CUDA when
    ``torch.cuda.is_available()`` is true, otherwise on the tensors' current
    device.

    Args:
        X_train: training inputs (torch tensor).
        Y_train: training labels (torch tensor).
        X_test: held-out inputs (torch tensor).
        Y_test: held-out labels (torch tensor).
        dim: forwarded as ``dim`` to ``onlinehd.OnlineHD``.
        lr: forwarded as ``lr`` to ``fit``.
        epoch: forwarded as ``epochs`` to ``fit``.
        bs: forwarded as ``bootstrap`` to ``fit``.

    Returns:
        None. Prints the training accuracy, testing accuracy and the elapsed
        encode-plus-fit time in seconds.
    """
    model = onlinehd.OnlineHD(classes, features, dim=dim)

    if torch.cuda.is_available():
        X_train = X_train.cuda()
        Y_train = Y_train.cuda()
        X_test = X_test.cuda()
        Y_test = Y_test.cuda()
        model = model.to('cuda')

    print('Training...')
    t = time()
    encoded = model.encode(X_train)
    # Dropout-style mask over the encoded samples, rescaled by 1 / keep-prob.
    # With probs = 0.0 every element is kept and the scale is 1, so this is
    # currently a no-op left in place as an experiment knob.
    probs = 0.0
    binomial = torch.distributions.binomial.Binomial(probs=1-probs)
    # Move the mask to wherever `encoded` lives; an unconditional .cuda() here
    # would crash on machines without a GPU.
    mask = binomial.sample(encoded.shape).to(encoded.device)
    encoded = encoded * mask * (1.0 / (1 - probs))
    model = model.fit(encoded, Y_train, encoded = True, bootstrap=bs, lr=lr, epochs=epoch, batch_size=1024)
    t = time() - t

    print('Validating...')
    yhat = model(X_train)
    yhat_test = model(X_test)
    acc = (Y_train == yhat).float().mean()
    acc_test = (Y_test == yhat_test).float().mean()

    # DataFrame.append was removed in pandas 2.0; enlarging with .loc works on
    # old and new versions alike, and the Series index aligns values by column name.
    row = pd.Series(
        [lr, epoch, dim, bs, acc_test.cpu().item(), t],
        index=['learning rate', 'epochs', 'dimension', 'bootstrap', 'testing accuracy', 'training time'],
    )
    out.loc[len(out)] = row

    print(f'{acc = :6f}')
    print(f'{acc_test = :6f}')
    print(f'{t = :6f}')


In [12]:
# Exhaustive sweep: every (learning rate, epochs, dimension, bootstrap)
# combination, enumerated in the same nesting order as four nested loops
# (bootstrap varies fastest, learning rate slowest).
grid = [
    (lr, epoch, dim, bs)
    for lr in [0.2, 0.3, 0.4, 0.5]
    for epoch in [20, 40, 60]
    for dim in [5000, 7500, 10000]
    for bs in [0.25, 0.5]
]

for lr, epoch, dim, bs in grid:
    print('===============================')
    print(lr, epoch, dim, bs)
    learn_and_test(X_train, Y_train, X_test, Y_test, dim, lr, epoch, bs)

0.2 20 5000 0.25
Training...
Validating...
acc = 0.968716
acc_test = 0.916186
t = 5.376006
0.2 20 5000 0.5
Training...
Validating...
acc = 0.951714
acc_test = 0.908721
t = 3.378966
0.2 20 7500 0.25
Training...
Validating...
acc = 0.968580
acc_test = 0.920258
t = 4.669000
0.2 20 7500 0.5
Training...
Validating...
acc = 0.949401
acc_test = 0.902952
t = 3.683999
0.2 20 10000 0.25
Training...
Validating...
acc = 0.970076
acc_test = 0.917543
t = 4.863031
0.2 20 10000 0.5
Training...
Validating...
acc = 0.951442
acc_test = 0.907703
t = 3.843999
0.2 40 5000 0.25
Training...
Validating...
acc = 0.980005
acc_test = 0.929420
t = 5.567997
0.2 40 5000 0.5
Training...
Validating...
acc = 0.969804
acc_test = 0.922633
t = 4.535000
0.2 40 7500 0.25
Training...
Validating...
acc = 0.979733
acc_test = 0.928062
t = 5.745001
0.2 40 7500 0.5
Training...
Validating...
acc = 0.970348
acc_test = 0.921615
t = 4.735031
0.2 40 10000 0.25
Training...
Validating...
acc = 0.981638
acc_test = 0.927384
t = 5.967995
0

In [13]:
# Highest testing accuracy among all rows recorded in `out`.
out.loc[:, 'testing accuracy'].max()

0.9368849992752075

In [14]:
# Persist the sweep results without the row index. `index` is documented as a
# bool, so pass False explicitly rather than relying on None being falsy.
out.to_csv('UCI_HAR_result.csv', index=False)