# Hyperdimensional Computing on Speech Recognition

In [1]:
%matplotlib inline
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import time
import HD_classifier
from dataloader import *
# reload(HD_classifier)



## UCI Arabic Digit

In [7]:
# Experiment settings for the UCI Arabic digit data.
data_dir = 'data/UCI/'
sample_per_class = 1000   # forwarded to load_uci as its third argument
num_level = 200           # number of equal-width quantisation intervals
dimension = 10000         # handed to the HDClassifier constructor

from sklearn.linear_model import LogisticRegression as LR
from xgboost import XGBClassifier

# Read the train and test splits; 660 / 220 are passed through to load_uci
# unchanged (presumably per-class counts in each file -- confirm in dataloader).
train_path = data_dir + 'Train_Arabic_Digit.txt'
test_path = data_dir + 'Test_Arabic_Digit.txt'
X_train, y_train = load_uci(train_path, 660, sample_per_class)
X_test, y_test = load_uci(test_path, 220, sample_per_class)

# Reference model: XGBoost with default settings on the raw features.
report = 'training: {}, testing: {}'
clf = XGBClassifier().fit(X_train, y_train)
print('XGBoost baseline:')
print(report.format(clf.score(X_train, y_train), clf.score(X_test, y_test)))

# Quantise every feature into integer bin indices using num_level + 1 evenly
# spaced edges on [-15, 10]. np.digitize sends values below the first edge to 0
# and values at or above the last edge to num_level + 1.
intervals = np.linspace(-15, 10, num_level + 1)
disc_train, disc_test = np.digitize(X_train, intervals), np.digitize(X_test, intervals)

# Fit the HD classifier on the quantised data and report accuracy on both splits.
hdc = HD_classifier.HDClassifier(dimension).fit(disc_train, y_train, num_level)
preds_train = hdc.predict(disc_train)
preds_test = hdc.predict(disc_test)
print('HDC:')
print(report.format((y_train == preds_train).mean(), (y_test == preds_test).mean()))

XGBoost baseline:
training: 0.6582, testing: 0.2955
HDC:
training: 0.3877, testing: 0.2095


## ISOLET

In [2]:
# Experiment settings for the ISOLET data.
data_dir = 'data/ISOLET/'
sample_per_class = 1000   # assigned here but not read anywhere in this cell
num_level = 50            # number of equal-width quantisation intervals
dimension = 10000         # handed to the HDClassifier constructor

from xgboost import XGBClassifier

# isolet1+2+3+4 is used for training, isolet5 for testing.
train_path = data_dir + 'isolet1+2+3+4.data'
test_path = data_dir + 'isolet5.data'
X_train, y_train = load_isolet(train_path)
X_test, y_test = load_isolet(test_path)

# Reference model: XGBoost with default settings on the raw features.
report = 'training: {}, testing: {}'
clf = XGBClassifier().fit(X_train, y_train)
print('XGBoost baseline:')
print(report.format(clf.score(X_train, y_train), clf.score(X_test, y_test)))

# Quantise every feature into integer bin indices using num_level + 1 evenly
# spaced edges on [-15, 10]. np.digitize sends values below the first edge to 0
# and values at or above the last edge to num_level + 1.
# NOTE(review): this is the same [-15, 10] range the UCI cell uses. The UCI
# repository describes ISOLET attributes as scaled to [-1, 1]; if load_isolet
# returns them unchanged, only a handful of the 50 bins are ever hit -- confirm
# the feature range and adjust the edges if so.
intervals = np.linspace(-15, 10, num_level + 1)
disc_train, disc_test = np.digitize(X_train, intervals), np.digitize(X_test, intervals)

# Fit the HD classifier on the quantised data and report accuracy on both splits.
hdc = HD_classifier.HDClassifier(dimension).fit(disc_train, y_train, num_level)
preds_train = hdc.predict(disc_train)
preds_test = hdc.predict(disc_test)
print('HDC:')
print(report.format((y_train == preds_train).mean(), (y_test == preds_test).mean()))

KeyboardInterrupt: 

In [None]:
# CV: sweep samples-per-class, discretisation levels and hypervector dimension
# on the UCI Arabic digit data, printing one result row per configuration.
#
# The UCI directory is spelled out here rather than read from the shared
# `data_dir` global: the ISOLET cell re-points `data_dir` at 'data/ISOLET/', so
# running the notebook top to bottom would otherwise look for the UCI files in
# the wrong directory.
uci_data_dir = 'data/UCI/'

print('sample per class | discrete levels | hyperdimension | training accuracy | testing accuracy | elapsed (s)')
for sample_per_class in [800]:
    # Reload the data for each sample size (660 / 220 are forwarded to load_uci as-is).
    X_train, y_train = load_uci(uci_data_dir + 'Train_Arabic_Digit.txt', 660, sample_per_class)
    X_test, y_test = load_uci(uci_data_dir + 'Test_Arabic_Digit.txt', 220, sample_per_class)
    for num_level in [200]:
        # Re-quantise whenever the number of levels changes: num_level + 1
        # evenly spaced edges on [-15, 10], features replaced by bin indices.
        intervals = np.linspace(-15, 10, num_level + 1)
        disc_train = np.digitize(X_train, intervals)
        disc_test = np.digitize(X_test, intervals)
        for dimension in [10000]:
            # Time training plus prediction on both splits for this configuration.
            t = time.time()
            hdc = HD_classifier.HDClassifier(dimension).fit(disc_train, y_train, num_level)
            preds_train = hdc.predict(disc_train)
            preds_test = hdc.predict(disc_test)
            t = time.time() - t
            # The template has six fields; the elapsed time fills the last one.
            # With only five values str.format raises IndexError and no row
            # is ever printed.
            print('{:<18} {:<17} {:<16} {:<19} {:<18} {}'.format(sample_per_class,
                                                              num_level,
                                                              dimension,
                                                              (y_train == preds_train).mean(),
                                                              (y_test == preds_test).mean(),
                                                              t))