In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.linear_model import LogisticRegression
from glob import glob #finds all pathnames matching specified pattern
import os 

from sklearn.preprocessing import StandardScaler

In [3]:
#Read Data
def prepare_train_data(file_name):
    """Load one training series together with its event labels.

    Parameters
    ----------
    file_name : str
        Path to a ``*_data.csv`` file. The labels are read from the sibling
        file whose base name has its last ``_data`` replaced by ``_events``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The data frame and the labels frame, each with the ``id`` column
        removed.
    """
    # Rewrite only the base name, and only its last '_data' marker, so that a
    # directory such as 'eeg_data/' is not turned into 'eeg_events/'.
    folder_len = len(file_name) - len(os.path.basename(file_name))
    folder, base = file_name[:folder_len], file_name[folder_len:]
    events_fname = folder + '_events'.join(base.rsplit('_data', 1))

    data = pd.read_csv(file_name)
    labels = pd.read_csv(events_fname)
    return data.drop(columns=['id']), labels.drop(columns=['id'])

In [4]:
def prepare_test_data(file_name):
    """Load a test series CSV and hand it back as a DataFrame.

    No columns are removed here, so the ``id`` column is still present in
    the returned frame.
    """
    return pd.read_csv(file_name)

In [5]:
def preprocess_training_data(X):
    """Fit the module-level ``scaler`` on ``X`` and return the transformed data.

    Any training-time preprocessing beyond standardizing should be added here.
    """
    # fit_transform learns the scaling statistics from X and applies them in one call.
    return scaler.fit_transform(X)

In [6]:
def preprocess_testing_data(X):
    """Transform ``X`` with the module-level ``scaler`` without refitting it.

    Any additional test-time preprocessing should be added here.
    """
    # transform only: the statistics come from whatever the scaler was fitted on earlier.
    return scaler.transform(X)

In [7]:
# Subject numbers to process; range(1, 2) yields subject 1 only.
subjects = range(1, 2)
# Accumulators appended to by later cells: test-row ids and predictions.
ids_total, predicted_total = [], []

In [8]:
# Downsampling stride for the training data: only every `subsample`-th row is used for fitting.
subsample = 100


In [9]:
# Standardize features by removing the mean and scaling to unit variance.
scaler = StandardScaler() # only instantiated here; it is fitted later by preprocess_training_data

In [29]:
# Names of the event label columns, in the order used for the prediction columns.
labels = [
    'HandStart',
    'FirstDigitTouch',
    'BothStartLoadPhase',
    'LiftOff',
    'Replace',
    'BothReleased',
]

In [17]:
# For each subject, load every training series (data + event labels) and every
# test series found on disk, then stack them into X / Y / X_test.
# NOTE: X, Y and X_test are rebound on every iteration, so after the loop they
# hold only the last subject's data. ids_total is appended to, so re-running
# this cell without resetting it adds the ids a second time.
for subject in subjects:
    # Read training data. Sorting makes the row order (and therefore any strided
    # subsampling done later) independent of the order glob happens to return.
    train_fnames = sorted(glob('./data/train/subj%d_series*_data.csv' % (subject)))
    if not train_fnames:
        raise IOError('No training series found for subject %d' % subject)

    raw_data = []
    raw_labels = []
    for fname in train_fnames:
        # Keep the per-series labels under their own name: unpacking into
        # `labels` would overwrite the module-level list of label column names
        # that later cells index with labels[i] and len(labels).
        series_data, series_labels = prepare_train_data(fname)
        raw_data.append(series_data)
        raw_labels.append(series_labels)

    X = pd.concat(raw_data)
    Y = pd.concat(raw_labels)

    # Read test data, keeping the ids aside so predictions can be matched to rows.
    test_fnames = sorted(glob('./data/test/subj%d_series*_data.csv' % (subject)))
    if not test_fnames:
        raise IOError('No test series found for subject %d' % subject)

    test = []
    idx = []
    for fname in test_fnames:
        series_data = prepare_test_data(fname)
        test.append(series_data)
        idx.append(np.array(series_data['id']))

    ids = np.concatenate(idx)
    ids_total.append(ids)
    # The id column is not a feature, so drop it from the test matrix.
    X_test = pd.concat(test).drop(['id'], axis=1)
        

In [18]:
# Cast each table to float and convert it to a numpy array.
X_train, Y_train, X_test = [
    np.asarray(table.astype(float)) for table in (X, Y, X_test)
]


In [19]:
print(X_train.shape)
# Per the original note: the 1422392 rows are the 30 trials for the 8 series of one subject,
# sampled at 500 Hz (500 samples per second), i.e. one sample every 2 ms.

(1422392, 32)


In [20]:
# Sanity check: expect the same number of rows as X_train and one column per event label.
print(Y_train.shape)

(1422392, 6)


In [21]:
# Classifier instance with default settings; it is (re)fitted once per label column in the training loop below.
logReg = LogisticRegression()

In [23]:
# Fresh scaler: fitted on the training array, then applied unchanged to the test array.
scaler = StandardScaler()
# Uninitialised buffer with one row per test sample and one column per label;
# the training loop fills it column by column.
predicted = np.empty(shape=(X_test.shape[0], len(labels)))
X_train = preprocess_training_data(X_train)
X_test = preprocess_testing_data(X_test)


In [34]:
# Fit one classifier per event label and store column 1 of predict_proba
# (the probability of the second class) for every test row.
# NOTE: X_train / Y_train / X_test are not re-selected per subject here; they
# hold whatever the loading cell left behind.
for subject in subjects:
    for i, label in enumerate(labels):
        y_train = Y_train[:, i]
        print('Train subject %d, class %s' % (subject, label))
        # Fit on every `subsample`-th training row only.
        logReg.fit(X_train[::subsample, :], y_train[::subsample])
        predicted[:, i] = logReg.predict_proba(X_test)[:, 1]

    # Append once per subject, after every label column has been filled, so the
    # stacked predictions line up row-for-row with ids_total. Append a copy
    # because `predicted` is a single buffer that is overwritten in place.
    predicted_total.append(predicted.copy())
    

Train subject 1, class HandStart
Train subject 1, class FirstDigitTouch
Train subject 1, class BothStartLoadPhase
Train subject 1, class LiftOff
Train subject 1, class Replace
Train subject 1, class BothReleased


In [None]:
# Stack the accumulated prediction arrays row-wise into one 2-D table for inspection.
# Passing the raw list of 2-D arrays to pd.DataFrame would hand it 3-D input.
temp = pd.DataFrame(np.concatenate(predicted_total), columns=labels)

In [None]:
# Show only the first rows; the full table has one row per test sample.
temp.head()

In [None]:
# Output file
output = 'exploration.csv'
# One row per test sample id, one probability column per event label.
submission = pd.DataFrame(
    data=np.concatenate(predicted_total),
    index=np.concatenate(ids_total),
    columns=labels,
)

# Write the file: the index goes out as the 'id' column, floats with 3 decimals.
submission.to_csv(output, index_label='id', float_format='%.3f')