In [1]:
import pandas as pd
import numpy as np
from io import StringIO
import os

In [2]:
def read_datafile(fn):
    with open(fn, 'r') as f:
        header_1 = f.readline()
        subject = header_1.split('.')[0][2:]
        header_2 = f.readline()
        header_3 = f.readline()
        header_4 = f.readline()
        stimulus = ' '.join(header_4.split(' ')[1:3])
        header_5 = f.readline()
        rest = f.read()
        if(len(rest)>0):
            data_trial = pd.read_csv(StringIO(rest), sep=' ', header=None)
            data_trial.columns = ['trial', 'sensor', 'sample', 'value']
            data_trial['subject'] = subject
            data_trial['stimulus'] = stimulus
        else:
            data_trial = None
    return data_trial

In [3]:
data_dir = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/data/'

In [6]:
%%prun
dat = pd.DataFrame(columns=['subject', 'stimulus', 'trial', 'sensor', 'sample', 'value'])
for fn in os.listdir(data_dir)[:100]:
    full_fn = os.path.join(data_dir, fn)
    if os.path.isfile(full_fn):
        try:
            data_trial = read_datafile(full_fn)
            if data_trial is not None:
                dat = dat.append(data_trial)
        except Exception as err:
            print(fn, err)

 

In [19]:
dat[['subject', 'trial']].drop_duplicates().shape

(2119, 2)

In [4]:
#%%prun
import time

Xes = []
sensors = []
labels = []
headers = []

t = time.time()
for fn in os.listdir(data_dir):
#for fn in np.random.choice(os.listdir(data_dir), 100):
    full_fn = os.path.join(data_dir, fn)
    if os.path.isfile(full_fn):
        try:
            data_trial = pd.read_csv(full_fn, sep=' ', header=None, comment='#')
            if data_trial is not None:
                data_trial.columns = ['trial', 'sensor', 'sample', 'value']
                pivoted = data_trial.pivot_table(index='sample', columns='sensor', values='value')
                Xes.append(pivoted.as_matrix())
                labels.append(fn[3])
                sensors.append(pivoted.columns)
                with open(full_fn, 'r') as f:
                    header = [f.readline() for i in range(5)]
                    headers.append(header)
        except Exception as err:
            print(fn, err)
print((time.time()-t), 's')

co2c1000367.rd.065 No columns to parse from file
co2c1000367.rd.089 No columns to parse from file
co2c1000367.rd.090 No columns to parse from file
co2c1000367.rd.105 No columns to parse from file
co2c1000367.rd.113 No columns to parse from file
co2c1000367.rd.114 No columns to parse from file
co2c1000367.rd.116 No columns to parse from file
co2c1000367.rd.117 No columns to parse from file
co2c1000367.rd.004 No columns to parse from file
co2c1000367.rd.005 No columns to parse from file
co2c1000367.rd.006 No columns to parse from file
co2c1000367.rd.023 No columns to parse from file
co2c1000367.rd.029 No columns to parse from file
co2c1000367.rd.037 No columns to parse from file
co2c1000367.rd.042 No columns to parse from file
co2c1000367.rd.053 No columns to parse from file
co2c1000367.rd.054 No columns to parse from file
318.12917041778564 s


In [12]:
stimuli = [h[3].split(',')[0].strip() for h in headers]
pd.Series(stimuli).value_counts()

# S1 obj            5477
# S2 match          2757
# S2 nomatch        2728
# S2 match err        60
# S2 nomatch err      35
dtype: int64

In [40]:
subjects = [h[0].split(' ')[1].strip() for h in headers]
trials = [h[3].split('trial ')[-1].strip() for h in headers]

In [41]:
metadata = pd.DataFrame({'subject': subjects, 'trial': trials, 'stimuli': stimuli})
print(metadata.shape)
metadata.head()

(11057, 3)


Unnamed: 0,stimuli,subject,trial
0,# S2 nomatch,co3c0000402.rd,13
1,# S1 obj,co3c0000402.rd,14
2,# S2 nomatch,co3c0000402.rd,15
3,# S1 obj,co3c0000402.rd,16
4,# S2 match,co3c0000402.rd,17


In [42]:
# The shape should be: (num_samples, num_timesteps, num_channels)
Xa = np.array(Xes)
Xa.shape

(11057, 256, 64)

In [43]:
pd.DataFrame(sensors).drop_duplicates()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,AF1,AF2,AF7,AF8,AFZ,C1,C2,C3,C4,C5,...,PO8,POZ,PZ,T7,T8,TP7,TP8,X,Y,nd


In [44]:
# How many subjects do we have for each label?
pd.Series(labels).value_counts()

a    7033
c    4024
dtype: int64

In [45]:
# filter for errors
no_error =  ~metadata.stimuli.str.contains('err')
sum(no_error)

10962

In [46]:
Xa_filtered = Xa[no_error]
Xa_filtered.shape

(10962, 256, 64)

In [53]:
# Do we have NaN values?
np.isnan(Xa_filtered).sum()

0

In [47]:
metadata_filtered = metadata[no_error]

In [64]:
preprocessed_path = '/media/sf_VBox_Shared/timeseries/UCI_EEG_alcoholic/preprocessed/'
np.save(os.path.join(preprocessed_path, 'X.npy'), arr=Xa_filtered)
metadata_filtered.to_csv(os.path.join(preprocessed_path, 'metadata.csv'), index=False)

In [56]:
# Create train and test set
n = Xa_filtered.shape[0]
n_train = int(0.8*n)
n_val = int(0.1*n)
n_test = n - n_train - n_val
print(n_train, n_val, n_test)

ind_perm = np.random.permutation(n)
ind_train = ind_perm[:n_train]
ind_val = ind_perm[n_train:n_train+n_val]
ind_test = ind_perm[-n_test:]

8769 1096 1097


In [65]:
np.save(os.path.join(preprocessed_path, 'X_train.npy'), arr=Xa_filtered[ind_train])
np.save(os.path.join(preprocessed_path, 'X_val.npy'), arr=Xa_filtered[ind_val])
np.save(os.path.join(preprocessed_path, 'X_test.npy'), arr=Xa_filtered[ind_test])

In [71]:
# make binary labels
y = np.zeros((len(labels), 2))
y[:, 0] = [1*(l=='a') for l in labels]
y[:, 1] = [1*(l=='c') for l in labels]
y_filtered = y[no_error]
y_filtered.shape

(10962, 2)

In [72]:
np.save(os.path.join(preprocessed_path, 'y.npy'), arr=y_filtered)
np.save(os.path.join(preprocessed_path, 'y_train.npy'), arr=y_filtered[ind_train])
np.save(os.path.join(preprocessed_path, 'y_val.npy'), arr=y_filtered[ind_val])
np.save(os.path.join(preprocessed_path, 'y_test.npy'), arr=y_filtered[ind_test])