## PD_MIT-CS1PD and PD_MIT-CS2PD datasets

### Ground truth loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nqDataLoader as nq  #data loading library
from sklearn.model_selection import train_test_split
import h5py

### Load all the files

In [2]:
# Ground-truth table of the MIT-CS1PD study, indexed by patient ID ('pID').
cs1PdFr = pd.read_csv('MIT-CS1PD/GT_DataPD_MIT-CS1PD.csv').set_index('pID')
# Ground-truth table of the MIT-CS2PD study, indexed the same way.
cs2PdFr = pd.read_csv('MIT-CS2PD/GT_DataPD_MIT-CS2PD.csv').set_index('pID')
# To preview a table, end the cell with e.g. cs2PdFr.head()

In [3]:
# Ground-truth columns that hold the recording file names read by load_all.
filenames = ['file_1', 'file_2']
# Accumulator table: the loader functions add one row per patient to it.
# The loaders also store 'key', 'press' and 'release' values that are not
# listed in this initial column set.
patients = pd.DataFrame(columns=['id', 'hold', 'label'])
def _load_patient(record, ground_truth, data_dir, file_columns):
    """Load every recording of one patient and add it as a row of ``patients``.

    Parameters
    ----------
    record : pandas.Series
        One ground-truth row; ``record.name`` is the patient ID and
        ``record['gt']`` is stored as the label.
    ground_truth : pandas.DataFrame
        Ground-truth table indexed by patient ID, used to look up the
        recording file names.
    data_dir : str
        Path prefix (with trailing slash) prepended to each file name.
    file_columns : list of str
        Ground-truth columns that contain recording file names.

    Returns
    -------
    bool
        Always ``True``; the useful result is the row added to the global
        ``patients`` frame.
    """
    global patients
    name = record.name

    hold = np.empty(0)
    press = np.empty(0)
    release = np.empty(0)
    key_parts = []
    for column in file_columns:
        # NOTE(review): judging by the variable names the helper returns the
        # pressed keys, hold times, press timestamps and release timestamps --
        # confirm against nqDataLoader.
        keyPressed, htArr, pressArr, releaseArr = nq.getDataFiltHelper(
            data_dir + ground_truth.loc[name][column])
        # np.append flattens its inputs, so recordings are joined end to end.
        hold = np.append(hold, htArr)
        press = np.append(press, pressArr)
        release = np.append(release, releaseArr)
        key_parts.append(pd.Series(keyPressed))

    # Series.append / DataFrame.append were removed in pandas 2.0;
    # pd.concat is the supported replacement.
    keys = pd.concat(key_parts) if key_parts else pd.Series(dtype=object)
    row = pd.DataFrame([{'id': name, 'hold': hold, 'key': keys,
                         'label': record['gt'], 'press': press,
                         'release': release}])
    patients = pd.concat([patients, row], ignore_index=True)
    return True


def load_all(record):
    """Load the recordings of one MIT-CS1PD patient into ``patients``.

    Reads every file listed in the global ``filenames`` columns of
    ``cs1PdFr`` for the patient identified by ``record.name``.
    Returns ``True``.
    """
    return _load_patient(record, cs1PdFr, 'MIT-CS1PD/data_MIT-CS1PD/', filenames)


def load_all_d2(record):
    """Load the recording of one MIT-CS2PD patient into ``patients``.

    Only the 'file_1' column of ``cs2PdFr`` is read for this dataset.
    Returns ``True``.
    """
    return _load_patient(record, cs2PdFr, 'MIT-CS2PD/data_MIT-CS2PD/', ['file_1'])

def remove_quotes(row):
    """Return the strings of ``row`` as a list with every '"' character removed."""
    return [text.replace('"', '') for text in row]

    
# Run both loaders for their side effect of filling `patients`; the Series of
# True values that apply() returns carries no information and is discarded.
cs1PdFr.apply(load_all, axis=1)
cs2PdFr.apply(load_all_d2, axis=1)

patients = patients.set_index('id')

# The pressed-key names are not used by the rest of the notebook, so the
# column is dropped directly instead of first stripping quotes from values
# that would be thrown away. Apply remove_quotes here if the column is kept.
patients = patients.drop('key', axis=1)

  data = np.genfromtxt(fileIn, dtype=None, delimiter=',', skip_header=0)


### MLP Encoding

In [4]:
# Preview the first rows of the assembled per-patient table.
patients.head()

Unnamed: 0_level_0,hold,label,press,release
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11,"[0.1713, 0.1432, 0.0655, 0.1188, 0.0737, 0.065...",True,"[2.4568, 2.7748, 2.9139, 3.0805, 3.1564, 3.435...","[2.6281, 2.918, 2.9794, 3.1992, 3.2301, 3.5004..."
60,"[0.0895, 0.0954, 0.1738, 0.1078, 0.166, 0.1361...",False,"[2.6084, 2.8721, 3.9054, 5.2092, 5.893, 6.3525...","[2.6979, 2.9675, 4.0792, 5.317, 6.0589, 6.4886..."
67,"[0.1119, 0.1345, 0.3396, 0.1883, 0.1592, 0.134...",True,"[1.3361, 1.9241, 2.6734, 3.1709, 3.6996, 4.082...","[1.4481, 2.0585, 3.0129, 3.3591, 3.8588, 4.217..."
68,"[0.1194, 0.1337, 0.1847, 0.1499, 0.2137, 0.164...",False,"[1.8314, 2.4218, 2.8949, 3.2413, 3.9616, 6.467...","[1.9508, 2.5555, 3.0796, 3.3912, 4.1754, 6.631..."
70,"[0.1528, 0.208, 0.2084, 0.1539, 0.177, 0.2055,...",True,"[0.6356, 6.0042, 6.4526, 8.0744, 9.0555, 9.484...","[0.7884, 6.2122, 6.661, 8.2283, 9.2325, 9.6897..."


### Shift and subtract

In [5]:
def shift_series(arr):
    """Shift a 1-D sequence one position to the right, filling slot 0 with 0.

    Parameters
    ----------
    arr : array-like
        Numeric 1-D sequence.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``arr`` where element ``i`` holds
        ``arr[i - 1]`` and element 0 is 0. An empty input yields an empty
        array, so the result always lines up with the input.
    """
    values = np.asarray(arr, dtype=float)
    shifted = np.zeros(values.shape, dtype=float)
    # The last input value has no successor and falls off the end.
    shifted[1:] = values[:-1]
    return shifted

# For every patient, build the release timestamps delayed by one keystroke.
patients['shifted_release'] = patients['release'].apply(shift_series)

In [6]:
def subtract_series(record):
    """Return ``record['press'] - record['shifted_release']`` rounded to 4 decimals.

    ``record`` is any mapping-like row exposing the 'press' and
    'shifted_release' entries.
    """
    press_times = record['press']
    previous_release = record['shifted_release']
    return np.round(press_times - previous_release, decimals=4)

# Row by row: difference between each press time and the shifted release times.
patients['between'] = patients.apply(subtract_series, axis=1)

In [18]:
# Sanity check: show the hold times and the computed 'between' values of the
# first patient side by side. The 'between' column is displayed under the
# label 'latency'.
# NOTE(review): pd.Series() without an explicit dtype emits a warning on
# pandas 1.x -- consider passing dtype=object.
tmp = pd.Series()
tmp['hold'] = patients.iloc[0]['hold']
tmp['latency'] = patients.iloc[0]['between']
tmp

hold       [0.1713, 0.1432, 0.0655, 0.1188, 0.0737, 0.065...
latency    [2.4568, 0.1467, -0.0041, 0.1011, -0.0428, 0.2...
dtype: object

In [8]:
# Model inputs: the per-patient 'hold' and 'between' sequences.
X_train = patients.loc[:, ['hold', 'between']]
# Targets: the ground-truth label cast to integers.
Y_train = patients.loc[:, 'label'].astype(int)

In [9]:
# Stack each patient's two sequences into a single 2-row array ('data') and
# pair it with the label. The columns are named explicitly instead of relying
# on the implicit 'Unnamed 0' label pandas gives an unnamed Series, which made
# the former rename step a silent no-op whenever that label differed.
X_train = pd.DataFrame({
    'data': X_train.apply(lambda row: np.stack(row), axis=1),
    'label': Y_train,
})

In [10]:
# Shape of the first patient's stacked array.
X_train['data'].iloc[0].shape

(2, 4829)

### Chunk and concat

In [11]:
def chunk_one_patient(series, size, overlap):
    """Cut one patient's two sequences into fixed-size chunks.

    Parameters
    ----------
    series : pandas.Series or mapping
        Row with a 'data' entry (2-row array; each row is chunked
        independently) and a 'label' entry.
    size : int
        Number of data points per chunk.
    overlap : int
        Number of data points shared by consecutive chunks.

    Returns
    -------
    list
        ``[res, target]`` where ``res`` has shape ``(n_chunks, 2 * size)``
        (chunk ``i`` of row 0 followed by chunk ``i`` of row 1) and ``target``
        repeats the label once per chunk.
    """
    # reshape(-1, size) keeps the arrays 2-D even when a patient yields no
    # chunk at all; a bare np.array([]) is 1-D and would make the axis=1
    # concatenation below raise.
    first = np.array(chunk_array(series['data'][0], size, overlap)).reshape(-1, size)
    second = np.array(chunk_array(series['data'][1], size, overlap)).reshape(-1, size)
    res = np.concatenate([first, second], axis=1)
    target = np.full(res.shape[0], series['label'])
    return [res, target]

def chunk_array(array, size, overlap, min_fill=0.7):
    """Split ``array`` into overlapping chunks of exactly ``size`` points.

    Parameters
    ----------
    array : numpy.ndarray
        1-D sequence to split.
    size : int
        Number of data points per chunk.
    overlap : int
        Number of data points shared by consecutive chunks.
    min_fill : float, optional
        Minimum fraction of ``size`` a short chunk must contain to be kept
        (default 0.7). Kept short chunks are zero-padded to ``size``;
        shorter ones are discarded.

    Returns
    -------
    list of numpy.ndarray
        The chunks, each of length ``size``.
    """
    chunks = []
    for window in gen_split_overlap(array, size, overlap):
        n_points = window.shape[0]
        if n_points != size:
            # Built-in float replaces np.float, which NumPy 1.24 removed.
            if float(n_points) / float(size) < min_fill:
                continue
            window = np.concatenate((window, np.zeros((size - n_points,))))
        chunks.append(window)
    return chunks
    

def gen_split_overlap(seq, size, overlap):
    """Yield consecutive slices of ``seq`` of length ``size`` that overlap.

    Parameters
    ----------
    seq : sequence
        Any object supporting ``len`` and slicing.
    size : int
        Length of each slice; the final slices may be shorter.
    overlap : int
        Number of elements shared by consecutive slices; must be smaller
        than ``size`` so that the window advances.

    Raises
    ------
    ValueError
        If ``size < 1``, ``overlap < 0`` or ``overlap >= size``. As this is a
        generator, the error surfaces on the first iteration.
    """
    if size < 1 or overlap < 0:
        raise ValueError('size must be >= 1 and overlap >= 0')
    # A zero or negative step would either fail inside range() with an
    # unrelated message or walk backwards and yield nonsensical windows.
    if overlap >= size:
        raise ValueError('overlap must be smaller than size')

    step = size - overlap
    for start in range(0, len(seq) - overlap, step):
        yield seq[start: start + size]

In [12]:
chunk_size = 200  # number of data points in one chunk
# Overlap between chunks, given as a fraction from 0 to 1 of the chunk size
# and converted to a number of data points.
overlap = int(0.5 * chunk_size)

# Chunk every patient, then spread the [data, target] pairs over two columns.
res_df = (
    X_train
    .apply(lambda row: chunk_one_patient(row, chunk_size, overlap), axis=1)
    .apply(pd.Series)
    .rename(columns={0: "data", 1: "target"})
)

In [13]:
# Shape of the chunk matrix of the fifth patient: (number of chunks, 2 * chunk_size).
res_df['data'].iloc[4].shape

(11, 400)

### Save data

In [14]:
# h5f = h5py.File('train_enc_D.h5', 'w')

In [15]:
# h5f.create_group('train')
# h5f.create_dataset('train', data=X_train)
# h5f.create_dataset('target', data=Y_train)
# h5f.close()

In [16]:
# Persist the chunked dataset as JSON; the chunk size is part of the file name.
json_path = 'data_encoding_MLP_chunk{chunk}.json'.format(chunk=chunk_size)
res_df.to_json(json_path)