## PD_MIT-CS1PD dataset

### Ground truth loading

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nqDataLoader as nq  #data loading library
from sklearn.model_selection import train_test_split
import h5py

### Load all the files

In [49]:
# load ground Ground Truth 
cs1PdFr = pd.read_csv( 'MIT-CS1PD/GT_DataPD_MIT-CS1PD.csv' )
# set Patient ID as index
cs1PdFr = cs1PdFr.set_index('pID')
# load ground Ground Truth 
cs2PdFr = pd.read_csv( 'MIT-CS2PD/GT_DataPD_MIT-CS2PD.csv' )
# set Patient ID as index
cs2PdFr = cs2PdFr.set_index('pID')
# show part of Data Frame
# cs2PdFr.head()

In [50]:
filenames = ['file_1', 'file_2']
patients = pd.DataFrame(columns=['id', 'data', 'label'])
def load_all(record):
    global filenames, patients
    name = record.name
    
    arr = np.empty(shape=(0, 0))
    keyp = pd.Series()
    for filename in filenames:
        keyPressed, htArr, pressArr, releaseArr = \
                nq.getDataFiltHelper( 'MIT-CS1PD/data_MIT-CS1PD/' + cs1PdFr.loc[name][filename])
        arr = np.append(arr, htArr)
        keyp = keyp.append(pd.Series(keyPressed))
        
    patients = patients.append({'id': name, 'data': arr, 'key': keyp, 'label': record['gt']}, ignore_index=True)
    return True

def load_all_d2(record):
    global patients
    filenames = ['file_1']
    name = record.name
    
    arr = np.empty(shape=(0, 0))
    keyp = pd.Series()
    for filename in filenames:
        keyPressed, htArr, pressArr, releaseArr = \
                nq.getDataFiltHelper( 'MIT-CS2PD/data_MIT-CS2PD/' + cs2PdFr.loc[name][filename])
        arr = np.append(arr, htArr)
        keyp = keyp.append(pd.Series(keyPressed))
        
    patients = patients.append({'id': name, 'data': arr, 'key': keyp, 'label': record['gt']}, ignore_index=True)
    return True

def remove_quotes(row):    
    return list(row.apply(lambda x: x.replace('"', '')))

    
tmp = cs1PdFr.apply(lambda x: load_all(x), axis=1)
tmp = cs2PdFr.apply(lambda x: load_all_d2(x), axis=1)

patients.set_index('id', inplace=True)

patients['key'] = patients['key'].apply(lambda x: remove_quotes(x))

### Map key to row

In [51]:
keyss = set()

def add_to_set(row):
    global keyss
    keyss.update(row)
tmp = patients['key'].apply(lambda x: add_to_set(x))
# patients.head()


In [52]:
keyss

{'',
 '0',
 '1',
 '2',
 '3',
 '4',
 '6',
 '9',
 'A',
 'B',
 'C',
 'Caps_Lock',
 'D',
 'Delete',
 'Down',
 'E',
 'End',
 'Escape',
 'F',
 'G',
 'H',
 'I',
 'Insert',
 'J',
 'L',
 'Left',
 'M',
 'Menu',
 'N',
 'Num_Lock',
 'O',
 'P',
 'P_Add',
 'P_Down',
 'P_End',
 'P_Enter',
 'P_Home',
 'P_Insert',
 'P_Left',
 'P_Next',
 'P_Page_Up',
 'P_Subtract',
 'Q',
 'R',
 'Return',
 'Right',
 'S',
 'Super_L',
 'T',
 'Tab',
 'U',
 'Up',
 'V',
 'W',
 'Y',
 'Z',
 '[269025200]',
 '[269025201]',
 '[65027]',
 '[65104]',
 '[65105]',
 'a',
 'acute',
 'apostrophe',
 'b',
 'c',
 'ccedilla',
 'colon',
 'comma',
 'd',
 'e',
 'exclam',
 'exclamdown',
 'f',
 'g',
 'grave',
 'h',
 'i',
 'j',
 'k',
 'l',
 'less',
 'm',
 'masculine',
 'minus',
 'n',
 'ntilde',
 'o',
 'p',
 'parenleft',
 'parenright',
 'period',
 'periodcentered',
 'plus',
 'q',
 'question',
 'quotedbl',
 'r',
 's',
 'semicolon',
 'space',
 't',
 'u',
 'underscore',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [53]:
# cs1PdFr.iloc[0].name
keyboard = {1: ['Escape', '[269025200]', '[269025201]', '[65027]', '[65104]', '[65105]', ''],
            2: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'masculine', 'exclam', 'exclamdown', 'question', 'apostrophe', 'Insert', 'Num_Lock', 'P_Subtract'], 
            3: ['q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', 'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', 'Tab', 'grave', 'plus', 'Delete', 'End', 'P_Add', 'P_Home', 'P_Page_Up'], 
            4: ['a', 's', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'A', 'S', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'Caps_Lock', 'ntilde', 'parenleft', 'parenright', 'acute', 'ccedilla', 'quotedbl', 'P_Left', 'Return'], 
            5: ['z', 'x', 'c', 'v', 'b', 'n', 'm', 'Z', 'X', 'C', 'V', 'B', 'N', 'M', 'less', 'colon', 'semicolon', 'period', 'periodcentered', 'comma', 'minus', 'underscore', 'Up', 'P_End', 'P_Down', 'P_Enter', 'P_Next'], 
            6: ['space', 'Super_L', 'Left', 'Menu', 'Down', 'Right', 'P_Insert']}

# 1 is left, 2 is right
leftright = {1: ['Escape', '[269025200]', '[269025201]', '[65027]', '[65104]', '[65105]', '', '1', '2', '3', '4', '5', 'masculine', 
                 'exclam', 'exclamdown', 'q', 'w', 'e', 'r', 't', 'Q', 'W', 'E', 'R', 'T', 'Tab', 'a', 's', 'd', 'f', 'g', 
                 'A', 'S', 'D', 'F', 'G', 'Caps_Lock', 'z', 'x', 'c', 'v', 'Z', 'X', 'C', 'V', 'Super_L', 'less'],
             
             2: ['6', '7', '8', '9', '0', 'question', 'apostrophe', 'Insert', 'Num_Lock', 'P_Subtract', 'y', 'u', 'i', 'o', 'p', 
                 'Y', 'U', 'I', 'O', 'P', 'grave', 'plus', 'Delete', 'End', 'P_Add', 'P_Home', 'P_Page_Up', 'h', 'j', 'k', 'l', 
                 'H', 'J', 'K', 'L', 'ntilde', 'parenleft', 'parenright', 'acute', 'ccedilla', 'quotedbl', 'P_Left', 'Return', 
                 'b', 'n', 'm', 'B', 'N', 'M', 'space', 'Left', 'Menu', 'Down', 'Right', 'P_Insert',
                 'colon', 'semicolon', 'period', 'periodcentered', 'comma', 'minus', 'underscore', 'Up', 'P_End', 'P_Down', 'P_Enter', 'P_Next']}
# patients.head()

In [54]:
def digList(lst):
    temp = []
    for item in lst:
        if type(item) is list:
            temp.append(digList(item))
        else:
            temp.append(item)
    return set(temp)

invDict = {}
for k, v in keyboard.items():
    if type(v) is list:
        items = digList(v)
        for item in items:
            invDict[item] = invDict.get(item, [])
            invDict[item].append(k)
    else:
        invDict[v] = invDict.get(v, [])
        invDict[v].append(k)

keyboard_inv = invDict

invDict = {}
for k, v in leftright.items():
    if type(v) is list:
        items = digList(v)
        for item in items:
            invDict[item] = invDict.get(item, [])
            invDict[item].append(k)
    else:
        invDict[v] = invDict.get(v, [])
        invDict[v].append(k)

leftright_inv = invDict

In [55]:
def key_to_part(value):
    return leftright_inv[value][0]

def map_part(row):
    return list(pd.Series(row).apply(lambda x: key_to_part(x)))

def key_to_row(value):
    return keyboard_inv[value][0]

def map_rows(row):
    return list(pd.Series(row).apply(lambda x: key_to_row(x)))

patients['row'] = patients['key'].apply(lambda x: map_rows(x))
patients['part'] = patients['key'].apply(lambda x: map_part(x))

# patients.head()
# keyboard.values()

### Prepare training set

In [56]:
X_train, Y_train = pd.DataFrame([patients['data'], patients['row'], patients['part']]), patients.label
X_train = X_train.transpose()

X_train['row'] = X_train['row'].apply(lambda x: np.array(x))
X_train['part'] = X_train['part'].apply(lambda x: np.array(x))

In [57]:
X_train.iloc[0]['data'].shape

(4829,)

In [58]:
def find_max(df):
     return max(df['key'].apply(lambda x: len(x)))
    
def find_mean(df):
     return np.mean(df['key'].apply(lambda x: len(x)))
    
def find_min(df):
     return min(df['key'].apply(lambda x: len(x)))
    
maxlen = find_max(patients)
print(maxlen)
print('min is ', find_min(patients))

# def pad_series(series):
#     global maxlen
# #     if isinstance(series, np.ndarray):
# #         series = pd.Series(series)
# #     elif isinstance(series, list):
# #         series = pd.Series(series)
        
#     diff = maxlen - series.shape[0]
#     list_of_zeros = [0 for i in range(diff)] 
# #     return list(series.append(pd.Series(list_of_zeros)))
#     return np.array(series.append(pd.Series(list_of_zeros)))

# X_train['data'] = X_train['data'].apply(lambda x: pad_series(pd.Series(x)))
# X_train['row'] = X_train['row'].apply(lambda x: pad_series(pd.Series(x)))
# X_train['part'] = X_train['part'].apply(lambda x: pad_series(pd.Series(x)))

6368
('min is ', 299)


### Chunk each time series

In [59]:
X_train['target'] = Y_train.astype(int)
X_train.head()

Unnamed: 0_level_0,data,row,part,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11,"[0.1713, 0.1432, 0.0655, 0.1188, 0.0737, 0.065...","[4, 5, 3, 4, 6, 3, 5, 4, 6, 5, 3, 4, 6, 3, 3, ...","[1, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, ...",1
60,"[0.0895, 0.0954, 0.1738, 0.1078, 0.166, 0.1361...","[4, 3, 4, 6, 3, 5, 4, 4, 4, 3, 6, 4, 3, 6, 3, ...","[1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 1, ...",0
67,"[0.1119, 0.1345, 0.3396, 0.1883, 0.1592, 0.134...","[4, 3, 3, 6, 3, 3, 5, 3, 4, 3, 4, 3, 6, 4, 3, ...","[1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, ...",1
68,"[0.1194, 0.1337, 0.1847, 0.1499, 0.2137, 0.164...","[5, 3, 3, 3, 3, 5, 6, 3, 3, 3, 6, 5, 3, 3, 6, ...","[2, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, ...",0
70,"[0.1528, 0.208, 0.2084, 0.1539, 0.177, 0.2055,...","[5, 4, 4, 6, 3, 4, 4, 4, 3, 3, 4, 6, 3, 3, 3, ...","[2, 2, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, ...",1


In [60]:
# def chunk_one_patient(series, size, overlap):
#     row = chunk_array(series['row'], size, overlap)
#     part = chunk_array(series['part'], size, overlap)
#     data = chunk_array(series['data'], size, overlap)
#     target = [series['target'] for i in range(len(row))]
    
#     res = pd.DataFrame([data, part, row, target])
#     res = res.transpose()
#     return res.rename(columns={0: "data", 1: "part", 2: 'row', 3: 'target'})

# def chunk_array(array, size, overlap):
#     gen = gen_split_overlap(array, size, overlap)
    
#     result = []
#     for arr in gen:
#         # if it's the last arr add zeros (if it's 0.7 of size)
#         if arr.shape[0] != size:
#             if np.float(arr.shape[0]) / np.float(size) >= 0.7:
#                 arr = np.concatenate((arr, np.zeros((size - arr.shape[0],))))
#             else:
#                 continue
        
#         result.append(arr)
# #         print(arr.shape)
#     return result
    

# def gen_split_overlap(seq, size, overlap):
#     if size < 1 or overlap < 0:
#         raise ValueError('size must be >= 1 and overlap >= 0')

#     for i in range(0, len(seq) - overlap, size - overlap):            
#         yield seq[i: i + size]

In [61]:
def chunk_one_patient(series, size, overlap):
    row = np.array(chunk_array(series['row'], size, overlap))
    part = np.array(chunk_array(series['part'], size, overlap))
    data = np.array(chunk_array(series['data'], size, overlap))
    target = np.array([series['target'] for i in range(len(row))])
    
    cor = np.stack([data, part, row], axis=0)
    arr = cor.reshape((cor.shape[1], cor.shape[2], cor.shape[0]))
#     res = pd.DataFrame([arr, target])
#     res = res.transpose()
#     return res.rename(columns={0: "data", 1: 'target'})
    return [arr, target]

def chunk_array(array, size, overlap):
    gen = gen_split_overlap(array, size, overlap)
    
    result = []
    for arr in gen:
        # if it's the last arr add zeros (if it's 0.7 of size)
        if arr.shape[0] != size:
            if np.float(arr.shape[0]) / np.float(size) >= 0.7:
                arr = np.concatenate((arr, np.zeros((size - arr.shape[0],))))
            else:
                continue
        
        result.append(arr)
#         print(arr.shape)
    return result
    

def gen_split_overlap(seq, size, overlap):
    if size < 1 or overlap < 0:
        raise ValueError('size must be >= 1 and overlap >= 0')

    for i in range(0, len(seq) - overlap, size - overlap):            
        yield seq[i: i + size]

In [62]:
chunk_size = 20  # number of data points in one chunk
overlap = 0.5     # overlapping between chunks in percents of chunk_size
overlap = int(overlap*chunk_size)

# one_patient = chunk_one_patient(X_train.iloc[0], chunk_size, overlap)
# one_patient = one_patient.transpose()
# one_patient.rename(columns={0: "data", 1: "part", 2: 'row', 3: 'target'})

res_df = pd.DataFrame()
res_df['data'] = X_train.apply(lambda x: chunk_one_patient(x, chunk_size, overlap), axis=1)

In [63]:
def split_column_target(row):
#     print(row.shape)
    return row[1]

def split_column_data(row):
#     print(row.shape)
    return row[0]

res_df['target'] = res_df['data'].apply(lambda x: split_column_target(x))
res_df['data'] = res_df['data'].apply(lambda x: split_column_data(x))

In [64]:
res_df.head()

Unnamed: 0_level_0,data,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
11,"[[[0.1713, 0.1432, 0.0655], [0.1188, 0.0737, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
60,"[[[0.0895, 0.0954, 0.1738], [0.1078, 0.166, 0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
67,"[[[0.1119, 0.1345, 0.3396], [0.1883, 0.1592, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
68,"[[[0.1194, 0.1337, 0.1847], [0.1499, 0.2137, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
70,"[[[0.1528, 0.208, 0.2084], [0.1539, 0.177, 0.2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [65]:
# Concatenate rows all together
train_set = np.array((0, 0, 0))
target_set = np.array(0)

def concat_train(row):
    global train_set
    try: 
        train_set.shape[0]
        train_set = np.concatenate([train_set, row])
    except:
        train_set = row

def concat_target(row):
    global target_set
    try: 
        target_set.shape[0]
        target_set = np.concatenate([target_set, row])
    except:
        target_set = row
        
tmp = res_df['data'].apply(lambda x: concat_train(x))
tmp = res_df['target'].apply(lambda x: concat_target(x))


print('X_train shape', train_set.shape)
print('Y_train shape', target_set.shape)

('X_train shape', (16723, 20, 3))
('Y_train shape', (16723,))


### Split training and testing instances

In [145]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.1, random_state=15)

### Save data

In [66]:
h5f = h5py.File('encoding_LSTM_chunks20.h5', 'w')
h5f.create_dataset('train', data=train_set)
h5f.create_dataset('target', data=target_set)
h5f.close()