# Summary
This notebook includes two main parts:
1. Load original datasets that are given in `pd.DataFrame` and save them in seperate `.npy` files, one for each %patient \times lead%, in a way compatible with `DataLoader` class. 
2. Split patients into `train`, `validation` and `test` samples. It is done once for reproducibility. The current samples are 85%, 10% and 5% for `train`, `validation` and `test` respectively.

In [1]:
import os
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import random
import pickle


# configurables
root = '/home/david/Desktop/projects/thesis/upstream_seq2seq/'
SOURCE_FOLDER = f'{root}/data/raw/'
DESTINATION_FOLDER = f'{root}/data/processed/'
train_ratio = 0.85
validation_ratio = 0.10
test_ratio = 0.5
CHUNKSIZE = 1000




  from .autonotebook import tqdm as notebook_tqdm


## Saving Data

In [2]:
combined_files = [x for x in os.listdir(SOURCE_FOLDER) if 'combined_file' in x]
combined_files.remove('ningbo_1_combined_file.csv')
print(combined_files)

for i, conbined_file in enumerate(combined_files):
    print(f'Start Saving {conbined_file} ({i}/{len(combined_files)})')
    print('\tLoading Data..')

    with pd.read_csv(SOURCE_FOLDER+conbined_file, chunksize=CHUNKSIZE, index_col=0) as reader:
        for chunk in tqdm(reader, desc='Saving files'):
            for j in range(len(chunk)):
                signal, indx = chunk.iloc[j].values[:-1], chunk.iloc[j].values[-1]
                with open(DESTINATION_FOLDER+f'{indx}.npy', 'wb') as f:
                    np.save(f, signal)




['georgia_combined_file.csv', 'china_combined_file.csv', 'ningbo_2_combined_file.csv', 'china_combined_file_targets.csv', 'chapman_combined_file.csv']
Start Saving georgia_combined_file.csv (0/5)
	Loading Data..


Saving files: 125it [17:41,  8.49s/it]


Start Saving china_combined_file.csv (1/5)
	Loading Data..


Saving files: 37it [05:12,  8.45s/it]


Start Saving ningbo_2_combined_file.csv (2/5)
	Loading Data..


Saving files: 334it [49:51,  8.96s/it]


Start Saving china_combined_file_targets.csv (3/5)
	Loading Data..


Saving files: 4it [00:00,  8.92it/s]


Start Saving chapman_combined_file.csv (4/5)
	Loading Data..


Saving files: 90it [14:03,  9.37s/it]


In [3]:
saved_files = os.listdir(DESTINATION_FOLDER)

ningbo_files = [x for x in saved_files if 'ningbo' in x]
georgia_files = [x for x in saved_files if 'georgia' in x]
china_files = [x for x in saved_files if 'china' in x]
chapman_files = [x for x in saved_files if 'chapman' in x]
print('ningbo_files', len(ningbo_files))
print('georgia_files', len(georgia_files))
print('china_files', len(china_files))
print('chapman_files', len(chapman_files))
print('sum', len(ningbo_files)+len(georgia_files)+len(china_files)+len(chapman_files))


ningbo_files 333072
georgia_files 124128
china_files 36984
chapman_files 89256
sum 583440


In [4]:
# sanity check
a = np.load(DESTINATION_FOLDER+f'{indx}.npy',allow_pickle=True)
np.mean(a == chunk.iloc[j].values[:-1])

1.0

In [5]:
# removing two extra files
for f in [x for x in saved_files if '0.0' in x or '1.0' in x]:
    try:
        print(f'Removing {f} from {DESTINATION_FOLDER}')
        os.remove(DESTINATION_FOLDER+f'{f}')
    except:
        print(f'{f} already has been removed from {DESTINATION_FOLDER}')

Removing 0.0.npy from /home/david/Desktop/projects/thesis/upstream_seq2seq//data/processed/
Removing 1.0.npy from /home/david/Desktop/projects/thesis/upstream_seq2seq//data/processed/


## Split into Train-Validation-Test

In [6]:
files = os.listdir(DESTINATION_FOLDER)
[x for x in files if 'chapman' in x and 'JS02897' in x]
datasource_and_person_level_index = {x.split('_')[0]+'_'+x.replace('.npy','').split('_')[-1] for x in files}
print('Total Unique Persons:', len(datasource_and_person_level_index))
print('Firt 10 persons:', list(datasource_and_person_level_index)[:10])

datasource_and_person_level_index = list(datasource_and_person_level_index)
#      set seed    shuffle the list
random.Random(42).shuffle(datasource_and_person_level_index)

N = len(datasource_and_person_level_index)
train_indices = datasource_and_person_level_index[:int(train_ratio*N)]
validation_indices = datasource_and_person_level_index[int(train_ratio*N):int((train_ratio+validation_ratio)*N)]
test_indices = datasource_and_person_level_index[int((train_ratio+validation_ratio)*N):]

print('Train indices No.:', len(train_indices))
print('Validation indices No.:', len(validation_indices))
print('Test indices No.:', len(test_indices))
print('Train indices portion:', len(train_indices)/N)
print('Validation indices portion:', len(validation_indices)/N)
print('Test indices portion:', len(test_indices)/N)



Total Unique Persons: 48623
Firt 10 persons: ['georgia_E02735', 'georgia_E00265', 'china_Q0751', 'ningbo_JS27234', 'ningbo_JS22082', 'ningbo_JS15914', 'ningbo_JS25957', 'georgia_E05352', 'ningbo_JS16023', 'chapman_JS04397']
Train indices No.: 41329
Validation indices No.: 4862
Test indices No.: 2432
Train indices portion: 0.8499886884807601
Validation indices portion: 0.09999383008041463
Test indices portion: 0.05001748143882525


In [7]:
# save in a dictionary
data_splits = {'train':train_indices,
               'validation':validation_indices,
               'test':test_indices}

# create a binary pickle file 
f = open(DESTINATION_FOLDER+"splits.pkl","wb")

# write the python object (dict) to pickle file
pickle.dump(data_splits,f)

# close file
f.close()


In [8]:
with open(DESTINATION_FOLDER+"splits.pkl", 'rb') as handle:
    b = pickle.load(handle)

print(data_splits == b)

True


# QA

In [17]:
with open(DESTINATION_FOLDER+"splits.pkl", 'rb') as handle:
    splits = pickle.load(handle)
print('splits')
print('Train: ', [x for x in splits['train'] if 'split' in x])
print('Val: ', [x for x in splits['validation'] if 'split' in x])
print('Test: ', [x for x in splits['test'] if 'split' in x])


print('noise')
print('Train: ', [x for x in splits['train'] if 'noise' in x])
print('Val: ', [x for x in splits['validation'] if 'noise' in x])
print('Test: ', [x for x in splits['test'] if 'noise' in x])


splits
Train:  []
Val:  []
Test:  []
noise
Train:  ['noise_1', 'noise_2']
Val:  []
Test:  []


In [14]:
splits['train'].remove('splits.pkl_splits.pkl')

In [18]:
splits['train'].remove('noise_1')
splits['train'].remove('noise_2')


In [19]:
print('Train: ', [x for x in splits['train'] if 'split' in x])
print('Val: ', [x for x in splits['validation'] if 'split' in x])
print('Test: ', [x for x in splits['test'] if 'split' in x])
print('Train: ', [x for x in splits['train'] if 'noise' in x])
print('Val: ', [x for x in splits['validation'] if 'noise' in x])
print('Test: ', [x for x in splits['test'] if 'noise' in x])


Train:  []
Val:  []
Test:  []
Train:  []
Val:  []
Test:  []


In [20]:
# resave

# create a binary pickle file 
f = open(DESTINATION_FOLDER+"splits.pkl","wb")
# write the python object (dict) to pickle file
pickle.dump(splits,f)