# Summary
This notebook includes two main parts:
1. Load original datasets that are given in `pd.DataFrame` and save them in seperate `.npy` files, one for each %patient \times lead%, in a way compatible with `DataLoader` class. 
2. Split patients into `train`, `validation` and `test` samples. It is done once for reproducibility. The current samples are 85%, 10% and 5% for `train`, `validation` and `test` respectively.

In [1]:
import os
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import random
import pickle


# configurables
root = '/home/david/Desktop/projects/thesis/upstream_seq2seq/'
SOURCE_FOLDER = f'{root}/data/raw/'
DESTINATION_FOLDER = f'{root}/data/processed/'
train_ratio = 0.85
validation_ratio = 0.10
test_ratio = 0.5
CHUNKSIZE = 1000




  from .autonotebook import tqdm as notebook_tqdm


## Saving Data

In [8]:
combined_files = [x for x in os.listdir(SOURCE_FOLDER) if 'combined_file' in x]
combined_files.remove('ningbo_1_combined_file.csv')
print(combined_files)

for i, conbined_file in enumerate(combined_files):
    print(f'Start Saving {conbined_file} ({i}/{len(combined_files)})')
    print('\tLoading Data..')

    with pd.read_csv(SOURCE_FOLDER+conbined_file, chunksize=CHUNKSIZE, index_col=0) as reader:
        for chunk in tqdm(reader, desc='Saving files'):
            for j in range(len(chunk)):
                signal, indx = chunk.iloc[j].values[:-1], chunk.iloc[j].values[-1]
                with open(DESTINATION_FOLDER+f'{indx}.npy', 'wb') as f:
                    np.save(f, signal)




['georgia_combined_file.csv', 'china_combined_file_targets.csv', 'ningbo_2_combined_file.csv', 'china_combined_file.csv', 'chapman_combined_file.csv']
Start Saving georgia_combined_file.csv (0/5)
	Loading Data..


Saving files: 125it [18:13,  8.74s/it]


Start Saving china_combined_file_targets.csv (1/5)
	Loading Data..


Saving files: 4it [00:02,  1.58it/s]


Start Saving ningbo_2_combined_file.csv (2/5)
	Loading Data..


Saving files: 334it [48:57,  8.80s/it]


Start Saving china_combined_file.csv (3/5)
	Loading Data..


Saving files: 37it [05:28,  8.89s/it]


Start Saving chapman_combined_file.csv (4/5)
	Loading Data..


Saving files: 90it [13:11,  8.79s/it]


In [11]:
os.getcwd()

'/home/david/Desktop/projects/thesis/upstream_seq2seq/notebooks'

In [12]:
saved_files = os.listdir(DESTINATION_FOLDER)

ningbo_files = [x for x in saved_files if 'ningbo' in x]
georgia_files = [x for x in saved_files if 'georgia' in x]
china_files = [x for x in saved_files if 'china' in x]
chapman_files = [x for x in saved_files if 'chapman' in x]
print('ningbo_files', len(ningbo_files))
print('georgia_files', len(georgia_files))
print('china_files', len(china_files))
print('chapman_files', len(chapman_files))
print('sum', len(ningbo_files)+len(georgia_files)+len(china_files)+len(chapman_files))


ningbo_files 333072
georgia_files 124128
china_files 36984
chapman_files 89256
sum 583440


In [13]:
# sanity check
a = np.load(DESTINATION_FOLDER+f'{indx}.npy',allow_pickle=True)
np.mean(a == chunk.iloc[j].values[:-1])

1.0

In [14]:
# removing two extra files
for f in [x for x in saved_files if '0.0' in x or '1.0' in x]:
    try:
        print(f'Removing {f} from {DESTINATION_FOLDER}')
        os.remove(DESTINATION_FOLDER+f'{f}')
    except:
        print(f'{f} already has been removed from {DESTINATION_FOLDER}')

Removing 0.0.npy from /home/david/Desktop/projects/thesis/upstream_seq2seq//data/processed/
Removing 1.0.npy from /home/david/Desktop/projects/thesis/upstream_seq2seq//data/processed/


## Split into Train-Validation-Test

In [15]:
files = os.listdir(DESTINATION_FOLDER)
[x for x in files if 'chapman' in x and 'JS02897' in x]
datasource_and_person_level_index = {x.split('_')[0]+'_'+x.replace('.npy','').split('_')[-1] for x in files}
print('Total Unique Persons:', len(datasource_and_person_level_index))
print('Firt 10 persons:', list(datasource_and_person_level_index)[:10])

datasource_and_person_level_index = list(datasource_and_person_level_index)
#      set seed    shuffle the list
random.Random(42).shuffle(datasource_and_person_level_index)

N = len(datasource_and_person_level_index)
train_indices = datasource_and_person_level_index[:int(train_ratio*N)]
validation_indices = datasource_and_person_level_index[int(train_ratio*N):int((train_ratio+validation_ratio)*N)]
test_indices = datasource_and_person_level_index[int((train_ratio+validation_ratio)*N):]

print('Train indices No.:', len(train_indices))
print('Validation indices No.:', len(validation_indices))
print('Test indices No.:', len(test_indices))
print('Train indices portion:', len(train_indices)/N)
print('Validation indices portion:', len(validation_indices)/N)
print('Test indices portion:', len(test_indices)/N)



Total Unique Persons: 48620
Firt 10 persons: ['chapman_JS04059', 'ningbo_JS22372', 'georgia_E03662', 'ningbo_JS32070', 'ningbo_JS24598', 'georgia_E01319', 'ningbo_JS20186', 'georgia_E01207', 'ningbo_JS15672', 'ningbo_JS16933']
Train indices No.: 41327
Validation indices No.: 4862
Test indices No.: 2431
Train indices portion: 0.85
Validation indices portion: 0.1
Test indices portion: 0.05


In [16]:
# save in a dictionary
data_splits = {'train':train_indices,
               'validation':validation_indices,
               'test':test_indices}

# create a binary pickle file 
f = open(DESTINATION_FOLDER+"splits.pkl","wb")

# write the python object (dict) to pickle file
pickle.dump(data_splits,f)

# close file
f.close()


In [17]:
with open(DESTINATION_FOLDER+"splits.pkl", 'rb') as handle:
    b = pickle.load(handle)

print(data_splits == b)

True
