In [1]:
import os
import numpy as np
from tqdm import tqdm
import glob
import pandas as pd

In [2]:
# Shared path prefixes; every folder constant below is one of these plus a
# fixed sub-directory name.
_dataset_root = '/ihome/lshangguan/adr121/Hearables/data_new'
_serialized_root = '/ix/lshangguan/adr121'

# Input CSV folders, one clean / noisy / acc triple per split.
clean_train_folder = _dataset_root + '/clean_train_dataset'
noisy_train_folder = _dataset_root + '/noisy_train_dataset'
acc_train_folder = _dataset_root + '/acc_train_dataset'

clean_val_folder = _dataset_root + '/clean_val_dataset'
noisy_val_folder = _dataset_root + '/noisy_val_dataset'
acc_val_folder = _dataset_root + '/acc_val_dataset'

clean_test_folder = _dataset_root + '/clean_test_dataset'
noisy_test_folder = _dataset_root + '/noisy_test_dataset'
acc_test_folder = _dataset_root + '/acc_test_dataset'

# Output folders that receive the serialized .npy windows, one per split.
serialized_train_folder = _serialized_root + '/serialized_train_data'
serialized_val_folder = _serialized_root + '/serialized_val_data'
serialized_test_folder = _serialized_root + '/serialized_test_data'

# Number of CSV rows in every serialized window.
window_size = 1000
# NOTE(review): this used to be annotated "about 1 second of samples", but
# 1000 rows at 125 rows per second would be 8 seconds -- confirm the units.
sample_rate = 125


In [3]:
def slice_signal(file, window_size, stride=0.5, sample_rate=None):
    """
    Cut the signal stored in a CSV file into fixed-length, overlapping windows.

    The CSV is read with pandas and its first column is discarded; the
    remaining columns are windowed row-wise. Consecutive windows start
    ``int(window_size * stride)`` rows apart, so they overlap by a fraction
    of ``1 - stride`` (50% with the default stride). Rows at the end of the
    file that do not fill a complete window are dropped.

    Args:
        file: Path (or anything ``pd.read_csv`` accepts) of the CSV to slice.
        window_size: Number of rows in every window.
        stride: Hop between window starts, as a fraction of ``window_size``.
        sample_rate: Not used by the slicing; kept so existing four-argument
            calls keep working.

    Returns:
        A list of DataFrames in file order, each with exactly ``window_size``
        rows. The list is empty when the file has fewer rows than that.

    Raises:
        ValueError: If ``window_size`` is not positive, or if ``stride`` is so
            small (or negative) that the hop is less than one row.
    """
    if window_size <= 0:
        raise ValueError('window_size must be positive, got {}'.format(window_size))
    hop = int(window_size * stride)
    if hop < 1:
        raise ValueError(
            'stride {} with window_size {} gives a hop of {} rows; '
            'it must be at least 1'.format(stride, window_size, hop)
        )

    # Drop the first CSV column and keep the rest as the signal.
    signal = pd.read_csv(file).iloc[:, 1:]

    # range() excludes its stop value, so the stop is len(signal) + 1: this
    # keeps the final window when it ends exactly on the last row.
    return [
        signal.iloc[end_idx - window_size:end_idx]
        for end_idx in range(window_size, len(signal) + 1, hop)
    ]

In [4]:
def process_and_serialize(data_type):
    """
    Slice the clean / noisy / acc CSVs of one split and save the windows as .npy.

    Clean and noisy CSVs are each sorted by the integer found between the
    last two underscores of their path and then paired by position. Each pair
    is cut into windows of ``window_size`` rows with 50% overlap, and the
    k-th clean, noisy and acc windows are stacked with ``np.array`` and saved
    as ``<clean file name up to its first dot>_<k>.npy`` (k starts at 1) in
    the split's serialized folder, which is created if missing.

    Args:
        data_type: ``'train'`` or ``'val'``; any other value selects the
            test folders.

    Raises:
        IndexError: If the acc folder of the split contains no CSV file.
        ValueError: If a clean or noisy path has no integer between its last
            two underscores.
    """
    stride = 0.5
    if data_type == 'train':
        clean_folder = clean_train_folder
        noisy_folder = noisy_train_folder
        acc_folder = acc_train_folder
        serialized_folder = serialized_train_folder
    elif data_type == 'val':
        clean_folder = clean_val_folder
        noisy_folder = noisy_val_folder
        acc_folder = acc_val_folder
        serialized_folder = serialized_val_folder
    else:
        clean_folder = clean_test_folder
        noisy_folder = noisy_test_folder
        acc_folder = acc_test_folder
        serialized_folder = serialized_test_folder

    # exist_ok avoids the separate exists() check and the race it implies.
    os.makedirs(serialized_folder, exist_ok=True)

    def file_index(path):
        # Integer between the last two underscores of the path.
        return int(path.split('_')[-2])

    clean_files = sorted(glob.glob(clean_folder + "/*.csv"), key=file_index)
    noisy_files = sorted(glob.glob(noisy_folder + "/*.csv"), key=file_index)
    acc_files = glob.glob(acc_folder + "/*.csv")

    # NOTE(review): only the first acc CSV returned by glob is used, and glob
    # does not guarantee an order -- confirm each split has a single acc file.
    acc_file = acc_files[0]
    # The acc file is the same for every pair, so read and slice it once
    # instead of once per loop iteration.
    acc_sliced = slice_signal(acc_file, window_size, stride, sample_rate)

    # NOTE(review): zip stops at the shorter list, so surplus clean or noisy
    # files are skipped without any message.
    pair_count = min(len(clean_files), len(noisy_files))
    for clean_file, noisy_file in tqdm(zip(clean_files, noisy_files), total=pair_count):
        clean_sliced = slice_signal(clean_file, window_size, stride, sample_rate)
        noisy_sliced = slice_signal(noisy_file, window_size, stride, sample_rate)
        filename = os.path.basename(clean_file).split('.')[0]

        # zip also stops at the shortest of the three window lists.
        triples = zip(clean_sliced, noisy_sliced, acc_sliced)
        for number, (clean_win, noisy_win, acc_win) in enumerate(triples, start=1):
            stacked = np.array([clean_win, noisy_win, acc_win])
            np.save(os.path.join(serialized_folder, '{}_{}'.format(filename, number)), arr=stacked)


In [5]:
def data_verify(data_type):
    """
    Check that every serialized array of a split has ``window_size`` along axis 1.

    Walks the split's serialized folder recursively, loads each file whose
    name does not start with a dot, and compares the second dimension of the
    loaded array with ``window_size``. The first mismatch found in a
    directory is printed and ends the check of that directory.

    Args:
        data_type: ``'train'`` or ``'val'``; any other value selects the
            test folder.
    """
    serialized_folder = (
        serialized_train_folder if data_type == 'train'
        else serialized_val_folder if data_type == 'val'
        else serialized_test_folder
    )

    for dirpath, _subdirs, names in os.walk(serialized_folder):
        visible = [name for name in names if not name.startswith('.')]
        progress = tqdm(visible, desc='Verifying serialized {} signals'.format(data_type))
        for name in progress:
            # allow_pickle=True lets np.load unpickle object arrays, which can
            # run arbitrary code; only use this on files produced locally.
            snippet = np.load(os.path.join(dirpath, name), allow_pickle=True)
            if snippet.shape[1] == window_size:
                continue
            print('Snippet length not {} : {} instead'.format(window_size, snippet.shape[1]))
            # Leaves only this directory's loop; os.walk moves on to the next.
            break

In [None]:
if __name__ == '__main__':
    # Serialize each split and verify it straight away, in the order
    # train, val, test.
    for split_name in ('train', 'val', 'test'):
        process_and_serialize(split_name)
        data_verify(split_name)