In [14]:
import datasets
from datasets import load_dataset

In [15]:
from glob import glob
from pathlib import Path
from sklearn.model_selection import train_test_split

In [16]:
# Root directory of the ECG data: it is globbed for `*.csv` files to derive the
# participant list and is also passed to `load_dataset` as the dataset path.
base_dir = './data/ecg_model'

In [17]:
# Fixed seed so the participant split is reproducible across kernel restarts.
# Without it every run produces different splits (and different row counts).
SPLIT_SEED = 42

# One participant per CSV file in `base_dir`; the file stem is used as the id.
# glob() returns files in arbitrary, filesystem-dependent order, so sort first
# to make the seeded shuffle below deterministic.
participants = sorted(Path(p).stem for p in glob(base_dir + '/*.csv'))

# Split by participant (not by sample) so no participant appears in two splits.
# 20% test, then 25% of the remaining 80% for validation -> 60/20/20 overall.
train_participants, test_participants = train_test_split(
    participants, test_size=0.2, random_state=SPLIT_SEED
)
train_participants, val_participants = train_test_split(
    train_participants, test_size=0.25, random_state=SPLIT_SEED
)

In [18]:
# Build the dataset from the loading script that lives in `base_dir`.
# The three *_participants keyword arguments are not load_dataset parameters of
# their own; they are passed through to the dataset builder.
# NOTE(review): trust_remote_code=True lets that loading script run arbitrary
# Python - acceptable only because the script is local to this project.
ds = load_dataset(
    path=base_dir,
    num_proc=10,
    trust_remote_code=True,
    test_participants=test_participants,
    val_participants=val_participants,
    train_participants=train_participants,
)

Generating fit split: 574698000 examples [02:06, 4557633.68 examples/s]
Generating test split: 203368000 examples [00:47, 4315392.42 examples/s]
Generating validate split: 201612000 examples [00:47, 4289247.22 examples/s]


In [19]:
# Row count of the 'fit' split before any resampling is applied.
ds['fit'].num_rows

574698000

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.signal` is loaded on every SciPy version. This still binds the name
# `scipy`, so later `scipy.signal.resample(...)` calls keep working.
import scipy.signal

# Sampling rates used for resampling (presumably Hz - TODO confirm units).
original_fs = 1000
new_fs = 500

# NOTE: the former `ecg_resampled = scipy.signal.resample(ds['fit'], ...)` line
# was removed. It handed the whole Dataset object (rows of dicts, not a numeric
# array) to scipy.signal.resample and was never executed successfully.
# Resampling is done batch-wise through `ds.map(resample, ...)` instead.

In [35]:
# Sanity check: slicing the first 1000 rows yields 1000 'signal' values.
sample = ds['fit'][:1000]['signal']
len(sample)

1000

In [43]:
# Sanity check: resampling a 1000-row slice by new_fs / original_fs should give
# int(1000 * new_fs / original_fs) output values.
sample = ds['fit'][:1000]
scipy.signal.resample(
    sample['signal'],
    int(1000 * new_fs / original_fs),
).shape[0]

500

In [45]:
def resample(x):
    """Resample the 'signal' and 'label' columns of one batch.

    Each column is resampled independently with ``scipy.signal.resample`` to
    ``int(len(column) * new_fs / original_fs)`` values, using the module-level
    ``original_fs`` and ``new_fs``.

    Args:
        x: Mapping with 'signal' and 'label' sequences (one batch of rows).

    Returns:
        dict with keys 'signal' and 'label' holding the resampled arrays.

    NOTE(review): scipy.signal.resample is FFT-based and treats its input as
    periodic. If 'label' holds discrete classes, this will produce
    non-integer values - confirm that is intended. Each batch is also
    resampled in isolation, so edge effects may appear at batch boundaries.
    """
    def _to_new_rate(values):
        # Output length scales with the ratio of the sampling rates.
        target_len = int(len(values) * new_fs / original_fs)
        return scipy.signal.resample(values, target_len)

    return {column: _to_new_rate(x[column]) for column in ('signal', 'label')}
# Apply `resample` to every split in batches of 1000 rows.
# NOTE(review): this rebinds `ds`, so re-running the cell resamples the
# already-resampled data again; restart from the load cell before re-running.
ds = ds.map(
    function=resample,
    batched=True,
    batch_size=1000,
)

Map: 100%|██████████| 574698000/574698000 [09:06<00:00, 1052410.78 examples/s]
Map: 100%|██████████| 203368000/203368000 [03:10<00:00, 1064833.71 examples/s]
Map: 100%|██████████| 201612000/201612000 [03:12<00:00, 1047759.18 examples/s]


In [46]:
# Row count of the 'fit' split after resampling. Expected: half of the
# pre-resample count, i.e. 574,698,000 / 2 = 287,349,000 for the recorded run.
# (581,033,000 is the 'fit' size of a different participant split, not this one.)
ds['fit'].num_rows

287349000

In [47]:
# Have the dataset return torch tensors when indexed.
ds = ds.with_format(type="torch")

In [48]:
# Persist all resampled splits under ./data/combined.
ds.save_to_disk(dataset_dict_path='./data/combined')

Saving the dataset (10/10 shards): 100%|██████████| 287349000/287349000 [00:45<00:00, 6295005.00 examples/s]
Saving the dataset (4/4 shards): 100%|██████████| 101684000/101684000 [00:17<00:00, 5839040.02 examples/s]
Saving the dataset (4/4 shards): 100%|██████████| 100806000/100806000 [00:21<00:00, 4680856.17 examples/s]


In [13]:
# Reload the saved dataset to inspect its splits.
# NOTE(review): the recorded output of this cell has execution count 13, i.e. it
# ran before the save cell (count 48), and its row counts do not match the
# resampled data saved there. Re-run this cell after saving to refresh it.
datasets.load_from_disk(dataset_path='./data/combined')

DatasetDict({
    fit: Dataset({
        features: ['signal', 'label'],
        num_rows: 581033000
    })
    test: Dataset({
        features: ['signal', 'label'],
        num_rows: 198028000
    })
    validate: Dataset({
        features: ['signal', 'label'],
        num_rows: 200617000
    })
})