In [None]:
#!pip install wfdb

In [None]:
import os

import wfdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# **Utils**

In [None]:
# Integer class written into the mask for each annotated wave symbol.
# 0 is left for samples that belong to no annotated wave.
symbol_to_label = {'p': 1, 'N': 2, 't': 3}


def create_annotation_mask(atr_symbols, atr_points, size):
    """Build a per-sample label mask from a sequence of wave annotations.

    Every occurrence of a key of ``symbol_to_label`` in ``atr_symbols`` is
    treated as the middle marker of a wave: the annotation immediately before
    it supplies the first labelled sample and the annotation immediately
    after it supplies the (exclusive) end of the labelled span.

    Parameters
    ----------
    atr_symbols : sequence of str
        Annotation symbols, one per annotation (list or array).
    atr_points : sequence of int
        Sample index of each annotation, aligned with ``atr_symbols``.
    size : int
        Length of the mask to create.

    Returns
    -------
    numpy.ndarray
        Float array of length ``size`` holding 0 or a ``symbol_to_label`` value.

    Notes
    -----
    * A wave symbol that is the last annotation has no following marker; its
      span then ends at the last annotation point.
    * A wave symbol that is the very first annotation has no preceding marker
      and is skipped.  (Previously the index ``-1`` wrapped around to the last
      annotation, which produced an empty slice for sorted sample indices.)
    * NOTE(review): the neighbouring annotations are not checked to actually be
      '(' / ')' markers -- confirm this holds for every annotation file used.
    """
    # Accept plain lists too: comparing a list with a string would not be
    # element-wise.
    atr_symbols = np.asarray(atr_symbols)
    mask = np.zeros(size)
    last = len(atr_points) - 1

    # Dict order (p, N, t) is kept so that later symbols overwrite earlier
    # ones where spans overlap.
    for symbol, label in symbol_to_label.items():
        for peak in np.flatnonzero(atr_symbols == symbol):
            onset = peak - 1
            if onset < 0:
                continue
            # Clamp instead of catching IndexError with a bare ``except``.
            offset = min(peak + 1, last)
            mask[atr_points[onset]:atr_points[offset]] = label
    return mask

# **LUDB database**
https://physionet.org/content/ludb/1.0.0/

In [None]:
#!wget -r -N -c -np https://physionet.org/files/ludb/1.0.0/

In [None]:
def create_LUDB_dataset(records, leads, folder_path, train_fraction=0.8, seed=None,
                        window=(1000, 4000)):
    """Read LUDB records and split their per-lead signals/masks into train and test.

    The split is made per record, so every lead of a record lands on the same
    side.  The number of training records is derived from ``len(records)``
    instead of assuming 200 records.

    Parameters
    ----------
    records : sequence of str
        Record names, relative to ``folder_path``.
    leads : sequence of str
        Lead names.  The i-th name is paired with column i of the record's
        ``p_signal`` and with the annotation file extension ``atr_<name>``.
    folder_path : str
        Directory containing the records.
    train_fraction : float, optional
        Share of records assigned to the training set (default 0.8).
    seed : int or None, optional
        Seed for a private ``RandomState``.  ``None`` (default) draws from the
        global ``np.random`` state, as before.
    window : tuple of (int, int), optional
        ``(start, end)`` sample range kept from every lead (default 1000-4000).
        NOTE(review): presumably this trims record edges that lack
        annotations -- confirm.

    Returns
    -------
    tuple of numpy.ndarray
        ``(signals_train, masks_train, signals_test, masks_test)``; each has
        one row per (record, lead) pair, or shape ``(0,)`` when empty.
    """
    n_records = len(records)
    n_train = int(round(train_fraction * n_records))
    rng = np.random if seed is None else np.random.RandomState(seed)
    train_indexes = set(rng.permutation(n_records)[:n_train].tolist())

    start_idx, end_idx = window
    signals_train, masks_train = [], []
    signals_test, masks_test = [], []

    for record_idx, name in enumerate(records):
        record_path = os.path.join(folder_path, name)
        record = wfdb.rdrecord(record_path)
        # Size the mask from the record itself rather than a hard-coded 5000.
        n_samples = record.p_signal.shape[0]

        if record_idx in train_indexes:
            signals_out, masks_out = signals_train, masks_train
        else:
            signals_out, masks_out = signals_test, masks_test

        for lead_idx, lead_name in enumerate(leads):
            atr_lead = wfdb.rdann(record_path, extension=f'atr_{lead_name}')
            atr_symbols = np.array(atr_lead.symbol)
            atr_points = atr_lead.sample

            lead_signal = record.p_signal[:, lead_idx]
            mask = create_annotation_mask(atr_symbols, atr_points, n_samples)

            signals_out.append(lead_signal[start_idx:end_idx])
            masks_out.append(mask[start_idx:end_idx])

    return np.array(signals_train), np.array(masks_train), np.array(signals_test), np.array(masks_test)

In [None]:
# Local mirror of the LUDB files (see the commented-out wget cell for the source URL).
folder_path = 'physionet.org/files/ludb/1.0.0'

# RECORDS is read as a header-less, single-column table of record names;
# the names are converted to strings so they can be joined into file paths.
records_file = os.path.join(folder_path, 'RECORDS')
records = pd.read_csv(records_file, names=['num'])['num'].values.astype(str)

# Lead names; each one is also the suffix of a per-lead annotation extension.
leads = [
    'i', 'ii', 'iii', 'avr', 'avl', 'avf',
    'v1', 'v2', 'v3', 'v4', 'v5', 'v6',
]
print('Число ЭКГ записей:', len(records))

Число ЭКГ записей: 200


In [None]:
# The train/test split inside create_LUDB_dataset is random; seed NumPy's
# global generator first so a fresh "Restart & Run All" reproduces the same
# split and therefore the same saved arrays.
np.random.seed(42)
ludb_signals_train, ludb_masks_train, ludb_signals_test, ludb_masks_test = create_LUDB_dataset(records, leads, folder_path)
print('Размерность тренировочного датасета:', ludb_signals_train.shape)
print('Размерность тестового датасета:', ludb_signals_test.shape)

Размерность тренировочного датасета: (1920, 3000)
Размерность тестового датасета: (480, 3000)


In [None]:
# Write the four LUDB arrays to .npy files in the working directory.
ludb_outputs = (
    ('ludb_signals_train.npy', ludb_signals_train),
    ('ludb_signals_test.npy', ludb_signals_test),
    ('ludb_masks_train.npy', ludb_masks_train),
    ('ludb_masks_test.npy', ludb_masks_test),
)
for file_name, array in ludb_outputs:
    with open(file_name, 'wb') as f:
        np.save(f, array)

# **QT database**
https://physionet.org/content/qtdb/1.0.0/

In [None]:
#!wget -r -N -c -np https://physionet.org/files/qtdb/1.0.0/

In [None]:
def get_suitable_files(records, folder_path=None, extension='q1c'):
    """Keep only the records whose annotation file contains 'p', 'N' and 't'.

    Parameters
    ----------
    records : iterable of str
        Record names, relative to ``folder_path``.
    folder_path : str or None, optional
        Directory containing the records.  When omitted, the notebook-level
        ``folder_path`` variable is used, which keeps the original
        ``get_suitable_files(records)`` call working.  Passing it explicitly
        avoids depending on whichever database path was assigned last.
    extension : str, optional
        Annotation file extension to inspect (default ``'q1c'``).

    Returns
    -------
    list
        The records that carry all three wave symbols, in input order.  A
        message is printed for every record that is dropped.
    """
    if folder_path is None:
        # Legacy call style: fall back to the module-level variable.
        folder_path = globals()['folder_path']

    required_symbols = {'p', 'N', 't'}
    suitable_records = []
    for record in records:
        annotation = wfdb.rdann(os.path.join(folder_path, record), extension=extension)
        if required_symbols.issubset(annotation.symbol):
            suitable_records.append(record)
        else:
            print(f'File {record} does not fully annotated.')
    return suitable_records

def _longest_unannotated_run(labels):
    """Return the length of the longest run of consecutive zero labels."""
    is_gap = np.concatenate(([False], np.asarray(labels) == 0, [False]))
    # Indices where the gap / non-gap state flips: even hits open a run,
    # odd hits close it (the padding guarantees they come in pairs).
    edges = np.flatnonzero(is_gap[1:] != is_gap[:-1])
    if edges.size == 0:
        return 0
    return int((edges[1::2] - edges[::2]).max())


def get_data_without_gaps(signal, mask, size, signals, masks, gap=100):
    """Cut ``signal``/``mask`` into consecutive windows and keep the well-annotated ones.

    The arrays are split into non-overlapping windows of ``size`` samples,
    starting at index 0.  A window is appended to ``signals`` / ``masks`` only
    if it contains no run of more than ``gap`` consecutive zero (unannotated)
    mask samples.  A trailing remainder shorter than ``size`` is discarded.

    Compared with the previous loop this fixes two off-by-one errors: a final
    window ending exactly at the end of the array is no longer dropped, and a
    too-long gap that ends on the last sample of a window is now detected.

    Parameters
    ----------
    signal, mask : array-like
        Aligned signal samples and per-sample labels.
    size : int
        Window length in samples; must be positive.
    signals, masks : list
        Output lists, extended in place.
    gap : int, optional
        Longest tolerated run of unannotated samples (default 100).

    Raises
    ------
    ValueError
        If ``size`` is not positive (the old loop never terminated then).
    """
    if size <= 0:
        raise ValueError('size must be a positive number of samples')

    n_samples = len(mask)
    for start in range(0, n_samples - size + 1, size):
        end = start + size
        if _longest_unannotated_run(mask[start:end]) <= gap:
            signals.append(signal[start:end])
            masks.append(mask[start:end])

In [None]:
def create_QTDB_dataset(records, folder_path, train_fraction=0.8, seed=None,
                        size=3000, record_len=225000):
    """Read QTDB records and build train/test sets of fixed-size annotated windows.

    For every record, column 1 of ``p_signal`` is paired in turn with the
    ``q1c`` and ``pu1`` annotation files.  Each pairing is cropped to its
    annotated span and cut into windows by ``get_data_without_gaps``.  The
    split is made per record and its size is derived from ``len(records)``
    instead of assuming 98 records.

    Parameters
    ----------
    records : sequence of str
        Record names, relative to ``folder_path``.
    folder_path : str
        Directory containing the records.
    train_fraction : float, optional
        Share of records assigned to the training set (default 0.8).
    seed : int or None, optional
        Seed for a private ``RandomState``.  ``None`` (default) draws from the
        global ``np.random`` state, as before.
    size : int, optional
        Window length in samples (default 3000).
    record_len : int, optional
        Shorter signals are zero-padded up to this length (default 225000).

    Returns
    -------
    tuple of numpy.ndarray
        ``(signals_train, masks_train, signals_test, masks_test)``.
    """
    n_records = len(records)
    n_train = int(round(train_fraction * n_records))
    rng = np.random if seed is None else np.random.RandomState(seed)
    train_indexes = set(rng.permutation(n_records)[:n_train].tolist())

    signals_train, masks_train = [], []
    signals_test, masks_test = [], []

    for record_idx, name in enumerate(records):
        record_path = os.path.join(folder_path, name)
        record = wfdb.rdrecord(record_path)

        # The original notebook labels column 1 as the V2 lead.
        # NOTE(review): confirm that column 1 is the same lead in every record.
        lead = record.p_signal[:, 1]
        missing = record_len - lead.shape[0]
        if missing > 0:
            lead = np.pad(lead, (0, missing), mode='constant')

        if record_idx in train_indexes:
            signals_out, masks_out = signals_train, masks_train
        else:
            signals_out, masks_out = signals_test, masks_test

        for ext in ['q1c', 'pu1']:
            atr_lead = wfdb.rdann(record_path, extension=ext)
            atr_symbols = np.array(atr_lead.symbol)
            atr_points = atr_lead.sample

            mask = create_annotation_mask(atr_symbols, atr_points, lead.shape[0])

            annotated = np.flatnonzero(mask)
            if annotated.size == 0:
                # Nothing labelled in this annotation file: skip it instead of
                # failing with IndexError on the empty index array.
                continue
            # +1 so that the last annotated sample is part of the crop.
            start_idx, end_idx = annotated[0], annotated[-1] + 1

            get_data_without_gaps(lead[start_idx:end_idx], mask[start_idx:end_idx],
                                  size, signals_out, masks_out)

    return np.array(signals_train), np.array(masks_train), np.array(signals_test), np.array(masks_test)

In [None]:
# Local mirror of the QTDB files (see the commented-out wget cell for the source URL).
# Rebinding folder_path here matters: get_suitable_files reads it as a global.
folder_path = 'physionet.org/files/qtdb/1.0.0'

# RECORDS is read as a header-less, single-column table of record names.
records_file = os.path.join(folder_path, 'RECORDS')
all_records = pd.read_csv(records_file, names=['num'])['num'].values

# Keep only the records that get_suitable_files accepts.
records = get_suitable_files(all_records)
print('Число ЭКГ сигналов:', len(records))

File sel102 does not fully annotated.
File sel221 does not fully annotated.
File sel232 does not fully annotated.
File sel310 does not fully annotated.
File sel35 does not fully annotated.
File sel37 does not fully annotated.
File sel50 does not fully annotated.
Число ЭКГ сигналов: 98


In [None]:
# The train/test split inside create_QTDB_dataset is random; seed NumPy's
# global generator first so a fresh "Restart & Run All" reproduces the same
# split and therefore the same saved arrays.
np.random.seed(42)
qtdb_signals_train, qtdb_masks_train, qtdb_signals_test, qtdb_masks_test = create_QTDB_dataset(records, folder_path)
print('Размерность тренировочного датасета:', qtdb_signals_train.shape)
print('Размерность тестового датасета:', qtdb_signals_test.shape)

Размерность тренировочного датасета: (2551, 3000)
Размерность тестового датасета: (734, 3000)


In [None]:
# Write the four QTDB arrays to .npy files in the working directory.
qtdb_outputs = (
    ('qtdb_signals_train.npy', qtdb_signals_train),
    ('qtdb_signals_test.npy', qtdb_signals_test),
    ('qtdb_masks_train.npy', qtdb_masks_train),
    ('qtdb_masks_test.npy', qtdb_masks_test),
)
for file_name, array in qtdb_outputs:
    with open(file_name, 'wb') as f:
        np.save(f, array)