<a href="https://colab.research.google.com/github/EML-Labs/Dataset/blob/main/Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install wfdb



In [8]:
import os
import os
import wfdb
import numpy as np
import glob
import csv

In [30]:
# Mount Google Drive at /content/drive; the dataset paths used in this
# notebook all live under that mount point.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
def segment_and_label(record_path, record_name, label, window_sec=30, overlap=0.0, save_dir='segments', rows=None):
    """Cut one WFDB record into fixed-length windows and save each as a ``.npy`` file.

    The record is read with ``wfdb.rdrecord`` and its ``p_signal`` array is
    sliced along the first axis. Every saved window is also registered as a
    ``[filename, label]`` row in a manifest list so a labels CSV can be
    written afterwards.

    Parameters
    ----------
    record_path : str
        Directory that contains the record.
    record_name : str
        Record name without extension; also the prefix of the segment files
        (``<record_name>_segNNN.npy``).
    label : int
        Label stored next to every segment of this record.
    window_sec : float, default 30
        Window length in seconds (converted to samples with the record's ``fs``).
    overlap : float, default 0.0
        Fraction of a window shared by two consecutive windows. Must leave a
        step of at least one sample, i.e. it has to be smaller than 1.
    save_dir : str, default 'segments'
        Output directory; created when missing.
    rows : list, optional
        Manifest to append ``[filename, label]`` rows to. When omitted, the
        notebook-level ``csv_rows`` list is used, which must then exist.

    Returns
    -------
    int
        Number of segments written. This is 0 when the record could not be
        processed at all and may be a partial count if a failure happened
        midway through the record.

    Notes
    -----
    Errors are reported with ``print`` and not re-raised, so that one bad
    record does not abort a batch run over many records.
    """
    saved = 0
    try:
        # Resolve the manifest before writing anything, so a missing
        # `csv_rows` cannot leave segment files without a manifest entry.
        manifest = csv_rows if rows is None else rows

        record = wfdb.rdrecord(os.path.join(record_path, record_name))
        signal = record.p_signal
        fs = record.fs
        window_size = int(window_sec * fs)
        # round() instead of int(): `1 - overlap` is inexact in binary floating
        # point (1 - 0.9 == 0.09999999999999998), so truncating would shorten
        # the step by one sample, or even make it 0 for short windows.
        step_size = int(round(window_size * (1 - overlap)))
        if window_size < 1 or step_size < 1:
            raise ValueError(
                f"window_sec={window_sec} and overlap={overlap} give a window of "
                f"{window_size} samples and a step of {step_size} samples; both must be >= 1"
            )

        os.makedirs(save_dir, exist_ok=True)

        # Only full windows are kept; a trailing partial window is dropped.
        for start in range(0, len(signal) - window_size + 1, step_size):
            segment = signal[start:start + window_size]
            fname = f"{record_name}_seg{saved:03d}.npy"
            np.save(os.path.join(save_dir, fname), segment)
            manifest.append([fname, label])
            saved += 1

        print(f"Saved {saved} segments for {record_name} with label {label}")
    except Exception as e:
        # Deliberate best-effort: report and move on to the next record.
        print(f"Failed to process {record_name}: {e}")
    return saved

In [35]:
# Folder holding the raw WFDB records (*.hea and companion files).
raw_data_dir = "/content/drive/Shareddrives/Datasets/1.0.0"  # Adjust the path according to the drive
# Folder that receives the .npy segments and labels.csv. It has to be the same
# folder the dataset-loading cell reads from (its `segment_dir`), otherwise a
# full run of the notebook never reads back what it has just written.
segment_save_dir = "/content/drive/MyDrive/Datasets/1.0.0_processed"  # Adjust the path according to the drive
os.makedirs(segment_save_dir, exist_ok=True)


label_file = os.path.join(segment_save_dir, 'labels.csv')

# Manifest of [filename, label] rows, starting with the CSV header.
# Re-run this cell before re-running the segmentation loop, otherwise rows
# from the previous run stay in the list and end up duplicated in labels.csv.
csv_rows = [['filename', 'label']]

In [36]:
# 5. Get list of records starting with 'p', excluding those ending with 'c'

hea_files = sorted(glob.glob(os.path.join(raw_data_dir, 'p*.hea')))
# The 'c' suffix belongs to the record name, so the extension is stripped
# before testing it: the full file name always ends in '.hea' and would never
# match, which let every 'c' record through.
record_names = [
    name
    for name in (os.path.splitext(os.path.basename(f))[0] for f in hea_files)
    if not name.endswith('c')
]

print(f"Found {len(record_names)} records.")

Found 0 records.


In [24]:
# Segment every listed record; its label is derived from the record number
for record_name in record_names:
    try:
        # Digits after the leading 'p'; int() raises ValueError on anything else
        rec_num = int(record_name[1:])
        # Even = pre-af (1), Odd = non-af (0)
        label = int(rec_num % 2 == 0)
        segment_and_label(
            raw_data_dir,
            record_name,
            label,
            window_sec=30,
            overlap=0.0,
            save_dir=segment_save_dir,
        )
    except ValueError:
        print(f"Skipping malformed record name: {record_name}")


In [25]:
# Persist the collected [filename, label] rows (header included) as labels.csv
with open(label_file, 'w', newline='') as labels_fh:
    csv.writer(labels_fh).writerows(csv_rows)

print(f"Saved label file to: {label_file}")


Saved label file to: /content/drive/MyDrive/Datasets/1.0.0_pro/labels.csv


In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np
import os
import pandas as pd

class ECGSegmentDataset(Dataset):
    """Map-style dataset that loads one saved ``.npy`` ECG segment per item.

    ``df`` must provide a ``'filename'`` column (file name relative to
    ``segment_dir``) and a ``'label'`` column. Its index is reset so items are
    addressed by position. ``transform``, when given, is applied to the
    signal tensor only.
    """

    def __init__(self, df, segment_dir, transform=None):
        self.segment_dir = segment_dir
        self.transform = transform
        # Positional 0..n-1 index, regardless of how the frame was sliced
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(signal, label)`` for the row at position ``idx``."""
        entry = self.df.iloc[idx]

        raw = np.load(os.path.join(self.segment_dir, entry['filename']))
        samples = torch.from_numpy(raw.astype(np.float32))
        # Swap the first two axes; files are assumed to be stored as
        # [samples, channels], which yields [channels, samples] here
        ecg_tensor = torch.transpose(samples, 0, 1)
        label = torch.tensor(entry['label'], dtype=torch.long)

        if self.transform:
            ecg_tensor = self.transform(ecg_tensor)

        return ecg_tensor, label



In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [None]:
# Folder containing the saved .npy segments and their labels.csv manifest.
segment_dir = '/content/drive/MyDrive/Datasets/1.0.0_processed'
label_csv = os.path.join(segment_dir, 'labels.csv')

# Load labels
df = pd.read_csv(label_csv)

# Stratified split
# 90/10 split with a fixed seed; stratifying on 'label' keeps the class ratio
# the same in both parts.
# NOTE(review): rows look like individual segments named <record>_segNNN.npy,
# so a row-wise split can place segments of the same record in both train and
# test — confirm this is acceptable or split by record instead.
train_df, test_df = train_test_split(df, test_size=0.1, stratify=df['label'], random_state=42)

# Print class balance
print("Train label distribution:\n", train_df['label'].value_counts())
print("Test label distribution:\n", test_df['label'].value_counts())

Train label distribution:
 label
0    1350
1    1350
Name: count, dtype: int64
Test label distribution:
 label
0    150
1    150
Name: count, dtype: int64


In [None]:
# One dataset per split, both reading segment files from the same folder
train_dataset, test_dataset = (
    ECGSegmentDataset(split_df, segment_dir) for split_df in (train_df, test_df)
)



In [None]:
# Folder that receives the train/test manifests.
# Named `split_dir` rather than `dir` so the built-in dir() is not shadowed
# for the rest of the notebook session.
split_dir = '/content/drive/MyDrive/Datasets/2.0'

os.makedirs(split_dir, exist_ok=True)


test_csv_path = os.path.join(split_dir, 'test.csv')
train_csv_path = os.path.join(split_dir, 'train.csv')

# index=False: only the data columns are written, not the DataFrame index.
train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)


