# Understanding the Data

In [7]:
import wfdb
import pandas as pd
import os

record_path = '/Users/evanzimm/GitHub/python-example-2025/code15_output/exams_part0/113'

# A WFDB record is a pair of files sharing one stem: <stem>.hea (header) and
# <stem>.dat (samples). Fail fast and name whichever file is missing, instead of
# printing a warning and then letting wfdb.rdrecord fail on the next line anyway.
missing_files = [record_path + ext for ext in ('.dat', '.hea') if not os.path.exists(record_path + ext)]
if missing_files:
    raise FileNotFoundError("Error: The .dat or .hea file is missing: " + ", ".join(missing_files))

# Read the .dat file along with the .hea (header) file using wfdb
# (the record path is passed without a file extension)
record = wfdb.rdrecord(record_path)

print(record.__dict__)

# NOTE: the signal samples are in record.p_signal; record.d_signal is None for this
# read (see the printed output), so build DataFrames from p_signal, not d_signal.


{'record_name': '113', 'n_sig': 12, 'fs': 400, 'counter_freq': None, 'base_counter': None, 'sig_len': 4096, 'base_time': None, 'base_date': None, 'comments': ['Age: 68', 'Sex: Female', 'Chagas label: False', 'Source: CODE-15%'], 'sig_name': ['I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6'], 'p_signal': array([[ 7.767,  2.82 , -4.947, ...,  3.826,  3.948,  4.367],
       [ 7.747,  2.801, -4.946, ...,  3.811,  3.934,  4.353],
       [ 7.713,  2.783, -4.93 , ...,  3.803,  3.921,  4.337],
       ...,
       [ 3.432,  0.98 , -2.452, ...,  1.839,  1.746,  2.304],
       [ 3.458,  0.984, -2.474, ...,  1.827,  1.733,  2.286],
       [ 3.48 ,  0.982, -2.499, ...,  1.818,  1.718,  2.276]]), 'd_signal': None, 'e_p_signal': None, 'e_d_signal': None, 'file_name': ['113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat'], 'fmt': ['16', '16', '16', '16', '16', '16', '16', '16', '16', '16', '16', '16']

Key Details from the record.__dict__ Output:

record_name: '113' This is the name of the ECG record (usually derived from the filename without extension).

n_sig: 12 The number of signals (ECG leads) in the dataset. This record has 12 leads, corresponding to the 12-lead ECG.

fs: 400 The sampling frequency (Hz) is 400, meaning that the signal is sampled 400 times per second.

sig_len: 4096 The length of the signal data (number of samples in each ECG lead).

sig_name: ['I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6'] The names of the 12 ECG leads.

comments: ['Age: 68', 'Sex: Female', 'Chagas label: False', 'Source: CODE-15%']

Additional information about the record, including patient demographics (age, sex) and labels (Chagas status).

p_signal: (Array of signal data) This is the actual ECG signal data in physical units, stored as a floating-point 2D array (float64 by default in wfdb) with shape (4096, 12) (i.e., 4096 samples, 12 leads).

d_signal: None This field is not populated in this case. It holds the raw digital (ADC) sample values and is only filled when the record is read with physical=False; with the default physical=True the samples are returned in p_signal instead.

adc_gain: [1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0] The gain applied to each of the ECG leads, here set to 1000 for each lead.

baseline: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

The baseline value for each lead, typically 0.

units: ['mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV', 'mV'] The units of the signals, which are in millivolts (mV) for each lead.

checksum: [28904, -4132, 32499, 18601, -9558, -23504, 27243, 15474, 28365, 22578, -12925, -14130]

And More...

In [93]:
# Build a DataFrame from the signal samples, labelling each column
# with its lead name (taken from the record's 'sig_name' attribute)
df = pd.DataFrame(record.p_signal, columns=record.sig_name)

# Preview the first few rows
print(df.head())

       I     II    III    AVR    AVL    AVF     V1     V2     V3     V4  \
0  7.767  2.820 -4.947 -5.290  6.357 -1.059  4.451  3.186  4.451  3.826   
1  7.747  2.801 -4.946 -5.270  6.342 -1.071  4.460  3.194  4.447  3.811   
2  7.713  2.783 -4.930 -5.247  6.320 -1.075  4.465  3.207  4.448  3.803   
3  7.682  2.766 -4.916 -5.225  6.299 -1.074  4.475  3.216  4.447  3.799   
4  7.660  2.752 -4.909 -5.206  6.281 -1.075  4.497  3.224  4.451  3.796   

      V5     V6  
0  3.948  4.367  
1  3.934  4.353  
2  3.921  4.337  
3  3.912  4.322  
4  3.910  4.322  


I am pretty sure all the data is scaled by a factor of 1000 but that is not important

In [94]:
print("DF Shape: " + str(df.shape))



DF Shape: (4096, 12)


In [95]:
print(df.tail())

          I     II    III    AVR    AVL    AVF     V1     V2     V3     V4  \
4091  3.376  0.981 -2.395 -2.179  2.883 -0.704  2.930  1.460  2.152  1.848   
4092  3.399  0.976 -2.424 -2.188  2.912 -0.724  2.927  1.448  2.149  1.845   
4093  3.432  0.980 -2.452 -2.203  2.940 -0.736  2.921  1.441  2.143  1.839   
4094  3.458  0.984 -2.474 -2.217  2.961 -0.745  2.907  1.432  2.131  1.827   
4095  3.480  0.982 -2.499 -2.228  2.986 -0.758  2.902  1.421  2.122  1.818   

         V5     V6  
4091  1.749  2.313  
4092  1.745  2.309  
4093  1.746  2.304  
4094  1.733  2.286  
4095  1.718  2.276  


In [96]:
# Check non-zero

# Per-lead count of samples that are exactly 0
zero_counts = (df == 0).sum()

print(zero_counts)

I      3
II     0
III    2
AVR    0
AVL    1
AVF    4
V1     0
V2     0
V3     0
V4     0
V5     0
V6     0
dtype: int64


In [97]:
# Missing Data?

# Per-lead count of NaN samples (isna is the same check as isnull)
missing_counts = df.isna().sum()
print(missing_counts)

I      0
II     0
III    0
AVR    0
AVL    0
AVF    0
V1     0
V2     0
V3     0
V4     0
V5     0
V6     0
dtype: int64


In [47]:
import h5py
import numpy as np

# Open read-only inside a context manager so the HDF5 file handle is released as
# soon as the inspection is done (a bare h5py.File(...) call is never closed).
with h5py.File('/Users/evanzimm/GitHub/python-example-2025/samitrop_input/exams.hdf5', 'r') as f:
    print(list(f.keys()))
    tracings = f['tracings']
    print(tracings.shape)
    print(tracings[3])

# NOTE: `f` and `tracings` are closed once the block above exits; reopen the file to read more.
# 'tracings' is the only key printed for this file, so there are no id/label datasets to load here.

['tracings']
(1631, 4096, 12)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [48]:
record_path = '/Users/evanzimm/GitHub/python-example-2025/samitrop_output/20129'

# Both <stem>.dat and <stem>.hea are required. Fail fast and name whichever file is
# missing, instead of printing a warning and then letting wfdb.rdrecord fail anyway.
missing_files = [record_path + ext for ext in ('.dat', '.hea') if not os.path.exists(record_path + ext)]
if missing_files:
    raise FileNotFoundError("Error: The .dat or .hea file is missing: " + ", ".join(missing_files))

# Read the .dat file along with the .hea (header) file using wfdb
# (the record path is passed without a file extension)
record = wfdb.rdrecord(record_path)

print(record.__dict__)


{'record_name': '20129', 'n_sig': 12, 'fs': 400, 'counter_freq': None, 'base_counter': None, 'sig_len': 2934, 'base_time': None, 'base_date': None, 'comments': ['Age: 65', 'Sex: Female', 'Chagas label: True', 'Source: SaMi-Trop'], 'sig_name': ['I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6'], 'p_signal': array([[-0.215, -0.147,  0.068, ...,  0.274, -0.127, -0.235],
       [-0.266, -0.195,  0.071, ...,  0.316, -0.175, -0.295],
       [-0.099, -0.066,  0.033, ...,  0.163, -0.004, -0.083],
       ...,
       [-0.238, -0.195,  0.043, ...,  0.28 , -0.156, -0.272],
       [-0.215, -0.166,  0.049, ...,  0.274, -0.137, -0.244],
       [-0.046, -0.035,  0.012, ...,  0.061, -0.029, -0.052]]), 'd_signal': None, 'e_p_signal': None, 'e_d_signal': None, 'file_name': ['20129.dat', '20129.dat', '20129.dat', '20129.dat', '20129.dat', '20129.dat', '20129.dat', '20129.dat', '20129.dat', '20129.dat', '20129.dat', '20129.dat'], 'fmt': ['16', '16', '16', '16', '16', '16', '16', '16

Some signals originally have a duration of 10 seconds (10 * 400 = 4000 samples) and others of 7 seconds (7 * 400 = 2800 samples). In order to make them all have the same size (4096 samples), we fill them with zeros on both sides. For instance, for a 7 seconds ECG signal with 2800 samples we include 648 samples at the beginning and 648 samples at the end, yielding 4096 samples that are then saved in the hdf5 dataset. 

By "signals" in this case, they mean all 12 leads together (i.e. one signal contains 12 leads; I don't think it means that one lead went for 10 seconds and another went for 7).

No Mixing of Durations Across Leads: It’s not like one lead is sampled for 7 seconds and another for 10 seconds. All leads have the same duration for each exam. The difference in duration (7 or 10 seconds) is across different exams, not across different leads in a single exam.

Same Sampling Rate for All Leads: All 12 leads are sampled at the same rate (400 Hz in this case) for the same duration. So, if the exam lasts 7 seconds or 10 seconds, all 12 leads have data sampled at the same intervals for that duration.

In your case, the signal represents the complete set of 12 leads for an individual exam, sampled at the same time over the duration of that exam. The key point is that all 12 leads in a single exam are aligned and sampled for the same duration, and the entire 12-lead data is treated as one signal in the dataset.

# How does this fit Optimal Stopping Theory, Optimal Sampling Theory, and Belief Networks?

Here is a rough idea:

With dynamic belief networks we can model how the conditional probabilities change over time. We are interested in predicting Chagas. 

We have 2800 samples or 4096 samples, etc., meaning that one observation has many time steps. That seems quite large. The solution could be to somehow come up with larger time intervals and then define discrete states from those intervals? Just need to define states over the samples.

To tie this into optimal stopping theory, maybe there is a way to predict Chagas before the final state? Tbh this doesn't seem to be very beneficial because we are talking about a 7 second window. Predicting after 7 seconds vs after 3 seconds doesn't seem that beneficial.

Maybe it doesn't need to be a dynamic network... (but maybe that contradicts my research interests)

# Coming up with a dataset

 The SaMi-Trop dataset contains only positive records (1,631 records from Brazil). The CODE-15 dataset also contains records from Brazil from around the same time, the early 2010s. Chagas labels are self-reported for that dataset. The PTB-XL dataset contains records from Europe in the 90's and all patients are likely negative.

 The positive and negative labels for the dataset should be split 50/50. To start, I will create a dataset that includes all the SaMi-Trop data (and positive labels from CODE-15?), as well as an equal number of records with negative labels from CODE-15. For these two datasets, most records have a sampling duration of 7.3 or 10.2 s, and a sampling frequency of 400 Hz. 

 Every record in the dataset should have the same duration and frequency because that makes it easy to come up with discrete states.

## Positive Labels in CODE-15

In [16]:
# Loop through the data in both CODE-15 output folders and find all the positive records
from pathlib import Path
import wfdb

def getPositiveRecords(path, debug_record='113'):
    """
    Loop through all the .hea files in a folder and find all the positive records.
    Return a list of filenames of all the positive records.

    Each header directly inside ``path`` is read with ``wfdb.rdheader``; a record
    counts as positive when one of its header comments is 'Chagas label: True'.

    path (str or Path): folder containing the .hea files
    debug_record (str or None): stem of one record whose path and parsed header are
        printed as a sanity check; pass None to print nothing. Defaults to '113',
        the record that used to be hard-coded.

    return: list of header filenames (e.g. '1300.hea') of the positive records,
        in directory-listing order
    """
    # Path handles both str and Path input and builds the record path portably
    directory = Path(path)

    positive_records = []

    for file in directory.iterdir():
        # Only header files carry the label; skip .dat files and sub-directories
        if not (file.is_file() and file.suffix == '.hea'):
            continue

        # wfdb expects the record name without its file extension
        record_name = str(directory / file.stem)
        show_debug = debug_record is not None and file.stem == debug_record

        if show_debug:
            print(record_name)

        header = wfdb.rdheader(record_name)

        if show_debug:
            print(header.__dict__)

        # Find the label by content rather than by position, so headers with a
        # different number or order of comment lines neither crash nor get misread
        comments = header.comments or []
        if any(comment.strip() == 'Chagas label: True' for comment in comments):
            positive_records.append(file.name)

    return positive_records

In [17]:
Code15_exams0_positive = getPositiveRecords('/Users/evanzimm/GitHub/python-example-2025/code15_output/exams_part0')

/Users/evanzimm/GitHub/python-example-2025/code15_output/exams_part0/113
{'record_name': '113', 'n_sig': 12, 'fs': 400, 'counter_freq': None, 'base_counter': None, 'sig_len': 4096, 'base_time': None, 'base_date': None, 'comments': ['Age: 68', 'Sex: Female', 'Chagas label: False', 'Source: CODE-15%'], 'sig_name': ['I', 'II', 'III', 'AVR', 'AVL', 'AVF', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6'], 'p_signal': None, 'd_signal': None, 'e_p_signal': None, 'e_d_signal': None, 'file_name': ['113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat', '113.dat'], 'fmt': ['16', '16', '16', '16', '16', '16', '16', '16', '16', '16', '16', '16'], 'samps_per_frame': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'skew': [None, None, None, None, None, None, None, None, None, None, None, None], 'byte_offset': [None, None, None, None, None, None, None, None, None, None, None, None], 'adc_gain': [1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0, 

In [19]:
print(Code15_exams0_positive)

['1009988.hea', '10163.hea', '1020143.hea', '102085.hea', '1020860.hea', '1020861.hea', '1045453.hea', '104832.hea', '1088268.hea', '1092558.hea', '1097244.hea', '1097957.hea', '1101228.hea', '1106740.hea', '1125189.hea', '1127506.hea', '113212.hea', '1133183.hea', '1134788.hea', '1136480.hea', '1138801.hea', '1140788.hea', '114173.hea', '1156277.hea', '115721.hea', '1172034.hea', '1172990.hea', '117915.hea', '1196487.hea', '1197846.hea', '1198535.hea', '1222567.hea', '1224050.hea', '1244617.hea', '1252491.hea', '1254816.hea', '126444.hea', '1268155.hea', '1271015.hea', '1272282.hea', '1279443.hea', '1284876.hea', '1291946.hea', '1296655.hea', '129732.hea', '1300.hea', '1302303.hea', '1306361.hea', '1317956.hea', '1319530.hea', '1325823.hea', '1330870.hea', '1346632.hea', '1348356.hea', '1371488.hea', '137977.hea', '1394331.hea', '1401767.hea', '1408718.hea', '142108.hea', '1429931.hea', '1456644.hea', '147125.hea', '1490631.hea', '1496954.hea', '1523972.hea', '1532651.hea', '153934.he

In [21]:
print(len(Code15_exams0_positive))

402


In [None]:
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import wfdb


# Function to process each file
def process_file(file, path):
    """
    Check a single file and report whether it is a Chagas-positive header.

    file (Path): candidate file; anything that is not an existing .hea file is ignored
    path (str or Path): folder that contains ``file``

    return: the header's filename (e.g. '1300.hea') when one of its comments is
        'Chagas label: True', otherwise None
    """
    # Only header files carry the label
    if not (file.is_file() and file.suffix == '.hea'):
        return None

    # wfdb.rdheader takes the record path without its extension; wrapping `path`
    # in Path lets callers pass either a str or a Path
    header = wfdb.rdheader(str(Path(path) / file.stem))

    # Find the label by content rather than by position in the comment list
    comments = header.comments or []
    if any(comment.strip() == 'Chagas label: True' for comment in comments):
        return file.name

    return None  # not a positive record

# Main function to get the positive records
def getPositiveRecordsOptimized(path):
    """
    Threaded variant of the positive-record scan.

    Collects every *.hea file in ``path``, hands each one to ``process_file`` on a
    pool of 8 worker threads, and returns the filenames that came back non-None,
    in the same order the files were listed.
    """
    directory = Path(path)
    header_files = list(directory.glob('*.hea'))

    def check(header_file):
        # Every file is checked against the same folder
        return process_file(header_file, directory)

    with ThreadPoolExecutor(max_workers=8) as pool:
        outcomes = list(pool.map(check, header_files))

    # process_file returns None for records that are not positive
    return [name for name in outcomes if name is not None]

    

In [9]:
Code15_exams0_positive_optimized = getPositiveRecordsOptimized('/Users/evanzimm/GitHub/python-example-2025/code15_output/exams_part0')

In [11]:
print(len(Code15_exams0_positive_optimized))

402


In [23]:
if(Code15_exams0_positive_optimized == Code15_exams0_positive):
    print("optimization works")

optimization works


It doesn't seem to actually make a big difference (over-engineered tbh).

In [24]:
Code15_exams1_positive = getPositiveRecordsOptimized('/Users/evanzimm/GitHub/python-example-2025/code15_output/exams_part1')

In [25]:
print(len(Code15_exams1_positive))

417


Positive examples = 417 + 402 + 800ish = around 1600. Now the question is what the sampling frequency should be?

First I need to start building this dataset in a folder. This will make it easier to compare meta-data across records.

In [27]:
import shutil
from pathlib import Path

def copy_files(file_list, source_dir, target_dir, other_suffix):
    """
    Copy files from the source directory to the target directory, including
    a file with the same stem but a different suffix.

    The target directory is created (with parents) if it does not exist yet.
    Missing files are reported with a printed message and skipped.

    :param file_list: List of filenames (with suffixes) to copy, e.g. ["113.hea"].
    :param source_dir: Path to the source directory (str or Path).
    :param target_dir: Path to the target directory (str or Path).
    :param other_suffix: Suffix of the companion file to copy alongside each listed
        file (e.g. ".dat" to also copy "113.dat" for "113.hea").
    :return: None
    """
    # Convert source and target directories to Path objects
    source_dir = Path(source_dir)
    target_dir = Path(target_dir)

    # shutil.copy fails if the destination folder is missing, so create it up front
    target_dir.mkdir(parents=True, exist_ok=True)

    for file_name in file_list:
        source_file = source_dir / file_name  # Full path of the primary file

        if not source_file.is_file():
            print(f"File {file_name} does not exist in the source directory.")
            continue

        # Copy the primary file to the target directory
        shutil.copy(source_file, target_dir / file_name)

        # The companion file shares the primary file's stem but carries other_suffix
        companion_name = f"{source_file.stem}{other_suffix}"
        companion_file = source_dir / companion_name

        if companion_file.is_file():
            shutil.copy(companion_file, target_dir / companion_name)
        else:
            print(f"Corresponding file with suffix '{other_suffix}' not found for {file_name}")


In [None]:
##### Copy positive records from code15 exam part 0 into dbn_dataset #####
# Code15_exams0_positive holds the .hea filenames of the positive records found
# in exams_part0; copy_files copies each header plus the file sharing its stem.

exams0_source_dir = "/Users/evanzimm/GitHub/python-example-2025/code15_output/exams_part0"  # folder the records are read from
target_dir = "/Users/evanzimm/GitHub/python-example-2025/dbn_dataset"  # folder the records are copied into
other_suffix = ".dat"  # extension of the companion file copied alongside each .hea

copy_files(Code15_exams0_positive, exams0_source_dir, target_dir, other_suffix)

# TODO: COPY THE FILES OVER (author's note - confirm this cell has actually been run)

In [None]:
##### Copy positive records from code15 exam part 1 into dbn_dataset #####
# Code15_exams1_positive holds the .hea filenames of the positive records found
# in exams_part1; copy_files copies each header plus the file sharing its stem.

exams1_source_dir = "/Users/evanzimm/GitHub/python-example-2025/code15_output/exams_part1"  # folder the records are read from
target_dir = "/Users/evanzimm/GitHub/python-example-2025/dbn_dataset"  # folder the records are copied into
other_suffix = ".dat"  # extension of the companion file copied alongside each .hea

copy_files(Code15_exams1_positive, exams1_source_dir, target_dir, other_suffix)

# TODO: COPY THE FILES OVER (author's note - confirm this cell has actually been run)

The next thing I need to consider is how to organize the data into states (timesteps) because DBNs operate on data in states. Different records have different sampling durations, most commonly 7.3 seconds or 10.2 seconds and a sampling frequency of 400Hz. Before I decide how to organize states, let's just group by and count records by sampling frequency (fs) and signal length (sig_len). 

In [19]:
from collections import defaultdict
from pathlib import Path

def countSamplingFrequency(path):
    """
    Group the header files of a folder by their sampling setup.

    path (str): folder containing the .hea files

    return: defaultdict(list) mapping (sampling frequency, sig_len) to the list of
        .hea filenames whose header reports that pair
    """
    groups = defaultdict(list)

    for entry in Path(path).iterdir():
        # Anything that is not a regular .hea file is skipped
        if entry.suffix != '.hea' or not entry.is_file():
            continue

        # wfdb takes the record path without its extension
        header_fields = wfdb.rdheader(path + '/' + entry.stem).__dict__

        group_key = (header_fields['fs'], header_fields['sig_len'])
        groups[group_key].append(entry.name)

    return groups


In [20]:
# Group the header files in dbn_dataset by (sampling frequency, sig_len)
dbn_dataset_path = "/Users/evanzimm/GitHub/python-example-2025/dbn_dataset"

dbn_dataset_samlping_counter = countSamplingFrequency(dbn_dataset_path)

In [21]:
# Print how many records fall into each (sampling frequency, sig_len) group
for group, header_files in dbn_dataset_samlping_counter.items():
    print(f"Length of group {group}: {len(header_files)}")

Length of group (400, 4096): 1084
Length of group (400, 2934): 2004
Length of group (400, 3265): 1
Length of group (400, 3270): 1
Length of group (400, 2325): 1
Length of group (400, 2600): 1
Length of group (400, 1912): 1
Length of group (400, 3960): 1
Length of group (400, 3564): 1
Length of group (400, 3242): 1
Length of group (400, 2980): 1
Length of group (400, 3345): 1
Length of group (400, 1792): 1
Length of group (400, 3874): 7
Length of group (400, 2601): 2
Length of group (400, 3024): 1
Length of group (400, 1924): 1
Length of group (400, 3994): 1
Length of group (400, 3628): 1
Length of group (400, 3021): 1
Length of group (400, 1834): 1
Length of group (400, 2769): 1
Length of group (400, 3841): 1
Length of group (400, 3196): 1
Length of group (400, 3457): 2
Length of group (400, 1872): 1
Length of group (400, 2978): 1
Length of group (400, 1574): 1
Length of group (400, 3900): 1
Length of group (400, 3068): 1
Length of group (400, 2878): 2
Length of group (400, 2730): 1
Le

In [22]:
sampling_freq_groups = dbn_dataset_samlping_counter.keys()
max(sampling_freq_groups, key= lambda x:x[1])

(400, 4096)

In [23]:
min(sampling_freq_groups, key= lambda x:x[1])

(400, 31)

In [24]:
print(dbn_dataset_samlping_counter[(400, 3874)])

['1480438.hea', '1544638.hea', '1675042.hea', '278648.hea', '2931032.hea', '830653.hea', '85479.hea']


In [25]:
# Any records without a sampling frequency of 400?
# Keys are (sampling frequency, sig_len); add up the group sizes whose frequency differs
not_400hz_count = sum(
    len(header_files)
    for (sampling_freq, _sig_len), header_files in dbn_dataset_samlping_counter.items()
    if sampling_freq != 400
)

print(f"Number of records without a sampling frequency of 400hz: {not_400hz_count}")

Number of records without a sampling frequency of 400hz: 0


In [26]:
# How many records with a signal length less than 2934
# Keys are (sampling frequency, sig_len); add up the sizes of the shorter groups
less_than_2934_count = sum(
    len(header_files)
    for (_sampling_freq, sig_len), header_files in dbn_dataset_samlping_counter.items()
    if sig_len < 2934
)


In [28]:
print("Number of records with a signal length less than 2934: " + str(less_than_2934_count))

Number of records with a signal length less than 2934: 78


In [29]:
# How many records are in dbn_dataset in total (all groups combined)

total_count = sum(len(header_files) for header_files in dbn_dataset_samlping_counter.values())

print(total_count) 

3268


DBNs are supposed to be good at handling missing data. In this case I don't have missing data in the sense that I have NaNs in my dataset, but I do have some signals that are longer than others? Can DBNs natively handle that or do I need to truncate? Or is it not a good idea to construct DBNs on missing data in the first place?

It is not strictly necessary to make observations equal-length for DBN training. ChatGPT 4.5 recommends keeping natural sequence lengths for classical DBNs. EM-based training naturally handles differing lengths. Also, it seems that real-world data will naturally come in varying lengths.

Now I need to copy over about 1634 negative examples to get a 50/50 split in this dataset.

Can I randomly pick out 1634 negative examples from Code15_exams_part0 ?

In [99]:
import random
import wfdb
from pathlib import Path

def get_negative_records(path, num_negative_records, seed=None):
    """
    Randomly select Chagas-negative records from a folder of WFDB headers.

    Uses reservoir sampling, so every negative record in the folder has the same
    chance of being selected without first collecting all of them.

    path (str or Path): folder containing the .hea files
    num_negative_records (int): how many negative records to select
    seed (int or None): when given, the selection is reproducible (a private
        random.Random(seed) is used); when None, the global `random` state is used

    return: list of .hea filenames; holds every negative record found (and a warning
        is printed) when the folder has fewer than num_negative_records of them
    """
    directory = Path(path)
    rng = random if seed is None else random.Random(seed)

    # The reservoir: at most num_negative_records filenames at any time
    negative_records = []

    # Number of negative records seen so far
    count = 0

    # Sorted so that a given seed always visits the files in the same order
    for file in sorted(directory.glob('*.hea')):
        # wfdb takes the record path without its extension
        header = wfdb.rdheader(str(directory / file.stem))

        # Find the label by content rather than by position in the comment list
        comments = header.comments or []
        if not any(comment.strip() == 'Chagas label: False' for comment in comments):
            continue

        count += 1
        if len(negative_records) < num_negative_records:
            # Fill the reservoir first
            negative_records.append(file.name)
        else:
            # The count-th record replaces a random slot with probability
            # num_negative_records / count, keeping the sample uniform
            replace_index = rng.randint(0, count - 1)
            if replace_index < num_negative_records:
                negative_records[replace_index] = file.name

    if count < num_negative_records:
        print(f"Warning: Only found {count} negative records. Selecting all of them.")

    return negative_records

# Draw the negative half of the dataset from CODE-15 exams_part0.
# NOTE: no random seed is set, so a re-run selects a different set of records.
path = "/Users/evanzimm/GitHub/python-example-2025/code15_output/exams_part0"  # folder scanned for negative records
num_negative_records = 1634  # number of negative records to select at random

selected_records = get_negative_records(path, num_negative_records)
# (the full list is long; it is printed in the next cell)


In [100]:
print(selected_records)

['1255707.hea', '97615.hea', '1000087.hea', '4267042.hea', '2869433.hea', '3080563.hea', '1000353.hea', '1432592.hea', '1393220.hea', '390595.hea', '929735.hea', '969611.hea', '1677062.hea', '1303757.hea', '3785466.hea', '1529808.hea', '744671.hea', '1315133.hea', '1420607.hea', '441745.hea', '2895172.hea', '3223494.hea', '1333446.hea', '1002915.hea', '4218706.hea', '3412125.hea', '2837563.hea', '1496205.hea', '1285889.hea', '394546.hea', '1003741.hea', '817751.hea', '598047.hea', '524291.hea', '1004472.hea', '32788.hea', '674603.hea', '309337.hea', '3055806.hea', '77882.hea', '2881403.hea', '336582.hea', '1572003.hea', '959156.hea', '2744462.hea', '3176703.hea', '215284.hea', '1601181.hea', '1006072.hea', '993611.hea', '990724.hea', '974104.hea', '974790.hea', '766321.hea', '358817.hea', '160779.hea', '1250142.hea', '508699.hea', '10071.hea', '186032.hea', '756.hea', '1251058.hea', '1007363.hea', '3201007.hea', '1375869.hea', '1337168.hea', '2918309.hea', '1007979.hea', '3794324.hea',

In [101]:
print(len(selected_records))

1634


In [102]:
# Copy the selected negative records (each .hea plus its corresponding .dat file) into dbn_dataset

exams0_source_dir = "/Users/evanzimm/GitHub/python-example-2025/code15_output/exams_part0"  # folder the records are read from
target_dir = "/Users/evanzimm/GitHub/python-example-2025/dbn_dataset"  # folder the records are copied into
other_suffix = ".dat"  # extension of the companion file copied alongside each .hea

copy_files(selected_records, exams0_source_dir, target_dir, other_suffix)

In [103]:
from pathlib import Path

def file_exists(file_path):
    """Return True when something (file or directory) exists at file_path."""
    candidate = Path(file_path)
    return candidate.exists()

# Spot-check that one copied record made it into dbn_dataset
hea_file_path = "/Users/evanzimm/GitHub/python-example-2025/dbn_dataset/1255707.hea"
dat_file_path = "/Users/evanzimm/GitHub/python-example-2025/dbn_dataset/1255707.dat"

# Report on the header first, then on its .dat file
for candidate_path in (hea_file_path, dat_file_path):
    if file_exists(candidate_path):
        print(f"{candidate_path} exists.")
    else:
        print(f"{candidate_path} does not exist.")


/Users/evanzimm/GitHub/python-example-2025/dbn_dataset/1255707.hea exists.
/Users/evanzimm/GitHub/python-example-2025/dbn_dataset/1255707.dat exists.


Now I need to discretize states, but how? One idea is to break down an observation into multiple independent segments, but that doesn't seem to be right. I am dealing with high-frequency data but I want to make it lower frequency to start.

Idea: Aggregate or downsample the data from high frequency into discrete steps (e.g. one second). This is a common, valid, and practical approach.

This will simplify training, reduce noise, and preserve relevant information.

I need to write a function that takes in an observation and returns the aggregated version of the observation.

'p_signal': array([[ 7.767,  2.82 , -4.947, ...,  3.826,  3.948,  4.367], ..., []])

In [113]:
def temporalAggregation(record_path, aggregation_interval):
    """
    Given a record path, read the signal and temporally aggregate it by averaging
    the samples of every lead over consecutive windows of aggregation_interval seconds.

    record_path (str): path to the record without the file extension at the end
    aggregation_interval (int or float): length of each aggregation window in seconds;
        must be positive

    return: agg_df (DataFrame) with one row per window: one column per lead holding
        the window mean, plus an 'interval' column with the 0-based window number.
        The last window is shorter when the signal length is not a multiple of the
        window size; it is averaged over the samples it does contain.

    raises: ValueError if aggregation_interval is not positive
    """
    if aggregation_interval <= 0:
        raise ValueError("aggregation_interval must be a positive number of seconds")

    # take the record path and read the record
    record = wfdb.rdrecord(record_path)

    # Use the record's own sampling frequency rather than assuming 400 Hz, and keep
    # the window size a whole number of samples (at least one) so that fractional
    # intervals still work
    samples_per_interval = max(1, int(round(record.fs * aggregation_interval)))

    # one column per lead, named after the record's 'sig_name' attribute
    signal_df = pd.DataFrame(record.p_signal, columns=record.sig_name)

    # Sample i belongs to window i // samples_per_interval
    window_of_sample = np.arange(len(signal_df)) // samples_per_interval
    agg_df = signal_df.groupby(window_of_sample).mean().reset_index(drop=True)

    # Windows are contiguous and start at 0, so the row number is the window number
    agg_df['interval'] = np.arange(len(agg_df))

    return agg_df

In [121]:
test_df = temporalAggregation('/Users/evanzimm/GitHub/python-example-2025/dbn_dataset/1255707', 1)
test_df

Unnamed: 0,I,II,III,AVR,AVL,AVF,V1,V2,V3,V4,V5,V6,interval
0,-0.004715,-0.011178,-0.006485,0.012922,0.00339,-0.006375,0.00055,-0.007457,-0.007938,-0.0121,-0.011545,-0.006265,0
1,0.00021,0.002015,0.001793,0.00358,0.00156,0.004225,-0.000748,-0.007468,-0.007575,-0.010235,-0.005748,-0.004732,1
2,0.000538,-0.005,-0.00554,0.007125,0.005605,-0.00291,-0.006038,-0.009502,-0.007537,-0.015037,-0.013873,-0.010333,2
3,-0.002655,-0.00198,0.00068,0.007118,0.000695,0.001755,-0.0003,-0.001008,0.00799,-0.004578,-0.001955,-0.004152,3
4,-0.004407,-0.03048,-0.02607,0.02222,0.01304,-0.025728,0.010348,-0.002383,-0.001537,-0.016783,-0.012538,-0.005765,4
5,0.02261,0.043335,0.020735,-0.028607,0.00298,0.034365,-0.033378,-0.003175,-0.037447,0.047332,0.048808,0.044408,5
6,-0.004545,-0.026642,-0.022078,0.020652,0.011325,-0.021882,0.002265,-0.00964,-0.00256,-0.02952,-0.028395,-0.031308,6
7,-0.002134,0.05497,0.057104,-0.02159,-0.02709,0.058396,0.025694,0.102433,0.149216,0.096201,0.050478,0.033373,7


I have code to break an observation into one second intervals but I need to add code to add gender, age, and Chagas at each interval

In [126]:
def addGenderAgeChagas(df, record_path):
    """
    Given a dataframe of temporally aggregated data, add gender, age, and chagas columns.

    The values come from the record's header comments, e.g.
    ['Age: 68', 'Sex: Female', 'Chagas label: False', 'Source: CODE-15%'],
    and are repeated on every row.

    df (DataFrame): aggregated data; it is modified in place and also returned
    record_path (str): path to the record without the file extension at the end

    return: df with three extra columns:
        age (int), gender (0 for 'Male', 1 otherwise),
        chagas (0 for 'False', 1 otherwise)

    raises: ValueError if the header has no 'Age', 'Sex' or 'Chagas label' comment
    """
    # Only the header is needed for the comments, so skip loading the signal itself
    header = wfdb.rdheader(record_path)

    # Parse the 'Key: value' comments into a dict so the lookup does not depend on
    # the order of the comment lines or on the exact spacing around the colon
    fields = {}
    for comment in header.comments or []:
        key, separator, value = comment.partition(':')
        if separator:
            fields[key.strip()] = value.strip()

    missing = [key for key in ('Age', 'Sex', 'Chagas label') if key not in fields]
    if missing:
        raise ValueError(f"Header comments of {record_path} are missing: {', '.join(missing)}")

    age = int(fields['Age'])
    gender = 0 if fields['Sex'] == 'Male' else 1
    chagas = 0 if fields['Chagas label'] == 'False' else 1

    # A scalar is broadcast to every row
    df['age'] = age
    df['gender'] = gender
    df['chagas'] = chagas

    return df



In [127]:
test_df = addGenderAgeChagas(test_df, '/Users/evanzimm/GitHub/python-example-2025/dbn_dataset/1255707')
test_df

Unnamed: 0,I,II,III,AVR,AVL,AVF,V1,V2,V3,V4,V5,V6,interval,age,gender,chagas
0,-0.004715,-0.011178,-0.006485,0.012922,0.00339,-0.006375,0.00055,-0.007457,-0.007938,-0.0121,-0.011545,-0.006265,0,72,1,0
1,0.00021,0.002015,0.001793,0.00358,0.00156,0.004225,-0.000748,-0.007468,-0.007575,-0.010235,-0.005748,-0.004732,1,72,1,0
2,0.000538,-0.005,-0.00554,0.007125,0.005605,-0.00291,-0.006038,-0.009502,-0.007537,-0.015037,-0.013873,-0.010333,2,72,1,0
3,-0.002655,-0.00198,0.00068,0.007118,0.000695,0.001755,-0.0003,-0.001008,0.00799,-0.004578,-0.001955,-0.004152,3,72,1,0
4,-0.004407,-0.03048,-0.02607,0.02222,0.01304,-0.025728,0.010348,-0.002383,-0.001537,-0.016783,-0.012538,-0.005765,4,72,1,0
5,0.02261,0.043335,0.020735,-0.028607,0.00298,0.034365,-0.033378,-0.003175,-0.037447,0.047332,0.048808,0.044408,5,72,1,0
6,-0.004545,-0.026642,-0.022078,0.020652,0.011325,-0.021882,0.002265,-0.00964,-0.00256,-0.02952,-0.028395,-0.031308,6,72,1,0
7,-0.002134,0.05497,0.057104,-0.02159,-0.02709,0.058396,0.025694,0.102433,0.149216,0.096201,0.050478,0.033373,7,72,1,0


I can use this code to process records for building the DBN. Now I have to think about how to build the DBN.