Each timestep = beat, anchored on R-peaks

Each observation = binary feature vector indicating if certain wave events (like P/QRS/T onsets or offsets) fall within a window around that beat

The target latent state = presence/absence of a condition (if labeled)

# Build the Dataset

## Load the Data

In [1]:
import pandas as pd
import numpy as np

# Load intervals CSV
interval_df = pd.read_csv('neurokit_delineation/output/DBN_ecg_all_features.csv')

In [2]:
interval_df.head()

Unnamed: 0,p_peaks,p_onsets,p_offsets,q_peaks,r_onsets,r_offsets,s_peaks,t_peaks,t_onsets,t_offsets,sample_idx,lead,heart_rate,r_peaks,pr_interval,qrs_complex,qt_interval,rr_interval,st_segment
0,0.101,"[nan, 838, 1498, 2168, 2822, 3488, 4160, 4824,...","[188, 886, 1574, 2250, 2874, 3588, 4268, 4932,...",-0.135,"[266, 912, 1586, 2266, 2886, 3596, 4194, 4854,...","[352, 1014, 1726, 2368, 3044, 3712, 4390, 5028...",-0.223,0.174,"[474, 1046, 1808, 2482, 3142, 3806, 4470, 5116...","[566, 1078, 1914, 2566, 3234, 3904, 4572, 5240...",0,II,72.412,[ 153 490 824 1154 1485 1819 2154 2485 2805],83.0,134.25,314.25,663.0,71.778
1,0.124,"[556, 1480, 2534, 3538, 4568, 5554]","[652, 1574, 2626, 3648, 4664, 5640]",-0.152,"[610, 1540, 2588, 3604, 4622, nan]","[778, nan, 2746, 3768, 4780, nan]",-0.159,0.419,"[922, 1830, 2868, 3886, 4922, nan]","[1030, 1946, 3004, 4018, 5036, nan]",1,II,48.178,[ 357 818 1345 1852 2361 2852],95.667,162.0,414.0,998.0,131.5
2,0.037,"[232, 932, 1644, 2278, 2942, 3592, 4180, 4742,...","[304, 996, 1708, 2346, 3004, 3656, 4242, 4808,...",-0.221,"[336, 1034, 1746, 2382, 3038, 3690, 4264, 4836...","[420, 1118, 1832, 2468, 3124, 3776, 4362, 4928...",-0.273,0.578,"[524, 1220, 1930, 2566, 3228, 3878, 4460, 5024...","[616, 1314, 2028, 2664, 3322, 3972, 4556, 5118...",2,II,75.278,[ 190 539 894 1213 1542 1867 2160 2442 2753],65.556,87.75,283.0,640.75,100.222
3,0.177,"[550, 1300, 2088, 2914, 3702, 4494, 5322]","[634, 1386, 2180, 2996, 3804, 4586, 5408]",-0.241,"[658, 1418, 2198, 3028, 3826, 4618, nan]","[740, 1514, 2308, 3134, 3910, 4728, 5530]",-0.201,0.174,"[880, 1622, 2458, 3284, 4064, 4850, 5692]","[992, 1744, 2552, 3348, 4166, 4904, 5782]",3,II,60.362,[ 352 730 1128 1536 1935 2330 2739],89.143,98.0,326.667,795.667,140.857
4,0.173,"[nan, 632, nan, nan, 4782]","[nan, 676, 1944, nan, nan]",,"[nan, nan, nan, nan, nan]","[nan, nan, nan, 4730, nan]",-5.897,0.237,"[nan, nan, nan, 4714, nan]","[nan, nan, nan, nan, nan]",4,II,57.845,[ 210 514 986 2344 2581],44.0,,,1185.5,-16.0


In [3]:
interval_df.shape

(3163, 19)

In [4]:
interval_df.tail()

Unnamed: 0,p_peaks,p_onsets,p_offsets,q_peaks,r_onsets,r_offsets,s_peaks,t_peaks,t_onsets,t_offsets,sample_idx,lead,heart_rate,r_peaks,pr_interval,qrs_complex,qt_interval,rr_interval,st_segment
3158,0.213,"[274, 852, 1514, 2214, 2960, 3636, 4262, 4946,...","[352, 966, 1588, 2288, 3030, 3712, 4338, 5020,...",-0.251,"[312, 950, 1628, 2330, 3056, 3682, 4382, 5042,...","[nan, 1142, nan, 2406, 3154, 3834, 4462, 5140,...",-0.383,0.526,"[nan, 1106, nan, 2586, 3334, 3994, 4636, 5320,...","[nan, nan, nan, 2666, 3396, 4070, 4698, 5386, ...",3185,II,72.147,[ 215 531 836 1187 1559 1898 2212 2551 2885],78.889,116.0,344.8,667.5,139.667
3159,0.158,"[152, 804, 1438, 2090, 2752, 3426, 4102, 4754,...","[250, 896, 1530, 2184, 2848, 3522, 4200, 4848,...",-0.188,"[268, 920, 1488, 2214, 2880, 3476, 4154, 4880,...","[nan, 1164, 1794, 2450, 2992, 3684, 4356, 4982...",-0.138,0.266,"[454, 1102, 1742, 2392, 3050, 3722, 4410, 5046...","[570, 1222, 1852, 2504, 3168, 3840, 4518, 5172...",3186,II,73.46,[ 163 487 803 1130 1462 1798 2137 2461 2778],95.333,201.429,320.75,653.75,11.5
3160,0.207,"[498, 1358, 2272, 3178, 4072, 5036]","[568, 1442, 2360, 3270, 4162, 5116]",-0.031,"[538, 1412, 2326, 3238, 4128, nan]","[684, 1560, 2476, 3386, 4276, 5234]",-0.755,0.03,"[814, 1652, 2528, 3514, 4402, 5318]","[894, 1704, 2682, 3594, 4484, 5440]",3187,II,52.809,[ 318 756 1214 1668 2114 2592],84.0,148.0,343.2,909.6,102.0
3161,-0.052,"[416, 1264, 2102, 2920, 3736, 4552, 5394]","[514, 1360, 2196, 3010, 3832, 4652, 5486]",-0.21,"[480, 1380, 2216, 3032, 3852, 4612, nan]","[646, 1488, 2326, 3138, 3958, 4782, 5612]",-0.352,0.802,"[730, 1568, 2406, 3216, 4040, 4860, 5694]","[858, 1698, 2532, 3348, 4168, 4988, 5818]",3188,II,58.025,[ 296 716 1134 1541 1951 2361 2778],95.143,127.667,336.667,827.333,80.571
3162,0.149,"[464, 998, 1408, 2046, 2578, 3078, 3624, 3964,...","[558, 1084, 1464, 2140, 2660, 3168, 3674, 4024...",-0.182,"[602, 1128, 1440, 2188, 2710, 3220, 3718, 4092...","[nan, 1242, nan, 2268, nan, 3326, 3844, 4186, ...",-0.263,0.053,"[nan, 1316, nan, 2286, nan, 3364, 3968, 4228, ...","[nan, 1354, nan, 2342, nan, 3394, 4024, 4394, ...",3189,II,95.405,[ 324 589 764 1116 1378 1631 1886 2075 2415 ...,75.8,106.333,219.667,524.889,34.286


In [5]:
interval_df.columns

Index(['p_peaks', 'p_onsets', 'p_offsets', 'q_peaks', 'r_onsets', 'r_offsets',
       's_peaks', 't_peaks', 't_onsets', 't_offsets', 'sample_idx', 'lead',
       'heart_rate', 'r_peaks', 'pr_interval', 'qrs_complex', 'qt_interval',
       'rr_interval', 'st_segment'],
      dtype='object')

In [6]:
interval_df.dtypes

p_peaks        float64
p_onsets        object
p_offsets       object
q_peaks        float64
r_onsets        object
r_offsets       object
s_peaks        float64
t_peaks        float64
t_onsets        object
t_offsets       object
sample_idx       int64
lead            object
heart_rate     float64
r_peaks         object
pr_interval    float64
qrs_complex    float64
qt_interval    float64
rr_interval    float64
st_segment     float64
dtype: object

In [7]:
import numpy as np

def parse_array_str(s):
    """
    Parse a string representing an array of numbers into a NumPy array.
    The string is expected to be of the form "[ 153 490 824 ...]" or "[153, 490, 824, ...]".
    """
    # Remove the surrounding brackets and any leading/trailing whitespace
    s = s.strip().strip('[]')
    
    # Replace commas with a space, if present
    s = s.replace(',', ' ')
    
    # Split the string by whitespace to extract the numbers
    number_strings = [num for num in s.split() if num]
    
    # Convert the list of strings to a numpy array of floats (or ints if appropriate)
    try:
        arr = np.array([float(num) for num in number_strings])
        # Optionally convert to int if all values are integers
        if np.all(arr.astype(int) == arr):
            arr = arr.astype(int)
    except Exception as e:
        print("Error parsing string to array:", s, e)
        arr = np.array([])
    
    return arr

# Example usage:
s = "[ 153 490 824 1154 1485 1819 2154 2485 2805]"
parsed = parse_array_str(s)
print(parsed)


[ 153  490  824 1154 1485 1819 2154 2485 2805]


In [8]:
type(parsed)

numpy.ndarray

In [9]:
interval_df['t_onsets'].dtype

dtype('O')

In [10]:
# For a given column, say 'p_onsets', convert each cell to a numpy array
interval_df['p_onsets'] = interval_df['p_onsets'].apply(parse_array_str)
interval_df['p_offsets'] = interval_df['p_offsets'].apply(parse_array_str)
interval_df['r_peaks'] = interval_df['r_peaks'].apply(parse_array_str)
interval_df['r_onsets'] = interval_df['r_onsets'].apply(parse_array_str)
interval_df['r_offsets'] = interval_df['r_offsets'].apply(parse_array_str)

  if np.all(arr.astype(int) == arr):


In [11]:
interval_df.head()['p_onsets'][0]

array([  nan,  838., 1498., 2168., 2822., 3488., 4160., 4824., 5468.])

In [12]:
type(interval_df.head()['t_onsets'][0])

str

In [13]:
interval_df['t_onsets'] = interval_df['t_onsets'].apply(parse_array_str)
interval_df['t_offsets'] = interval_df['t_offsets'].apply(parse_array_str)

  if np.all(arr.astype(int) == arr):


### Beat-wise Window Logic

Create a window around each R-peak (+- x), then check which wave events fall inside the window.  

+- 150ms at 400hz = 150 * 400/1000 = 60 samples.

Create a binary feature vector for each beat.

In [14]:
def get_beatwise_features( 
        p_onsets, p_offsets,  
        r_onsets, r_offsets,  
        t_onsets, t_offsets,
        window_ms=150
    ):
    """
    Returns a list of dictionaries, each representing one beat.
    Each dict has binary indicators for whether an event is within the defined window
    around the r_onset of that beat.
    """
    
    """
    The use of & requires that for each individual element, 
    both conditions must be true; but np.any() then only requires at least one of those elements to pass the test.
    """
    def in_window(event_array, center, window_ms):
        if event_array is None or len(event_array) == 0:
            return 0
        return int(np.any((event_array >= center - window_ms) & (event_array <= center + window_ms)))
    
    beat_features = []
    
    for r in r_onsets: # for each onset in r_onsets for one ecg reading, check if other onsets/offsets occur around that r
        feat = {
            'p_onset_present': in_window(p_onsets, r, window_ms),
            'p_offset_present': in_window(p_offsets, r, window_ms),
            'r_onset_present': 1,  # always 1 since we're anchoring on it
            'r_offset_present': in_window(r_offsets, r, window_ms),
            't_onset_present': in_window(t_onsets, r, window_ms),
            't_offset_present': in_window(t_offsets, r, window_ms)
        }
        beat_features.append(feat)
    
    return beat_features


In [15]:
all_beats_features = []

for idx, row in interval_df.iterrows():
    p_onsets = row['p_onsets']  
    p_offsets = row['p_offsets']  
    # q_peaks = row['q_peaks'] just a float
    r_onsets = row['r_onsets']
    r_offsets = row['r_offsets']
    # s_peaks = row['s_peaks'] just a float
    # t_peaks = row['t_peaks'] just a float
    t_onsets = row['t_onsets']
    t_offsets = row['t_offsets']
    # r_peaks = row['r_peaks'] # not a time series (in mV)
    
    # Generate beat-wise features
    beat_feats = get_beatwise_features( 
        p_onsets, p_offsets,  
        r_onsets, r_offsets,  
        t_onsets, t_offsets,
        window_ms=150
    )
    
    # Add e.g. the 'lead' or 'sample_idx' if you want to keep track
    for b in beat_feats:
        b['row_idx'] = idx
        b['lead'] = row['lead']
    
    all_beats_features.extend(beat_feats)

# Convert to a DataFrame of beat-wise features
beatwise_df = pd.DataFrame(all_beats_features)
beatwise_df.head()


Unnamed: 0,p_onset_present,p_offset_present,r_onset_present,r_offset_present,t_onset_present,t_offset_present,row_idx,lead
0,0,1,1,1,0,0,0,II
1,1,1,1,1,1,0,0,II
2,1,1,1,1,0,0,0,II
3,1,1,1,1,0,0,0,II
4,1,1,1,0,0,0,0,II


In [16]:
beatwise_df.head(20)

Unnamed: 0,p_onset_present,p_offset_present,r_onset_present,r_offset_present,t_onset_present,t_offset_present,row_idx,lead
0,0,1,1,1,0,0,0,II
1,1,1,1,1,1,0,0,II
2,1,1,1,1,0,0,0,II
3,1,1,1,1,0,0,0,II
4,1,1,1,0,0,0,0,II
5,1,1,1,1,0,0,0,II
6,1,1,1,0,0,0,0,II
7,1,1,1,0,0,0,0,II
8,0,0,1,0,0,0,0,II
9,1,1,1,0,0,0,1,II


Each row corresponds to the occurence of an event  +- 150 ms around a r_onset for an ecg_reading (row_dx).  

For the record, I also note that there are many non-timeseries features that could be static variables in this dataset such as q_peaks and p_peaks.  

I need to add chagas as a variable.

The problem is, when we load records, we just load ecg data, not the chagas label. Then when we do wave delineation we drop a few records too, so it is very hard to tell which row index corresponds to which record to get the record chagas label.

In [17]:
from chagas_delineation_loader import load_dbn_data

dbn_dataset_path = "/Users/evanzimm/GitHub/python-example-2025/dbn_dataset"
ecg_array, samp_freq, channel_seq, label_list, gender_list, age_list = load_dbn_data(dbn_dataset_path)

Loaded 3190 records. Dropped 78 that were too short.


In [18]:
len(label_list)

3190

Now when I do wave delineation, some of these labels get dropped but it is unclear which ones. I need to figure that out. How can I tell from the main.py wave delination file what gets dropped? I could just run the delineation again and save the terminal output. I found a way to save dropped indxs by modifying the original code, and I am saving dropped indxs to a csv. Remember to delete TEMP_FEATURES.csv

In [19]:
dropped_df = pd.read_csv('neurokit_delineation/output/dropped.csv')

In [20]:
dropped_df

Unnamed: 0,dropped
0,192
1,220
2,628
3,685
4,834
5,1117
6,1289
7,1487
8,1519
9,1770


In [21]:
indices_to_drop = set(dropped_df['dropped'])
indices_to_drop

{192,
 220,
 628,
 685,
 834,
 1117,
 1289,
 1487,
 1519,
 1770,
 1860,
 1944,
 2061,
 2136,
 2175,
 2236,
 2263,
 2446,
 2448,
 2451,
 2456,
 2605,
 2816,
 2848,
 2869,
 2873,
 2962}

In [22]:
# Filter list: include element only if its index is not in indices_to_drop.
label_list_dropped = [val for idx, val in enumerate(label_list) if idx not in indices_to_drop]
gender_list_dropped = [val for idx, val in enumerate(gender_list) if idx not in indices_to_drop]
age_list_dropped = [val for idx, val in enumerate(age_list) if idx not in indices_to_drop]

In [23]:
len(label_list_dropped)

3163

In [24]:
label_list_dropped = np.array(label_list_dropped)
gender_list_dropped = np.array(gender_list_dropped)
age_list_dropped = np.array(age_list_dropped)

Now my labels match the wave delineation data (indices match). Row indices match the original indices of interval df.

In [25]:
label_list_dropped[beatwise_df['row_idx']]

array([0, 0, 0, ..., 1, 1, 1])

In [26]:
len(beatwise_df)

26135

In [27]:
len(label_list_dropped[beatwise_df['row_idx']])

26135

In [40]:
beatwise_df['chagas'] = label_list_dropped[beatwise_df['row_idx']]
beatwise_df['gender'] = gender_list_dropped[beatwise_df['row_idx']]
beatwise_df['age'] = age_list_dropped[beatwise_df['row_idx']]

In [41]:
beatwise_df.head(20)

Unnamed: 0,p_onset_present,p_offset_present,r_onset_present,r_offset_present,t_onset_present,t_offset_present,row_idx,lead,chagas,gender,age
0,0,1,1,1,0,0,0,II,0,0,72
1,1,1,1,1,1,0,0,II,0,0,72
2,1,1,1,1,0,0,0,II,0,0,72
3,1,1,1,1,0,0,0,II,0,0,72
4,1,1,1,0,0,0,0,II,0,0,72
5,1,1,1,1,0,0,0,II,0,0,72
6,1,1,1,0,0,0,0,II,0,0,72
7,1,1,1,0,0,0,0,II,0,0,72
8,0,0,1,0,0,0,0,II,0,0,72
9,1,1,1,0,0,0,1,II,0,0,20


In [42]:
beatwise_df[beatwise_df['row_idx'] == 7]

Unnamed: 0,p_onset_present,p_offset_present,r_onset_present,r_offset_present,t_onset_present,t_offset_present,row_idx,lead,chagas,gender,age
52,0,0,1,0,0,0,7,II,1,0,64
53,1,1,1,1,1,0,7,II,1,0,64
54,1,1,1,1,1,0,7,II,1,0,64
55,1,1,1,1,1,0,7,II,1,0,64
56,1,1,1,1,1,0,7,II,1,0,64
57,1,1,1,1,1,0,7,II,1,0,64
58,0,0,1,0,0,0,7,II,1,0,64


In [43]:
print(label_list_dropped[7])
print(gender_list_dropped[7])
print(age_list_dropped[7])

1
0
64


This is good but age should be discrete.

In [53]:
# Discretize into 5 equally wide bins
beatwise_df['age_bins'] = pd.cut(beatwise_df['age'], bins=4, labels=False, right=False)

In [55]:
beatwise_df[beatwise_df['row_idx'] == 7]

Unnamed: 0,p_onset_present,p_offset_present,r_onset_present,r_offset_present,t_onset_present,t_offset_present,row_idx,lead,chagas,gender,age,age_bins
52,0,0,1,0,0,0,7,II,1,0,64,2
53,1,1,1,1,1,0,7,II,1,0,64,2
54,1,1,1,1,1,0,7,II,1,0,64,2
55,1,1,1,1,1,0,7,II,1,0,64,2
56,1,1,1,1,1,0,7,II,1,0,64,2
57,1,1,1,1,1,0,7,II,1,0,64,2
58,0,0,1,0,0,0,7,II,1,0,64,2


## Build the DBN Network

Only need to define two timesteps.

In [56]:
from pgmpy.models import DynamicBayesianNetwork as DBN

dbn = DBN()

# Static variables influence latent state at each timestep separately (no temporal edges for static vars)
static_vars = ['age', 'gender']

# Edges from static variables to HiddenState at time 0
for var in static_vars:
    dbn.add_edge((var, 0), ('chagas', 0))

# Edges from static variables to HiddenState at time 1
for var in static_vars:
    dbn.add_edge((var, 1), ('chagas', 1))

# Latent state temporal edges (usual)
dbn.add_edge(('chagas', 0), ('chagas', 1))

ecg_vars = ['p_onset', 'p_offset', 'r_offset', 't_onset', 't_offset']

for wave in ecg_vars:
    dbn.add_edges_from([
        (('chagas', 0), (wave, 0)),
        (('chagas', 1), (wave, 1))
    ])

  from .autonotebook import tqdm as notebook_tqdm


In [57]:
dbn.nodes()

NodeView((<DynamicNode(age, 0) at 0x1ff91cd8fe0>, <DynamicNode(chagas, 0) at 0x1ff7aa45700>, <DynamicNode(age, 1) at 0x1ff91d94710>, <DynamicNode(chagas, 1) at 0x1ff91bdc170>, <DynamicNode(gender, 0) at 0x1ff91bb8c80>, <DynamicNode(gender, 1) at 0x1ff91b8d310>, <DynamicNode(p_onset, 0) at 0x1ff91e0c230>, <DynamicNode(p_onset, 1) at 0x1ff91e0c290>, <DynamicNode(p_offset, 0) at 0x1ff91e0c320>, <DynamicNode(p_offset, 1) at 0x1ff91e0c3b0>, <DynamicNode(r_offset, 0) at 0x1ff91e0c350>, <DynamicNode(r_offset, 1) at 0x1ff91e0c3e0>, <DynamicNode(t_onset, 0) at 0x1ff91e0c500>, <DynamicNode(t_onset, 1) at 0x1ff91e0c560>, <DynamicNode(t_offset, 0) at 0x1ff91e0c5c0>, <DynamicNode(t_offset, 1) at 0x1ff91e0c620>))

In [75]:
beatwise_df.rename(columns={'p_onset_present': 'p_onset', 'p_offset_present': 'p_offset', 'r_offset_present': 'r_offset', 
    't_onset_present': 't_onset', 't_offset_present': 't_offset'}, inplace=True)

In [77]:
beatwise_df[beatwise_df['row_idx'] == 7]

Unnamed: 0,p_onset,p_offset,r_onset_present,r_offset,t_onset,t_offset,row_idx,lead,chagas,gender,age,age_bins
52,0,0,1,0,0,0,7,II,1,0,64,2
53,1,1,1,1,1,0,7,II,1,0,64,2
54,1,1,1,1,1,0,7,II,1,0,64,2
55,1,1,1,1,1,0,7,II,1,0,64,2
56,1,1,1,1,1,0,7,II,1,0,64,2
57,1,1,1,1,1,0,7,II,1,0,64,2
58,0,0,1,0,0,0,7,II,1,0,64,2


In [81]:
max_steps = beatwise_df.groupby('row_idx').agg("count")

In [84]:
max_steps

Unnamed: 0_level_0,p_onset,p_offset,r_onset_present,r_offset,t_onset,t_offset,lead,chagas,gender,age,age_bins
row_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,9,9,9,9,9,9,9,9,9,9,9
1,6,6,6,6,6,6,6,6,6,6,6
2,9,9,9,9,9,9,9,9,9,9,9
3,7,7,7,7,7,7,7,7,7,7,7
4,5,5,5,5,5,5,5,5,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...
3158,9,9,9,9,9,9,9,9,9,9,9
3159,9,9,9,9,9,9,9,9,9,9,9
3160,6,6,6,6,6,6,6,6,6,6,6
3161,7,7,7,7,7,7,7,7,7,7,7


In [85]:
max(max_steps['p_onset'])

21

In [86]:
max_t = max(max_steps['p_onset'])

We have at most 21 timesteps for some ECG reading!!!

In [87]:
np.argmax(max_steps['p_onset'])

np.int64(280)

In [88]:
beatwise_df[beatwise_df['row_idx'] == 280]

Unnamed: 0,p_onset,p_offset,r_onset_present,r_offset,t_onset,t_offset,row_idx,lead,chagas,gender,age,age_bins
2303,0,0,1,0,0,0,280,II,0,1,60,2
2304,0,0,1,0,0,0,280,II,0,1,60,2
2305,0,0,1,0,0,0,280,II,0,1,60,2
2306,0,0,1,0,0,0,280,II,0,1,60,2
2307,0,0,1,0,0,0,280,II,0,1,60,2
2308,0,0,1,0,0,0,280,II,0,1,60,2
2309,0,0,1,0,0,0,280,II,0,1,60,2
2310,0,0,1,0,0,0,280,II,0,1,60,2
2311,0,0,1,0,0,0,280,II,0,1,60,2
2312,0,0,1,0,0,0,280,II,0,1,60,2


Create the Training Dataset

In [99]:
beatwise_df = beatwise_df.drop(columns=["age"]).rename(columns={"age_bins": "age"})

In [100]:
beatwise_df[beatwise_df['row_idx'] == 280]

Unnamed: 0,p_onset,p_offset,r_onset_present,r_offset,t_onset,t_offset,row_idx,lead,chagas,gender,age
2303,0,0,1,0,0,0,280,II,0,1,2
2304,0,0,1,0,0,0,280,II,0,1,2
2305,0,0,1,0,0,0,280,II,0,1,2
2306,0,0,1,0,0,0,280,II,0,1,2
2307,0,0,1,0,0,0,280,II,0,1,2
2308,0,0,1,0,0,0,280,II,0,1,2
2309,0,0,1,0,0,0,280,II,0,1,2
2310,0,0,1,0,0,0,280,II,0,1,2
2311,0,0,1,0,0,0,280,II,0,1,2
2312,0,0,1,0,0,0,280,II,0,1,2


In [102]:
static_vars = ['age', 'gender']
dynamic_vars = ['p_onset', 'p_offset', 'r_offset', 't_onset', 't_offset', 'chagas']

def createEmptyTrainingDF(static_vars, dynamic_vars, max_timesteps):
    columns = []
    for t in range(max_timesteps): # t = 0, 1, ..., 21
        for var in dynamic_vars + static_vars:
            columns.append((var, t))

    df_all_obs = pd.DataFrame(columns=pd.MultiIndex.from_tuples(columns))

    return df_all_obs

all_obs_df = createEmptyTrainingDF(static_vars, dynamic_vars, max_t)
all_obs_df.columns

MultiIndex([( 'p_onset',  0),
            ('p_offset',  0),
            ('r_offset',  0),
            ( 't_onset',  0),
            ('t_offset',  0),
            (  'chagas',  0),
            (     'age',  0),
            (  'gender',  0),
            ( 'p_onset',  1),
            ('p_offset',  1),
            ...
            (     'age', 19),
            (  'gender', 19),
            ( 'p_onset', 20),
            ('p_offset', 20),
            ('r_offset', 20),
            ( 't_onset', 20),
            ('t_offset', 20),
            (  'chagas', 20),
            (     'age', 20),
            (  'gender', 20)],
           length=168)

Now I have to come up with some function that loops through the entire dbn_dataset folder, processes each record into its own df, and flattens that record into one row and appends it to all_obs_df.

In [109]:
def populateTrainingDF(input_df, output_df, static_vars, dynamic_vars, max_timesteps):
    latent_var = 'chagas'
    all_row_idxs = input_df['row_idx'].unique()

    # this loops through each ECG Reading
    for row in all_row_idxs:
        obs_df = input_df[input_df['row_idx'] == row].reset_index(drop=True) # get the beatwise data specific to a reading
        obs_length = len(obs_df)
        obs_data = {}

        for t in range(max_timesteps):
            if t < obs_length:
                # Explicitly assign observed ECG wave delineation data
                for var in dynamic_vars:
                    obs_data[(var, t)] = obs_df.loc[t, var]
            else:
                # Explicitly pad ECG signals with NaNs
                for var in dynamic_vars:
                    obs_data[(var, t)] = np.nan

            # Explicitly repeat latent variable and static vars at ALL timesteps
            obs_data[(latent_var, t)] = obs_df.loc[0, latent_var]
            for var in static_vars:
                obs_data[(var, t)] = obs_df.loc[0, var]
            
        # Append observation explicitly as a single row
        df_obs = pd.DataFrame([obs_data])
        output_df = pd.concat([output_df, df_obs], ignore_index=True)
    
    return output_df

test_flat_df = populateTrainingDF(beatwise_df, all_obs_df, static_vars, dynamic_vars, max_t)
test_flat_df.head()

Unnamed: 0_level_0,p_onset,p_offset,r_offset,t_onset,t_offset,chagas,age,gender,p_onset,p_offset,...,age,gender,p_onset,p_offset,r_offset,t_onset,t_offset,chagas,age,gender
Unnamed: 0_level_1,0,0,0,0,0,0,0,0,1,1,...,19,19,20,20,20,20,20,20,20,20
0,0,1,1,0,0,0,2,0,1,1,...,2,0,,,,,,0,2,0
1,1,1,0,0,0,0,0,0,1,1,...,0,0,,,,,,0,0,0
2,1,1,1,0,0,0,0,1,1,1,...,0,1,,,,,,0,0,1
3,1,1,1,0,0,0,2,1,1,1,...,2,1,,,,,,0,2,1
4,0,0,0,0,0,0,1,1,0,0,...,1,1,,,,,,0,1,1


In [None]:
test_flat_df.shape # we should have 3163 rows because that is what wave delineation produced

(3163, 168)

In [119]:
print(test_flat_df.iloc[280])

p_onset   0     0
p_offset  0     0
r_offset  0     0
t_onset   0     0
t_offset  0     0
               ..
t_onset   20    0
t_offset  20    0
chagas    20    0
age       20    2
gender    20    1
Name: 280, Length: 168, dtype: object


Now my data is prepared, I should probably do some sort of train/test split? 80/20? Keeping class balance for good measure.

In [120]:
from sklearn.model_selection import train_test_split

class_labels = test_flat_df[('chagas', 0)]

# Perform explicit 80/20 stratified train-test split
df_train, df_test = train_test_split(
    test_flat_df,
    test_size=0.2,
    stratify=class_labels,
    random_state=42  # reproducibility
)

# Verify explicit class balance:
print("Training class distribution:")
print(df_train[('chagas', 0)].value_counts(normalize=True))

print("\nTesting class distribution:")
print(df_test[('chagas', 0)].value_counts(normalize=True))

Training class distribution:
(chagas, 0)
0    0.507115
1    0.492885
Name: proportion, dtype: float64

Testing class distribution:
(chagas, 0)
0    0.507109
1    0.492891
Name: proportion, dtype: float64


In [121]:
print(df_train.dtypes)

p_onset   0     object
p_offset  0     object
r_offset  0     object
t_onset   0     object
t_offset  0     object
                 ...  
t_onset   20    object
t_offset  20    object
chagas    20    object
age       20    object
gender    20    object
Length: 168, dtype: object


You can convert all the columns to a numeric type (specifically to the pandas nullable integer type "Int64") after discretization. This way, you perform the discretization first and then, in a separate step, convert every column to integers. 

This loop will iterate over every column in your DataFrame and attempt to convert it to an integer type. Using errors='coerce' helps ensure that any non-numeric values become NaN (which is acceptable for the nullable "Int64" type) rather than causing an error.

In [122]:
for col in df_train.columns:
    # Use pd.to_numeric to force conversion; errors='coerce' will turn non-numeric entries into NaN,
    # and then .astype('Int64') will convert the column to the pandas nullable integer type.
    df_train[col] = pd.to_numeric(df_train[col], errors='coerce').astype("Int64")

# Check the dtypes
print(df_train.dtypes)

p_onset   0     Int64
p_offset  0     Int64
r_offset  0     Int64
t_onset   0     Int64
t_offset  0     Int64
                ...  
t_onset   20    Int64
t_offset  20    Int64
chagas    20    Int64
age       20    Int64
gender    20    Int64
Length: 168, dtype: object


In [123]:
df_train.shape

(2530, 168)

Now we should be able to fit to the dbn

In [124]:
dbn.fit(df_train)

In [125]:
print(dbn.get_cpds())

[<TabularCPD representing P((t_offset, 0):2 | (chagas, 0):2) at 0x1ff95c81f10>, <TabularCPD representing P((r_offset, 0):2 | (chagas, 0):2) at 0x1ff95c81b20>, <TabularCPD representing P((chagas, 0):2 | (age, 0):4, (gender, 0):2) at 0x1ff95c0f080>, <TabularCPD representing P((age, 0):4) at 0x1ff92b272c0>, <TabularCPD representing P((p_onset, 0):2 | (chagas, 0):2) at 0x1ff95c79040>, <TabularCPD representing P((p_offset, 0):2 | (chagas, 0):2) at 0x1ff93666480>, <TabularCPD representing P((gender, 0):2) at 0x1ff95c826f0>, <TabularCPD representing P((t_onset, 0):2 | (chagas, 0):2) at 0x1ff95c81bb0>, <TabularCPD representing P((t_offset, 1):2 | (chagas, 1):2) at 0x1ff95c82630>, <TabularCPD representing P((r_offset, 1):2 | (chagas, 1):2) at 0x1ff95c82510>, <TabularCPD representing P((chagas, 1):2 | (age, 1):4, (chagas, 0):2, (gender, 1):2) at 0x1ff95c79790>, <TabularCPD representing P((age, 1):4) at 0x1ff95c81f70>, <TabularCPD representing P((p_onset, 1):2 | (chagas, 1):2) at 0x1ff95c80b30>, 

In [126]:
df_test.dtypes

p_onset   0     object
p_offset  0     object
r_offset  0     object
t_onset   0     object
t_offset  0     object
                 ...  
t_onset   20    object
t_offset  20    object
chagas    20    object
age       20    object
gender    20    object
Length: 168, dtype: object

In [127]:
for col in df_test.columns:
    # Use pd.to_numeric to force conversion; errors='coerce' will turn non-numeric entries into NaN,
    # and then .astype('Int64') will convert the column to the pandas nullable integer type.
    df_test[col] = pd.to_numeric(df_test[col], errors='coerce').astype("Int64")

# Check the dtypes
print(df_train.dtypes)

p_onset   0     Int64
p_offset  0     Int64
r_offset  0     Int64
t_onset   0     Int64
t_offset  0     Int64
                ...  
t_onset   20    Int64
t_offset  20    Int64
chagas    20    Int64
age       20    Int64
gender    20    Int64
Length: 168, dtype: object


In [128]:
df_test.shape

(633, 168)

In [129]:
import logging
from pgmpy.inference import DBNInference

# Set up logging to a file
logging.basicConfig(
    filename='inference.log',
    filemode='w',  # Overwrite file on each run; use 'a' to append
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO  # or DEBUG for more details
)
logger = logging.getLogger(__name__)

dbn_infer = DBNInference(dbn)  # your trained DBN

predictions = []
true_labels = []

for idx, row in df_test.iterrows():
    logger.info(f"Processing row index: {idx}")
    
    # Build evidence dict: all observed columns except the latent variable
    evidence = {}
    valid_timesteps = []

    # Identify valid timesteps based on non-NaN ECG data
    for col in df_test.columns:
        var_name, t_step = col  # deconstruct column tuple
        if var_name not in ['chagas', 'gender', 'age'] and pd.notna(row[col]):
            valid_timesteps.append(t_step)
    
    if not valid_timesteps:
        logger.info(f"Row {idx}: No valid timesteps found, skipping row.")
        continue  # or handle the case with no evidence
    
    max_valid_timestep = max(valid_timesteps)
    logger.info(f"Row {idx}: Max valid timestep = {max_valid_timestep}")

    # Build the evidence dict for valid timesteps
    for col in df_test.columns:
        var_name, t_step = col
        if t_step <= max_valid_timestep and var_name != 'chagas':
            evidence[(var_name, t_step)] = row[col]
    
    logger.info(f"Row {idx}: Raw evidence: {evidence}")

    # Clean up the evidence by converting all values to plain ints
    evidence_clean = {}
    for key, value in evidence.items():
        if pd.notna(value):
            try:
                evidence_clean[key] = int(value)
            except Exception as e:
                logger.error(f"Error converting evidence for {key}: {value}")
                raise e
    logger.info(f"Row {idx}: Cleaned evidence: {evidence_clean}")

    # Instead of looping over every t_step, just query the last valid timestep:
    try:
        logger.info(f"Row {idx}: Querying for ('chagas', {max_valid_timestep})")
        query_result = dbn_infer.query(
            variables=[('chagas', max_valid_timestep)],
            evidence=evidence_clean,
        )
        logger.info(f"Row {idx}, timestep {max_valid_timestep}: Prediction: {query_result[('chagas', max_valid_timestep)]}")
    except Exception as e:
        logger.error(f"Error during query at row {idx}, timestep {max_valid_timestep}")
        raise e

    predictions.append(query_result[('chagas', max_valid_timestep)])
    
    # Store the true label; assuming the label is the same at all timesteps, use time 0
    true_label = row[('chagas', 0)]
    true_labels.append(true_label)
    logger.info(f"Row {idx}: True label: {true_label}")

logger.info("Inference complete.")

INFO:__main__:Processing row index: 1176
INFO:__main__:Row 1176: Max valid timestep = 6
INFO:__main__:Row 1176: Raw evidence: {('p_onset', 0): 1, ('p_offset', 0): 1, ('r_offset', 0): 1, ('t_onset', 0): 0, ('t_offset', 0): 0, ('age', 0): 1, ('gender', 0): 1, ('p_onset', 1): 1, ('p_offset', 1): 1, ('r_offset', 1): 1, ('t_onset', 1): 0, ('t_offset', 1): 0, ('age', 1): 1, ('gender', 1): 1, ('p_onset', 2): 1, ('p_offset', 2): 1, ('r_offset', 2): 1, ('t_onset', 2): 0, ('t_offset', 2): 0, ('age', 2): 1, ('gender', 2): 1, ('p_onset', 3): 1, ('p_offset', 3): 1, ('r_offset', 3): 1, ('t_onset', 3): 0, ('t_offset', 3): 0, ('age', 3): 1, ('gender', 3): 1, ('p_onset', 4): 1, ('p_offset', 4): 1, ('r_offset', 4): 1, ('t_onset', 4): 0, ('t_offset', 4): 0, ('age', 4): 1, ('gender', 4): 1, ('p_onset', 5): 1, ('p_offset', 5): 1, ('r_offset', 5): 1, ('t_onset', 5): 0, ('t_offset', 5): 0, ('age', 5): 1, ('gender', 5): 1, ('p_onset', 6): 0, ('p_offset', 6): 0, ('r_offset', 6): 0, ('t_onset', 6): 0, ('t_offse

In [130]:
print(predictions[0].values)

[0.64906131 0.35093869]


In [132]:
print(predictions[0])

+------------------+----------------------+
| ('chagas', 6)    |   phi(('chagas', 6)) |
| ('chagas', 6)(0) |               0.6491 |
+------------------+----------------------+
| ('chagas', 6)(1) |               0.3509 |
+------------------+----------------------+


In [133]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, roc_auc_score

y_true = df_test[('chagas', 0)]

y_pred = [p.values.argmax() for p in predictions]

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
print(f"Precision: {prec}, Recall: {rec}, F1 Score: {f1}")

Accuracy: 0.641390205371248
Confusion Matrix:
 [[185 136]
 [ 91 221]]
Precision: 0.6190476190476191, Recall: 0.7083333333333334, F1 Score: 0.6606875934230194


Better but not that good...

q_peaks, s_peaks, t_peaks, and r_peaks are static feature I left out, maybe they can help? but they are continous and thus would need to be discretized and thus we would lose some information. but it is worth a try.

In [135]:
interval_df.head().columns

Index(['p_peaks', 'p_onsets', 'p_offsets', 'q_peaks', 'r_onsets', 'r_offsets',
       's_peaks', 't_peaks', 't_onsets', 't_offsets', 'sample_idx', 'lead',
       'heart_rate', 'r_peaks', 'pr_interval', 'qrs_complex', 'qt_interval',
       'rr_interval', 'st_segment'],
      dtype='object')

In [136]:
interval_df.head()

Unnamed: 0,p_peaks,p_onsets,p_offsets,q_peaks,r_onsets,r_offsets,s_peaks,t_peaks,t_onsets,t_offsets,sample_idx,lead,heart_rate,r_peaks,pr_interval,qrs_complex,qt_interval,rr_interval,st_segment
0,0.101,"[nan, 838.0, 1498.0, 2168.0, 2822.0, 3488.0, 4...","[188, 886, 1574, 2250, 2874, 3588, 4268, 4932,...",-0.135,"[266.0, 912.0, 1586.0, 2266.0, 2886.0, 3596.0,...","[352, 1014, 1726, 2368, 3044, 3712, 4390, 5028...",-0.223,0.174,"[474, 1046, 1808, 2482, 3142, 3806, 4470, 5116...","[566.0, 1078.0, 1914.0, 2566.0, 3234.0, 3904.0...",0,II,72.412,"[153, 490, 824, 1154, 1485, 1819, 2154, 2485, ...",83.0,134.25,314.25,663.0,71.778
1,0.124,"[556, 1480, 2534, 3538, 4568, 5554]","[652, 1574, 2626, 3648, 4664, 5640]",-0.152,"[610.0, 1540.0, 2588.0, 3604.0, 4622.0, nan]","[778.0, nan, 2746.0, 3768.0, 4780.0, nan]",-0.159,0.419,"[922.0, 1830.0, 2868.0, 3886.0, 4922.0, nan]","[1030.0, 1946.0, 3004.0, 4018.0, 5036.0, nan]",1,II,48.178,"[357, 818, 1345, 1852, 2361, 2852]",95.667,162.0,414.0,998.0,131.5
2,0.037,"[232, 932, 1644, 2278, 2942, 3592, 4180, 4742,...","[304, 996, 1708, 2346, 3004, 3656, 4242, 4808,...",-0.221,"[336.0, 1034.0, 1746.0, 2382.0, 3038.0, 3690.0...","[420, 1118, 1832, 2468, 3124, 3776, 4362, 4928...",-0.273,0.578,"[524, 1220, 1930, 2566, 3228, 3878, 4460, 5024...","[616, 1314, 2028, 2664, 3322, 3972, 4556, 5118...",2,II,75.278,"[190, 539, 894, 1213, 1542, 1867, 2160, 2442, ...",65.556,87.75,283.0,640.75,100.222
3,0.177,"[550, 1300, 2088, 2914, 3702, 4494, 5322]","[634, 1386, 2180, 2996, 3804, 4586, 5408]",-0.241,"[658.0, 1418.0, 2198.0, 3028.0, 3826.0, 4618.0...","[740, 1514, 2308, 3134, 3910, 4728, 5530]",-0.201,0.174,"[880, 1622, 2458, 3284, 4064, 4850, 5692]","[992, 1744, 2552, 3348, 4166, 4904, 5782]",3,II,60.362,"[352, 730, 1128, 1536, 1935, 2330, 2739]",89.143,98.0,326.667,795.667,140.857
4,0.173,"[nan, 632.0, nan, nan, 4782.0]","[nan, 676.0, 1944.0, nan, nan]",,"[nan, nan, nan, nan, nan]","[nan, nan, nan, 4730.0, nan]",-5.897,0.237,"[nan, nan, nan, 4714.0, nan]","[nan, nan, nan, nan, nan]",4,II,57.845,"[210, 514, 986, 2344, 2581]",44.0,,,1185.5,-16.0


p_peaks,Amplitude of the P-wave,Float64,mV - could be a static variable.  
q_peaks,Amplitude of the Q-wave,Float64,mV - could be a static variable.  
s_peaks,Amplitude of the S-wave,Float64,mV - could be a static variable.  
t_peaks,Amplitude of the T-wave,Float64,mV - could be a static variable.  
heart_rate,Number of contractions of the heart per minute,Float64,bpm - could be static.  
r_peaks,Amplitude of the R-wave,object,mV THIS IS INTERESTING BECUASE IT IS THE ONLY PEAKS THAT IS A LIST OF VALUES NOT AN INDIVIDUAL VALUE
MAYBE THERE IS A WAY TO GET IT TO LINE UP WITH R_OFFSETS?.  

pr_interval,Time between onset of P-wave to onset of R-wave,Float64,msec - could be static.  
qt_interval,Time between onset of Q-wave to offset of T-wave,Float64,msec - I could do t_offsets +- qt_interval and get onset of q-wave. CALCULATED.  
qrs_complex,Time between onset of Q-wave to offset of S-wave,Float64,msec - from calculating above I can calculate offset of S. CALCULATED FROM ST_SEGMENT.   
The above two could turn into time-series with the right calculations?. YES.  
rr_interval,Time between successive R-waves,Float64,msec - could be static.  
st_segment,Time between offset of S-wave to onset of T-wave,Float64,msec - could be used to calculate offset of S since we know onset of T. CALCULATED.    


In [139]:
interval_df['t_onsets'] 

0       [474, 1046, 1808, 2482, 3142, 3806, 4470, 5116...
1            [922.0, 1830.0, 2868.0, 3886.0, 4922.0, nan]
2       [524, 1220, 1930, 2566, 3228, 3878, 4460, 5024...
3               [880, 1622, 2458, 3284, 4064, 4850, 5692]
4                            [nan, nan, nan, 4714.0, nan]
                              ...                        
3158    [nan, 1106.0, nan, 2586.0, 3334.0, 3994.0, 463...
3159    [454, 1102, 1742, 2392, 3050, 3722, 4410, 5046...
3160                  [814, 1652, 2528, 3514, 4402, 5318]
3161            [730, 1568, 2406, 3216, 4040, 4860, 5694]
3162    [nan, 1316.0, nan, 2286.0, nan, 3364.0, 3968.0...
Name: t_onsets, Length: 3163, dtype: object

In [None]:
s_offsets = interval_df['t_onsets'] - interval_df['st_segment']

0       [402.222, 974.222, 1736.222, 2410.222, 3070.22...
1            [790.5, 1698.5, 2736.5, 3754.5, 4790.5, nan]
2       [423.778, 1119.778, 1829.778, 2465.778, 3127.7...
3       [739.143, 1481.143, 2317.143, 3143.143, 3923.1...
4                            [nan, nan, nan, 4730.0, nan]
                              ...                        
3158    [nan, 966.333, nan, 2446.333, 3194.333, 3854.3...
3159    [442.5, 1090.5, 1730.5, 2380.5, 3038.5, 3710.5...
3160      [712.0, 1550.0, 2426.0, 3412.0, 4300.0, 5216.0]
3161    [649.429, 1487.429, 2325.429, 3135.429, 3959.4...
3162    [nan, 1281.714, nan, 2251.714, nan, 3329.714, ...
Length: 3163, dtype: object

In [141]:
s_offsets = interval_df['t_onsets'] - interval_df['st_segment']

In [144]:
q_onsets = interval_df['t_offsets'] - interval_df['qt_interval']

In [145]:
wave_delineation_df = interval_df.copy()

In [147]:
wave_delineation_df['s_offsets'] = s_offsets
wave_delineation_df['q_onsets'] = q_onsets

In [148]:
wave_delineation_df.head()

Unnamed: 0,p_peaks,p_onsets,p_offsets,q_peaks,r_onsets,r_offsets,s_peaks,t_peaks,t_onsets,t_offsets,...,lead,heart_rate,r_peaks,pr_interval,qrs_complex,qt_interval,rr_interval,st_segment,s_offsets,q_onsets
0,0.101,"[nan, 838.0, 1498.0, 2168.0, 2822.0, 3488.0, 4...","[188, 886, 1574, 2250, 2874, 3588, 4268, 4932,...",-0.135,"[266.0, 912.0, 1586.0, 2266.0, 2886.0, 3596.0,...","[352, 1014, 1726, 2368, 3044, 3712, 4390, 5028...",-0.223,0.174,"[474, 1046, 1808, 2482, 3142, 3806, 4470, 5116...","[566.0, 1078.0, 1914.0, 2566.0, 3234.0, 3904.0...",...,II,72.412,"[153, 490, 824, 1154, 1485, 1819, 2154, 2485, ...",83.0,134.25,314.25,663.0,71.778,"[402.222, 974.222, 1736.222, 2410.222, 3070.22...","[251.75, 763.75, 1599.75, 2251.75, 2919.75, 35..."
1,0.124,"[556, 1480, 2534, 3538, 4568, 5554]","[652, 1574, 2626, 3648, 4664, 5640]",-0.152,"[610.0, 1540.0, 2588.0, 3604.0, 4622.0, nan]","[778.0, nan, 2746.0, 3768.0, 4780.0, nan]",-0.159,0.419,"[922.0, 1830.0, 2868.0, 3886.0, 4922.0, nan]","[1030.0, 1946.0, 3004.0, 4018.0, 5036.0, nan]",...,II,48.178,"[357, 818, 1345, 1852, 2361, 2852]",95.667,162.0,414.0,998.0,131.5,"[790.5, 1698.5, 2736.5, 3754.5, 4790.5, nan]","[616.0, 1532.0, 2590.0, 3604.0, 4622.0, nan]"
2,0.037,"[232, 932, 1644, 2278, 2942, 3592, 4180, 4742,...","[304, 996, 1708, 2346, 3004, 3656, 4242, 4808,...",-0.221,"[336.0, 1034.0, 1746.0, 2382.0, 3038.0, 3690.0...","[420, 1118, 1832, 2468, 3124, 3776, 4362, 4928...",-0.273,0.578,"[524, 1220, 1930, 2566, 3228, 3878, 4460, 5024...","[616, 1314, 2028, 2664, 3322, 3972, 4556, 5118...",...,II,75.278,"[190, 539, 894, 1213, 1542, 1867, 2160, 2442, ...",65.556,87.75,283.0,640.75,100.222,"[423.778, 1119.778, 1829.778, 2465.778, 3127.7...","[333.0, 1031.0, 1745.0, 2381.0, 3039.0, 3689.0..."
3,0.177,"[550, 1300, 2088, 2914, 3702, 4494, 5322]","[634, 1386, 2180, 2996, 3804, 4586, 5408]",-0.241,"[658.0, 1418.0, 2198.0, 3028.0, 3826.0, 4618.0...","[740, 1514, 2308, 3134, 3910, 4728, 5530]",-0.201,0.174,"[880, 1622, 2458, 3284, 4064, 4850, 5692]","[992, 1744, 2552, 3348, 4166, 4904, 5782]",...,II,60.362,"[352, 730, 1128, 1536, 1935, 2330, 2739]",89.143,98.0,326.667,795.667,140.857,"[739.143, 1481.143, 2317.143, 3143.143, 3923.1...","[665.3330000000001, 1417.333, 2225.333, 3021.3..."
4,0.173,"[nan, 632.0, nan, nan, 4782.0]","[nan, 676.0, 1944.0, nan, nan]",,"[nan, nan, nan, nan, nan]","[nan, nan, nan, 4730.0, nan]",-5.897,0.237,"[nan, nan, nan, 4714.0, nan]","[nan, nan, nan, nan, nan]",...,II,57.845,"[210, 514, 986, 2344, 2581]",44.0,,,1185.5,-16.0,"[nan, nan, nan, 4730.0, nan]","[nan, nan, nan, nan, nan]"


Try the whole thing again with two new features? Would it be better to write a script to generalize? probably. I could also just copy and paste the cells above and re-run it all.

In [150]:
beatwise_df[beatwise_df['row_idx'] == 191]

Unnamed: 0,p_onset,p_offset,r_onset_present,r_offset,t_onset,t_offset,row_idx,lead,chagas,gender,age
1572,1,1,1,1,0,0,191,II,0,1,0
1573,1,1,1,1,0,0,191,II,0,1,0
1574,1,1,1,0,0,0,191,II,0,1,0
1575,1,1,1,1,0,0,191,II,0,1,0
1576,1,1,1,1,0,0,191,II,0,1,0
1577,1,1,1,1,0,0,191,II,0,1,0
1578,1,1,1,1,0,0,191,II,0,1,0
1579,1,1,1,0,0,0,191,II,0,1,0
1580,1,1,1,0,0,0,191,II,0,1,0
1581,1,1,1,0,0,1,191,II,0,1,0


In [151]:
beatwise_df[beatwise_df['row_idx'] == 192]

Unnamed: 0,p_onset,p_offset,r_onset_present,r_offset,t_onset,t_offset,row_idx,lead,chagas,gender,age
1583,1,1,1,1,0,0,192,II,0,1,3
1584,1,1,1,1,0,0,192,II,0,1,3
1585,1,1,1,1,0,1,192,II,0,1,3
1586,1,1,1,1,0,0,192,II,0,1,3
1587,1,1,1,1,0,0,192,II,0,1,3
1588,1,1,1,0,0,0,192,II,0,1,3
1589,1,1,1,1,0,0,192,II,0,1,3
1590,1,1,1,1,0,0,192,II,0,1,3
1591,0,0,1,0,0,0,192,II,0,1,3


So when delineation is done, some records get skipped, so in the output for example if records 192 is skipped, the record at row_idx 192 is actually record 193. 

When we drop labels it should be the same.

In [154]:
wave_delineation_df[wave_delineation_df['sample_idx'] == 191]

Unnamed: 0,p_peaks,p_onsets,p_offsets,q_peaks,r_onsets,r_offsets,s_peaks,t_peaks,t_onsets,t_offsets,...,lead,heart_rate,r_peaks,pr_interval,qrs_complex,qt_interval,rr_interval,st_segment,s_offsets,q_onsets
191,0.288,"[340, 836, 1310, 1782, 2282, 2802, 3316, 3810,...","[428, 924, 1396, 1870, 2366, 2892, 3404, 3896,...",-0.181,"[398.0, 892.0, 1366.0, 1842.0, 2338.0, 2862.0,...","[536, 1014, 1528, 1958, 2456, 3000, 3490, 4040...",-0.278,0.019,"[600, 1086, 1574, 2054, 2548, 3046, 3548, 4074...","[704, 1192, 1680, 2148, 2640, 3176, 3682, 4184...",...,II,98.02,"[237, 485, 723, 959, 1208, 1469, 1724, 1973, 2...",87.455,142.8,309.8,490.2,39.273,"[560.727, 1046.727, 1534.727, 2014.727, 2508.7...","[394.2, 882.2, 1370.2, 1838.2, 2330.2, 2866.2,..."


In [155]:
wave_delineation_df[wave_delineation_df['sample_idx'] == 192]

Unnamed: 0,p_peaks,p_onsets,p_offsets,q_peaks,r_onsets,r_offsets,s_peaks,t_peaks,t_onsets,t_offsets,...,lead,heart_rate,r_peaks,pr_interval,qrs_complex,qt_interval,rr_interval,st_segment,s_offsets,q_onsets


In [156]:
wave_delineation_df[wave_delineation_df['sample_idx'] == 193]

Unnamed: 0,p_peaks,p_onsets,p_offsets,q_peaks,r_onsets,r_offsets,s_peaks,t_peaks,t_onsets,t_offsets,...,lead,heart_rate,r_peaks,pr_interval,qrs_complex,qt_interval,rr_interval,st_segment,s_offsets,q_onsets
192,0.089,"[364, 1180, 1600, 2266, 2992, 3644, 4226, 4904...","[484, 1222, 1638, 2310, 3038, 3700, 4332, 5022...",-0.111,"[416.0, 1236.0, 1674.0, 2330.0, 3058.0, 3668.0...","[558, 1370, 1802, 2460, 3182, 3830, 4420, 5100...",-0.309,0.189,"[676, 1468, 1920, 2566, 3296, 3954, 4518, 5212...","[770, 1590, 2022, 2690, 3388, 4040, 4630, 5310...",...,II,78.139,"[259, 664, 880, 1209, 1570, 1894, 2189, 2529, ...",75.111,135.0,349.75,633.0,114.444,"[561.556, 1353.556, 1805.556, 2451.556, 3181.5...","[420.25, 1240.25, 1672.25, 2340.25, 3038.25, 3..."


In [None]:
len(wave_delineation_df[wave_delineation_df['p_peaks'] > 0.25]) # could be meaningful

194

In [162]:
# simplest: length along the first axis
lens1 = wave_delineation_df['r_peaks'].map(len)
lens2 = wave_delineation_df['r_onsets'].map(len)

# count how many rows match
count = (lens1 == lens2).sum()
print(f"{count} rows where r_peaks and r_onsets have the same length.")

3163 rows where r_peaks and r_onsets have the same length.


In [None]:
# r_peaks is 100% a dynamic var
(wave_delineation_df['r_peaks'].map(max) > 2800).sum() #r_peaks is def in sample indices

np.int64(1166)

In [169]:
# convert to milliseconds
wave_delineation_df['r_peaks'] = wave_delineation_df['r_peaks'] / 400 * 1000

In [170]:
wave_delineation_df.head()

Unnamed: 0,p_peaks,p_onsets,p_offsets,q_peaks,r_onsets,r_offsets,s_peaks,t_peaks,t_onsets,t_offsets,...,lead,heart_rate,r_peaks,pr_interval,qrs_complex,qt_interval,rr_interval,st_segment,s_offsets,q_onsets
0,0.101,"[nan, 838.0, 1498.0, 2168.0, 2822.0, 3488.0, 4...","[188, 886, 1574, 2250, 2874, 3588, 4268, 4932,...",-0.135,"[266.0, 912.0, 1586.0, 2266.0, 2886.0, 3596.0,...","[352, 1014, 1726, 2368, 3044, 3712, 4390, 5028...",-0.223,0.174,"[474, 1046, 1808, 2482, 3142, 3806, 4470, 5116...","[566.0, 1078.0, 1914.0, 2566.0, 3234.0, 3904.0...",...,II,72.412,"[382.5, 1225.0, 2060.0, 2885.0, 3712.5, 4547.5...",83.0,134.25,314.25,663.0,71.778,"[402.222, 974.222, 1736.222, 2410.222, 3070.22...","[251.75, 763.75, 1599.75, 2251.75, 2919.75, 35..."
1,0.124,"[556, 1480, 2534, 3538, 4568, 5554]","[652, 1574, 2626, 3648, 4664, 5640]",-0.152,"[610.0, 1540.0, 2588.0, 3604.0, 4622.0, nan]","[778.0, nan, 2746.0, 3768.0, 4780.0, nan]",-0.159,0.419,"[922.0, 1830.0, 2868.0, 3886.0, 4922.0, nan]","[1030.0, 1946.0, 3004.0, 4018.0, 5036.0, nan]",...,II,48.178,"[892.5, 2045.0, 3362.5, 4630.0, 5902.5, 7130.0]",95.667,162.0,414.0,998.0,131.5,"[790.5, 1698.5, 2736.5, 3754.5, 4790.5, nan]","[616.0, 1532.0, 2590.0, 3604.0, 4622.0, nan]"
2,0.037,"[232, 932, 1644, 2278, 2942, 3592, 4180, 4742,...","[304, 996, 1708, 2346, 3004, 3656, 4242, 4808,...",-0.221,"[336.0, 1034.0, 1746.0, 2382.0, 3038.0, 3690.0...","[420, 1118, 1832, 2468, 3124, 3776, 4362, 4928...",-0.273,0.578,"[524, 1220, 1930, 2566, 3228, 3878, 4460, 5024...","[616, 1314, 2028, 2664, 3322, 3972, 4556, 5118...",...,II,75.278,"[475.0, 1347.5, 2235.0, 3032.5, 3855.0, 4667.5...",65.556,87.75,283.0,640.75,100.222,"[423.778, 1119.778, 1829.778, 2465.778, 3127.7...","[333.0, 1031.0, 1745.0, 2381.0, 3039.0, 3689.0..."
3,0.177,"[550, 1300, 2088, 2914, 3702, 4494, 5322]","[634, 1386, 2180, 2996, 3804, 4586, 5408]",-0.241,"[658.0, 1418.0, 2198.0, 3028.0, 3826.0, 4618.0...","[740, 1514, 2308, 3134, 3910, 4728, 5530]",-0.201,0.174,"[880, 1622, 2458, 3284, 4064, 4850, 5692]","[992, 1744, 2552, 3348, 4166, 4904, 5782]",...,II,60.362,"[880.0, 1825.0, 2820.0, 3840.0, 4837.5, 5825.0...",89.143,98.0,326.667,795.667,140.857,"[739.143, 1481.143, 2317.143, 3143.143, 3923.1...","[665.3330000000001, 1417.333, 2225.333, 3021.3..."
4,0.173,"[nan, 632.0, nan, nan, 4782.0]","[nan, 676.0, 1944.0, nan, nan]",,"[nan, nan, nan, nan, nan]","[nan, nan, nan, 4730.0, nan]",-5.897,0.237,"[nan, nan, nan, 4714.0, nan]","[nan, nan, nan, nan, nan]",...,II,57.845,"[525.0, 1285.0, 2465.0, 5860.0, 6452.5]",44.0,,,1185.5,-16.0,"[nan, nan, nan, 4730.0, nan]","[nan, nan, nan, nan, nan]"


In [171]:
wave_delineation_df['r_onsets'][0]

array([ 266.,  912., 1586., 2266., 2886., 3596., 4194., 4854.,   nan])

In [172]:
wave_delineation_df['r_offsets'][0]

array([ 352, 1014, 1726, 2368, 3044, 3712, 4390, 5028, 5830])

In [173]:
wave_delineation_df['r_peaks'][0]

array([ 382.5, 1225. , 2060. , 2885. , 3712.5, 4547.5, 5385. , 6212.5,
       7012.5])