In [1]:
# Imports
import pandas as pd
from src.utils.os_helper import get_project_root
from src.data.models import ColumnNames

# Display settings
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

In [2]:
root = get_project_root()
CACHE_DIR = root / "cache" / "data" / "awesome_cgm" / "brown_2019"
data_tables = (
    CACHE_DIR / "raw" / "DCLP3 Public Dataset - Release 3 - 2022-08-04" / "Data Files"
)

In [3]:
cgm_df = pd.read_csv(data_tables / "cgm.txt", sep="|")

In [4]:
cgm_df = cgm_df.rename(columns={"Unnamed: 3": "CGM"})

In [5]:
cgm_df.head()

Unnamed: 0,PtID,Period,DataDtTm,CGM
0,1,1. Baseline,11DEC17:23:59:25,172
1,1,1. Baseline,12DEC17:00:04:24,170
2,1,1. Baseline,12DEC17:00:09:24,167
3,1,1. Baseline,12DEC17:00:14:25,163
4,1,1. Baseline,12DEC17:00:19:25,160


In [6]:
cgm_df.columns

Index(['PtID', 'Period', 'DataDtTm', 'CGM'], dtype='object')

In [7]:
cgm_df.groupby("PtID")["Period"].value_counts().unstack(fill_value=0)

Period,1. Baseline,2. Post Randomization
PtID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4009,53007
2,3667,49870
3,3982,53617
4,3656,45149
5,3614,53537
...,...,...
167,3686,51587
168,3655,52411
169,3664,53891
170,2496,43801


In [8]:
cgm_df[ColumnNames.DATETIME.value] = pd.to_datetime(
    cgm_df["DataDtTm"], format="%d%b%y:%H:%M:%S"
)

# Set datetime as index
cgm_df = cgm_df.set_index(ColumnNames.DATETIME.value)

cgm_df = cgm_df.drop(columns=["DataDtTm"])

In [9]:
cgm_df.head()

Unnamed: 0_level_0,PtID,Period,CGM
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-12-11 23:59:25,1,1. Baseline,172
2017-12-12 00:04:24,1,1. Baseline,170
2017-12-12 00:09:24,1,1. Baseline,167
2017-12-12 00:14:25,1,1. Baseline,163
2017-12-12 00:19:25,1,1. Baseline,160


In [10]:
# Rename to match ColumnNames enum
cgm_df = cgm_df.rename(
    columns={
        "PtID": ColumnNames.P_NUM.value,
        "CGM": "bg_mgdL",  # Keep original, add mmol/L version
        "Period": "period",
    }
)

# Add mmol/L version
cgm_df[ColumnNames.BG.value] = cgm_df["bg_mgdL"] / 18.0

In [11]:
cgm_df.head()

Unnamed: 0_level_0,p_num,period,bg_mgdL,bg_mM
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-12-11 23:59:25,1,1. Baseline,172,9.555556
2017-12-12 00:04:24,1,1. Baseline,170,9.444444
2017-12-12 00:09:24,1,1. Baseline,167,9.277778
2017-12-12 00:14:25,1,1. Baseline,163,9.055556
2017-12-12 00:19:25,1,1. Baseline,160,8.888889


In [17]:
from src.data.preprocessing.data_splitting import split_multipatient_dataframe
from src.data.preprocessing.sampling import (
    ensure_regular_time_intervals_with_aggregation,
)

# Split into per-patient dict first
patient_dict = split_multipatient_dataframe(cgm_df, patient_col="p_num")

# Then process each
for pid, pdf in patient_dict.items():
    processed_df, freq = ensure_regular_time_intervals_with_aggregation(pdf)
    patient_dict[pid] = processed_df

2025-11-27T18:54:06 - ensure_regular_time_intervals_with_aggregation(): Ensuring regular time intervals with aggregation...
2025-11-27T18:54:06 - 	Most common time interval: 5 minutes
2025-11-27T18:54:06 - 	Aggregation strategy: {'p_num': 'first', 'period': 'first', 'bg_mgdL': 'sum', 'bg_mM': 'mean'}
2025-11-27T18:54:06 - Post-ensure_regular_time_intervals_with_aggregation(): 
			Patient 1 
			 - old index length: 57016, 
			 - new index length: 58465
2025-11-27T18:54:06 - ensure_regular_time_intervals_with_aggregation(): Ensuring regular time intervals with aggregation...
2025-11-27T18:54:06 - 	Most common time interval: 5 minutes
2025-11-27T18:54:06 - 	Aggregation strategy: {'p_num': 'first', 'period': 'first', 'bg_mgdL': 'sum', 'bg_mM': 'mean'}
2025-11-27T18:54:06 - Post-ensure_regular_time_intervals_with_aggregation(): 
			Patient 10 
			 - old index length: 55890, 
			 - new index length: 57313
2025-11-27T18:54:06 - ensure_regular_time_intervals_with_aggregation(): Ensuring regula

In [18]:
# Test on single patient first
test_patient = list(patient_dict.keys())[0]
test_df = patient_dict[test_patient]

print(f"Before: {len(test_df)} rows")
processed_test, freq = ensure_regular_time_intervals_with_aggregation(test_df)
print(f"After: {len(processed_test)} rows")
print(f"Frequency: {freq} min")

2025-11-27T18:56:06 - ensure_regular_time_intervals_with_aggregation(): Ensuring regular time intervals with aggregation...
2025-11-27T18:56:06 - 	Most common time interval: 5 minutes
2025-11-27T18:56:06 - 	Aggregation strategy: {'p_num': 'first', 'period': 'first', 'bg_mgdL': 'sum', 'bg_mM': 'mean'}
2025-11-27T18:56:06 - Post-ensure_regular_time_intervals_with_aggregation(): 
			Patient 1.0 
			 - old index length: 58465, 
			 - new index length: 58465


Before: 58465 rows
After: 58465 rows
Frequency: 5 min


In [25]:
test_df.head()

Unnamed: 0_level_0,p_num,period,bg_mgdL,bg_mM
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-12-12 00:00:00,1.0,1. Baseline,172.0,9.555556
2017-12-12 00:05:00,1.0,1. Baseline,170.0,9.444444
2017-12-12 00:10:00,1.0,1. Baseline,167.0,9.277778
2017-12-12 00:15:00,1.0,1. Baseline,163.0,9.055556
2017-12-12 00:20:00,1.0,1. Baseline,160.0,8.888889


In [20]:
# Pick a patient to analyze
test_pid = 89

# Get original and processed
original_df = split_multipatient_dataframe(cgm_df, patient_col="p_num")[test_pid]
processed_df = patient_dict[test_pid]

print(f"Original: {len(original_df)} rows")
print(f"Processed: {len(processed_df)} rows")
print(f"Added rows: {len(processed_df) - len(original_df)} (these are NaN-filled gaps)")

# Find the NEW timestamps (gaps that were filled)
original_timestamps = set(original_df.index)
processed_timestamps = set(processed_df.index)

new_timestamps = processed_timestamps - original_timestamps
print(f"\nNew timestamps added: {len(new_timestamps)}")

Original: 47386 rows
Processed: 57889 rows
Added rows: 10503 (these are NaN-filled gaps)

New timestamps added: 57550


In [None]:
for ts in sorted(list(new_timestamps))[:5]:
    print(ts)
    print(processed_df.loc[ts])

2018-04-16 00:00:00
p_num             89.0
period     1. Baseline
bg_mgdL          112.0
bg_mM         6.222222
Name: 2018-04-16 00:00:00, dtype: object
2018-04-16 00:05:00
p_num             89.0
period     1. Baseline
bg_mgdL          110.0
bg_mM         6.111111
Name: 2018-04-16 00:05:00, dtype: object
2018-04-16 00:10:00
p_num             89.0
period     1. Baseline
bg_mgdL          107.0
bg_mM         5.944444
Name: 2018-04-16 00:10:00, dtype: object
2018-04-16 00:15:00
p_num             89.0
period     1. Baseline
bg_mgdL          102.0
bg_mM         5.666667
Name: 2018-04-16 00:15:00, dtype: object
2018-04-16 00:20:00
p_num             89.0
period     1. Baseline
bg_mgdL           97.0
bg_mM         5.388889
Name: 2018-04-16 00:20:00, dtype: object


In [21]:
# Pick a patient
test_pid = 89
original_df = split_multipatient_dataframe(cgm_df, patient_col="p_num")[test_pid]
processed_df = patient_dict[test_pid]

# Round original timestamps to same 5-min grid (mimicking what the function does)
freq = 5  # minutes
original_rounded = original_df.index.round(f"{freq}min")

# Now compare properly
original_bins = set(original_rounded)
processed_bins = set(processed_df.index)

# TRUE new timestamps (gaps that were filled)
truly_new = processed_bins - original_bins

print(f"Original timestamps: {len(original_df)}")
print(f"Original unique bins (after rounding): {len(original_bins)}")
print(f"Processed timestamps: {len(processed_df)}")
print(f"Truly NEW timestamps (gap fills): {len(truly_new)}")

Original timestamps: 47386
Original unique bins (after rounding): 47386
Processed timestamps: 57889
Truly NEW timestamps (gap fills): 10503


In [22]:
# Create a df with the rounded original timestamps
original_with_bins = original_df.copy()
original_with_bins["rounded_ts"] = original_with_bins.index.round(f"{freq}min")
original_with_bins = original_with_bins.reset_index().rename(
    columns={"datetime": "original_ts"}
)

# Reset processed for merge
processed_reset = processed_df.reset_index().rename(columns={"datetime": "grid_ts"})

# Left join: all processed timestamps, match with original where exists
merged = processed_reset.merge(
    original_with_bins[["rounded_ts", "original_ts", "bg_mM"]].rename(
        columns={"bg_mM": "original_bg"}
    ),
    left_on="grid_ts",
    right_on="rounded_ts",
    how="left",
    indicator=True,  # Adds column showing match status
)

# Show the breakdown
print("Merge result:")
print(merged["_merge"].value_counts())

# The "left_only" rows are the GAP FILLS
gap_fills = merged[merged["_merge"] == "left_only"]
print(f"\n Gap fills (new rows): {len(gap_fills)}")
print(" These should have NaN bg_mM:")
print(gap_fills[["grid_ts", "bg_mM", "original_bg", "_merge"]].head(20))

# Verify ALL gap fills have NaN
print(f"\nâœ“ All gap fills have NaN bg_mM: {gap_fills['bg_mM'].isna().all()}")

Merge result:
_merge
both          47386
left_only     10503
right_only        0
Name: count, dtype: int64

 Gap fills (new rows): 10503
 These should have NaN bg_mM:
               grid_ts  bg_mM  original_bg     _merge
35 2018-04-16 02:55:00    NaN          NaN  left_only
36 2018-04-16 03:00:00    NaN          NaN  left_only
37 2018-04-16 03:05:00    NaN          NaN  left_only
38 2018-04-16 03:10:00    NaN          NaN  left_only
39 2018-04-16 03:15:00    NaN          NaN  left_only
40 2018-04-16 03:20:00    NaN          NaN  left_only
41 2018-04-16 03:25:00    NaN          NaN  left_only
42 2018-04-16 03:30:00    NaN          NaN  left_only
43 2018-04-16 03:35:00    NaN          NaN  left_only
44 2018-04-16 03:40:00    NaN          NaN  left_only
45 2018-04-16 03:45:00    NaN          NaN  left_only
46 2018-04-16 03:50:00    NaN          NaN  left_only
47 2018-04-16 03:55:00    NaN          NaN  left_only
48 2018-04-16 04:00:00    NaN          NaN  left_only
49 2018-04-16 04:05:00 