# Stage 2.0 — EDA preparation

This notebook builds the canonical EDA slices (`df_all`, `df_strict`, `df_sleep`) from the sanitized daily data and the quality labels, adds derived features to each slice, and prints a compact readiness summary.

In [34]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from garmin_analytics.eda import prepare as eda_prepare

# Short local names for the eda_prepare helpers used by the cells below.
add_derived_features = eda_prepare.add_derived_features
build_eda_frames = eda_prepare.build_eda_frames
eda_readiness_summary = eda_prepare.eda_readiness_summary
load_daily_sanitized = eda_prepare.load_daily_sanitized
load_quality = eda_prepare.load_quality

# Alias -> original column name for the TOTAL stress breakdown. This copy is
# only used when eda_prepare does not expose its own STRESS_TOTAL_ALIAS_MAP.
_FALLBACK_STRESS_TOTAL_ALIAS_MAP = dict(
    stress_total_avg_level="allDayStress_TOTAL_averageStressLevel",
    stress_total_total_duration_s="allDayStress_TOTAL_totalDuration",
    stress_total_stress_duration_s="allDayStress_TOTAL_stressDuration",
    stress_total_rest_s="allDayStress_TOTAL_restDuration",
    stress_total_low_s="allDayStress_TOTAL_lowDuration",
    stress_total_med_s="allDayStress_TOTAL_mediumDuration",
    stress_total_high_s="allDayStress_TOTAL_highDuration",
    stress_total_activity_s="allDayStress_TOTAL_activityDuration",
    stress_total_uncat_s="allDayStress_TOTAL_uncategorizedDuration",
)
STRESS_TOTAL_ALIAS_MAP = getattr(
    eda_prepare, "STRESS_TOTAL_ALIAS_MAP", _FALLBACK_STRESS_TOTAL_ALIAS_MAP
)

# Show up to 120 columns when rendering wide frames.
pd.set_option('display.max_columns', 120)

In [35]:
# Resolve the repo root to a normalized absolute path so derived and printed
# paths carry no '..' segment.
# NOTE(review): assumes the kernel's working directory sits one level below the
# repo root (e.g. notebooks/) — confirm if the notebook is launched elsewhere.
repo_root = (Path.cwd() / '..').resolve()
print('Repo root:', repo_root)
daily_sanitized_path = repo_root / 'data/processed/daily_sanitized.parquet'
daily_fallback_path = repo_root / 'data/processed/daily.parquet'
quality_path = repo_root / 'data/processed/daily_quality.parquet'

# Prefer the sanitized parquet. Falling back is allowed but announced, and the
# error names both candidates so a missing file is easy to diagnose.
if daily_sanitized_path.exists():
    daily_path = daily_sanitized_path
elif daily_fallback_path.exists():
    daily_path = daily_fallback_path
    print('Warning: sanitized daily parquet not found; falling back to', daily_path)
else:
    raise FileNotFoundError(
        f'Daily parquet not found (tried {daily_sanitized_path} and {daily_fallback_path})'
    )
if not quality_path.exists():
    raise FileNotFoundError(f'Quality parquet not found: {quality_path}')

daily_df = load_daily_sanitized(daily_path)
quality_df = load_quality(quality_path)

# Both inputs must have at most one row per calendarDate.
assert daily_df['calendarDate'].is_unique, 'daily has duplicate calendarDate rows'
assert quality_df['calendarDate'].is_unique, 'quality has duplicate calendarDate rows'

print('Daily input:', daily_path)
print('Quality input:', quality_path)
print('daily rows:', len(daily_df), 'quality rows:', len(quality_df))

Repo root: /Users/abatrakov/Documents/FUN/wearable-analytics/notebooks/..
Daily input: /Users/abatrakov/Documents/FUN/wearable-analytics/notebooks/../data/processed/daily_sanitized.parquet
Quality input: /Users/abatrakov/Documents/FUN/wearable-analytics/notebooks/../data/processed/daily_quality.parquet
daily rows: 580 quality rows: 580


In [36]:
# Build the EDA slices from the daily + quality inputs with the score thresholds below.
frames = build_eda_frames(
    daily_df,
    quality_df,
    strict_min_score=4,
    loose_min_score=3,
)

# Add derived features to each slice used by later cells, in a fixed order.
df_all, df_strict, df_sleep = (
    add_derived_features(frames[slice_key])
    for slice_key in ('df_all', 'df_strict', 'df_sleep')
)

# Rebind `frames` so it holds only the three enriched slices.
frames = dict(df_all=df_all, df_strict=df_strict, df_sleep=df_sleep)

for name, frame in frames.items():
    print(f'{name}: {len(frame)} rows')

df_all: 580 rows
df_strict: 525 rows
df_sleep: 474 rows


In [None]:
def _prefer_alias(alias: str, original: str, columns: pd.Index) -> str:
    if alias in columns:
        return alias
    return original

# (alias, original) pairs for the TOTAL stress breakdown shown in the preview;
# each pair resolves to the alias when df_all carries it, else to the original.
stress_total_pairs = [
    ("stress_total_avg_level", "allDayStress_TOTAL_averageStressLevel"),
    ("stress_total_rest_s", "allDayStress_TOTAL_restDuration"),
    ("stress_total_low_s", "allDayStress_TOTAL_lowDuration"),
    ("stress_total_med_s", "allDayStress_TOTAL_mediumDuration"),
    ("stress_total_high_s", "allDayStress_TOTAL_highDuration"),
    ("stress_total_activity_s", "allDayStress_TOTAL_activityDuration"),
    ("stress_total_uncat_s", "allDayStress_TOTAL_uncategorizedDuration"),
]

key_cols = [
    # identity / quality
    "calendarDate",
    "quality_score",
    "valid_day_strict",
    "has_sleep",
    "corrupted_stress_only_day",
    # movement
    "totalSteps",
    "totalDistanceMeters",
    "activeSeconds",
    "highlyActiveSeconds",
    # stress (summary + breakdown TOTAL)
    "stressTotalDurationSeconds",
    *(
        _prefer_alias(alias_name, original_name, df_all.columns)
        for alias_name, original_name in stress_total_pairs
    ),
    # heart rate
    "restingHeartRate",
    "minHeartRate",
    "maxHeartRate",
    "minAvgHeartRate",
    "maxAvgHeartRate",
    # body battery
    "bodyBatteryStartOfDay",
    "bodyBatteryEndOfDay",
    "bodyBatteryLowest",
    "bodyBatteryHighest",
    # sleep
    "sleepStartTimestampGMT",
    "sleepEndTimestampGMT",
    "deepSleepSeconds",
    "lightSleepSeconds",
    "remSleepSeconds",
    "awakeSleepSeconds",
    "sleepOverallScore",
    "avgSleepStress",
]

# Split the expected key columns into those df_all has and those it lacks,
# keeping the order of key_cols in both lists.
present_key_cols, missing_key_cols = [], []
for key_col in key_cols:
    bucket = present_key_cols if key_col in df_all.columns else missing_key_cols
    bucket.append(key_col)
if missing_key_cols:
    print("Warning: missing expected key columns:", missing_key_cols)

derived_cols = [
    "stress_hours",
    "steps_k",
    "distance_km",
    "bb_delta",
    "sleep_total_hours",
    "sleep_efficiency",
    "deep_pct",
    "light_pct",
    "rem_pct",
    "awake_pct",
]
present_derived_cols = [name for name in derived_cols if name in df_all.columns]

# Ordered, de-duplicated union of key and derived columns.
preview_cols = list(dict.fromkeys([*present_key_cols, *present_derived_cols]))

# Transposed preview: fields run top-to-bottom, the first ten rows run left-to-right.
preview_source = df_all.head(10)[preview_cols].copy()
preview_source.index = [f"row_{n}" for n in range(1, len(preview_source) + 1)]
preview_vertical = preview_source.transpose()
preview_vertical

Unnamed: 0,calendarDate,quality_score,valid_day_strict,has_sleep,corrupted_stress_only_day,totalSteps,totalDistanceMeters,activeSeconds,highlyActiveSeconds,stressTotalDurationSeconds,allDayStress_TOTAL_averageStressLevel,allDayStress_TOTAL_restDuration,allDayStress_TOTAL_lowDuration,allDayStress_TOTAL_mediumDuration,allDayStress_TOTAL_highDuration,allDayStress_TOTAL_activityDuration,allDayStress_TOTAL_uncategorizedDuration,restingHeartRate,minHeartRate,maxHeartRate,minAvgHeartRate,maxAvgHeartRate,bodyBatteryStartOfDay,bodyBatteryEndOfDay,bodyBatteryLowest,bodyBatteryHighest,sleepStartTimestampGMT,sleepEndTimestampGMT,deepSleepSeconds,lightSleepSeconds,remSleepSeconds,awakeSleepSeconds,sleepOverallScore,avgSleepStress,stress_hours,steps_k,distance_km,bb_delta,sleep_total_hours,sleep_efficiency,deep_pct,light_pct,rem_pct,awake_pct
0,2023-05-26,4,True,False,False,1096,863,1285,797,22080,50,3420,3660,4740,3120,5580,1560,63,55,126,55,126,66,36.0,36,70,,,,,,,,,6.133333,1.096,0.863,-30.0,,,,,,
1,2023-05-27,5,True,True,False,20915,17337,6453,5638,84840,25,41400,9660,9480,3660,17100,3540,54,41,160,41,160,36,22.0,22,100,1685148480.0,1685176920.0,5220.0,16260.0,6720.0,240.0,98.0,5.48,23.566667,20.915,17.337,-14.0,7.9,0.991561,0.183544,0.57173,0.236287,0.008439
2,2023-05-28,5,True,True,False,5935,5044,1321,2700,85440,38,31800,10740,7020,14400,10140,11340,50,40,175,40,175,22,19.0,8,93,1685236980.0,1685264159.0,6120.0,14700.0,4620.0,1200.0,82.0,8.59,23.733333,5.935,5.044,-3.0,7.4,0.954955,0.22973,0.551802,0.173423,0.045045
3,2023-05-29,5,True,True,False,17593,14366,5498,3993,85740,23,44880,13800,7740,1200,14940,3180,48,41,119,42,119,19,35.0,18,100,1685320500.0,1685352240.0,6420.0,19860.0,4740.0,720.0,90.0,8.02,23.816667,17.593,14.366,16.0,8.816667,0.977316,0.202268,0.625709,0.149338,0.022684
4,2023-05-30,5,True,True,False,9212,7196,3372,1751,85980,26,44220,14640,7860,4860,12120,2280,47,39,120,39,120,35,24.0,24,100,1685408220.0,1685433900.0,4320.0,14460.0,6180.0,720.0,85.0,6.59,23.883333,9.212,7.196,-11.0,7.133333,0.971963,0.168224,0.563084,0.240654,0.028037
5,2023-05-31,5,True,True,False,13653,11586,2300,5238,86100,26,44820,14580,8940,3480,11640,2640,46,40,177,41,175,24,30.0,22,100,1685488500.0,1685530018.0,7200.0,21000.0,9300.0,3960.0,69.0,13.16,23.916667,13.653,11.586,6.0,11.516667,0.904486,0.173661,0.506512,0.224313,0.095514
6,2023-06-01,4,True,True,False,15278,12642,4627,3986,85680,31,33300,12720,11940,5820,13800,8100,45,39,141,41,140,30,,20,100,1685577180.0,1685606220.0,4500.0,17700.0,5160.0,1680.0,87.0,7.44,23.8,15.278,12.642,,8.066667,0.942149,0.154959,0.609504,0.177686,0.057851
7,2023-06-02,5,True,True,False,13372,10742,4405,3049,85620,32,35700,14340,8580,7020,16200,3780,42,39,117,39,117,20,12.0,12,80,1685664660.0,1685683560.0,5460.0,9960.0,3240.0,240.0,67.0,8.79,23.783333,13.372,10.742,-8.0,5.25,0.987302,0.288889,0.526984,0.171429,0.012698
8,2023-06-03,5,True,True,False,13948,10968,7854,268,85380,31,37500,13680,10140,5520,15180,3360,42,40,149,41,142,12,17.0,6,82,1685768160.0,1685788920.0,5100.0,11520.0,3660.0,480.0,73.0,8.94,23.716667,13.948,10.968,5.0,5.766667,0.976879,0.245665,0.554913,0.176301,0.023121
9,2023-06-04,5,True,True,False,7073,6599,2405,2061,85620,34,36900,2220,8700,11940,17820,8040,41,38,183,39,182,17,27.0,5,100,1685845260.0,1685880300.0,6900.0,18120.0,8460.0,1560.0,93.0,7.58,23.783333,7.073,6.599,10.0,9.733333,0.955479,0.196918,0.517123,0.241438,0.044521


In [38]:
def _resolve_used_column(alias, original, columns):
    """Return whichever of ``alias`` / ``original`` is in ``columns`` (alias first), else None."""
    if alias in columns:
        return alias
    if original in columns:
        return original
    return None


# One row per alias -> original pair, plus the name actually present in df_all.
# Records are built directly instead of a row-wise DataFrame.apply, and the
# explicit column list keeps the table well-formed even for an empty alias map.
alias_mapping_table = pd.DataFrame(
    [
        {
            "alias": alias,
            "original": original,
            "used_column": _resolve_used_column(alias, original, df_all.columns),
        }
        for alias, original in STRESS_TOTAL_ALIAS_MAP.items()
    ],
    columns=["alias", "original", "used_column"],
)
alias_mapping_table

Unnamed: 0,alias,original,used_column
0,stress_total_avg_level,allDayStress_TOTAL_averageStressLevel,allDayStress_TOTAL_averageStressLevel
1,stress_total_total_duration_s,allDayStress_TOTAL_totalDuration,allDayStress_TOTAL_totalDuration
2,stress_total_stress_duration_s,allDayStress_TOTAL_stressDuration,allDayStress_TOTAL_stressDuration
3,stress_total_rest_s,allDayStress_TOTAL_restDuration,allDayStress_TOTAL_restDuration
4,stress_total_low_s,allDayStress_TOTAL_lowDuration,allDayStress_TOTAL_lowDuration
5,stress_total_med_s,allDayStress_TOTAL_mediumDuration,allDayStress_TOTAL_mediumDuration
6,stress_total_high_s,allDayStress_TOTAL_highDuration,allDayStress_TOTAL_highDuration
7,stress_total_activity_s,allDayStress_TOTAL_activityDuration,allDayStress_TOTAL_activityDuration
8,stress_total_uncat_s,allDayStress_TOTAL_uncategorizedDuration,allDayStress_TOTAL_uncategorizedDuration


In [39]:
# Compute the readiness summary from the derived-feature slices held in `frames`;
# the bare `summary` on the last line renders the result as the cell output.
summary = eda_readiness_summary(frames)
summary

Unnamed: 0,rows_all,rows_strict,rows_sleep,strict_good_pct,sleep_present_pct,corrupted_pct
0,580,525,474,90.517241,81.724138,3.62069


In [40]:
# Row count and first/last calendarDate for every slice in `frames`.
date_range_records = []
for slice_label, slice_df in frames.items():
    slice_dates = slice_df['calendarDate']
    date_range_records.append(
        {
            'slice': slice_label,
            'rows': len(slice_df),
            'date_min': slice_dates.min(),
            'date_max': slice_dates.max(),
        }
    )

date_ranges = pd.DataFrame(date_range_records)
date_ranges

Unnamed: 0,slice,rows,date_min,date_max
0,df_all,580,2023-05-26,2026-02-05
1,df_strict,525,2023-05-26,2026-02-05
2,df_sleep,474,2023-05-27,2026-02-05


In [41]:
def _missingness_table(frame: pd.DataFrame, cols: list[str], top_n: int = 20) -> pd.DataFrame:
    rows = []
    for c in cols:
        s = frame[c]
        rows.append({
            "column": c,
            "missing_count": int(s.isna().sum()),
            "missing_pct": round(float(s.isna().mean() * 100.0), 2),
            "n_unique": int(s.nunique(dropna=True)),
        })
    out = pd.DataFrame(rows).sort_values("missing_pct", ascending=False).head(top_n)
    return out.reset_index(drop=True)

# Show the missingness table (up to 20 rows) for every slice, restricted to the
# key columns that were found in df_all.
# NOTE(review): a slice lacking one of those columns would raise KeyError inside
# _missingness_table — presumably all slices share df_all's columns; confirm.
for slice_name, frame in frames.items():
    print(f"\nMissingness over core columns — {slice_name}")
    display(_missingness_table(frame, present_key_cols, top_n=20))


Missingness over core columns — df_all


Unnamed: 0,column,missing_count,missing_pct,n_unique
0,remSleepSeconds,113,19.48,179
1,sleepOverallScore,107,18.45,63
2,avgSleepStress,106,18.28,415
3,awakeSleepSeconds,106,18.28,83
4,lightSleepSeconds,106,18.28,242
5,deepSleepSeconds,106,18.28,109
6,sleepEndTimestampGMT,106,18.28,474
7,sleepStartTimestampGMT,106,18.28,474
8,bodyBatteryEndOfDay,83,14.31,48
9,allDayStress_TOTAL_restDuration,46,7.93,349



Missingness over core columns — df_strict


Unnamed: 0,column,missing_count,missing_pct,n_unique
0,remSleepSeconds,58,11.05,179
1,sleepOverallScore,52,9.9,63
2,avgSleepStress,51,9.71,415
3,awakeSleepSeconds,51,9.71,83
4,lightSleepSeconds,51,9.71,242
5,deepSleepSeconds,51,9.71,109
6,sleepEndTimestampGMT,51,9.71,474
7,sleepStartTimestampGMT,51,9.71,474
8,bodyBatteryEndOfDay,36,6.86,46
9,allDayStress_TOTAL_restDuration,7,1.33,338



Missingness over core columns — df_sleep


Unnamed: 0,column,missing_count,missing_pct,n_unique
0,bodyBatteryEndOfDay,36,7.59,44
1,remSleepSeconds,7,1.48,179
2,restingHeartRate,1,0.21,21
3,sleepOverallScore,1,0.21,63
4,allDayStress_TOTAL_restDuration,1,0.21,305
5,bodyBatteryLowest,0,0.0,30
6,minAvgHeartRate,0,0.0,21
7,maxAvgHeartRate,0,0.0,86
8,bodyBatteryStartOfDay,0,0.0,47
9,sleepStartTimestampGMT,0,0.0,474


## Optional local preview export (OFF by default)

Set `EXPORT_PREVIEW = True` to write the first 200 rows of the preview columns to `data/interim/eda_prepare_preview.csv` (the `data/interim` directory is gitignored).

In [42]:
# Opt-in switch: set to True to write the preview CSV locally.
EXPORT_PREVIEW = False

if not EXPORT_PREVIEW:
    print('Preview export disabled (default).')
else:
    out_path = repo_root / 'data/interim/eda_prepare_preview.csv'
    export_dir = out_path.parent
    # Create the target directory on first use.
    export_dir.mkdir(parents=True, exist_ok=True)
    preview_export = df_all[preview_cols].head(200)
    preview_export.to_csv(out_path, index=False)
    print('Wrote', out_path)

Preview export disabled (default).
