In [46]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
# Make the project's src/ directory importable, then import the package functions used below.
from pathlib import Path
import sys

# The project root is taken to be the parent of the kernel's working directory
# (assumes the kernel was started in the notebook's own folder — TODO confirm).
PROJECT_ROOT = Path.cwd().parent
SRC_DIR = str(PROJECT_ROOT.joinpath("src"))

# Register src/ only once so re-running the cell does not pile up duplicate entries.
already_registered = SRC_DIR in sys.path
if not already_registered:
    sys.path.append(SRC_DIR)

# Project-local imports; these only resolve after src/ is on sys.path.
from healthcare_signals.features_patient import build_patient_signals
from healthcare_signals.io_prime import save_patient_signals

print("Imports OK ✅")


Imports OK ✅


In [50]:
# Tell the imported package where the project root is before building anything.
# NOTE(review): `set_project_root` is not imported in any cell visible here, so this
# line only works with leftover kernel state — confirm its module and import it explicitly.
set_project_root(PROJECT_ROOT)

# Snapshot date for this one-off run.
AS_OF_DATE = "2012-12-31"

# Build the snapshot first, then order rows so the most active come first.
# (Per the original author's note, the build relies on the root set above to locate raw data.)
signals_unsorted = build_patient_signals(AS_OF_DATE)
signals = signals_unsorted.sort_values("n_active_days", ascending=False)
out_path = save_patient_signals(signals, AS_OF_DATE)

print(f"Snapshot date: {AS_OF_DATE}")
print(f"Signals shape: {signals.shape}")
print(f"Saved to:      {out_path}")

# Preview the ten rows with the most active days.
signals.head(10)

Snapshot date: 2012-12-31
Signals shape: (481, 8)
Saved to:      C:\Users\binia\OneDrive\Bini\Education - Professional Development\Springboard\git\healthcare-stream\healthcare-signals-phase3\data\processed\patient_signals_asof=2012-12-31.parquet


Unnamed: 0,provider_id,n_active_days,total_claims,mean_daily_claims,first_activity_dt,last_activity_dt,days_since_last,mean_zscore_allowed
51,1033456526,4,4,1.0,2008-12-13,2009-12-19,1108,-1.214306e-16
47,888901330,3,3,1.0,2009-02-25,2010-10-07,816,-1.110223e-16
247,5111491524,3,3,1.0,2007-11-20,2010-07-13,902,1.249001e-16
18,329757128,2,2,1.0,2008-03-25,2010-03-16,1021,-4.440892e-16
169,3265059745,2,2,1.0,2009-09-29,2010-01-28,1068,-2.220446e-16
190,3796070387,2,2,1.0,2008-08-16,2010-02-24,1041,0.0
139,2620798443,2,2,1.0,2008-03-09,2008-04-24,1712,-1.831868e-15
48,941734431,2,2,1.0,2008-01-29,2009-12-28,1099,0.0
152,2866295111,2,2,1.0,2008-10-28,2009-04-05,1366,-1.110223e-16
217,4316764103,2,2,1.0,2008-02-14,2008-03-25,1742,0.0


In [51]:
import pandas as pd


# Locate the raw data folder relative to the project root.
PROJECT_ROOT = Path.cwd().parent
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")

# Load the daily facts and make sure the date column is a real datetime.
facts = pd.read_parquet(DATA_RAW / "facts_daily.parquet")
facts["date"] = pd.to_datetime(facts["date"])

# Earliest and latest dates present in the data.
date_min, date_max = facts["date"].min(), facts["date"].max()
print("Available range:", date_min, "→", date_max)

# One snapshot per month-end that falls inside [date_min, date_max];
# a trailing partial month has no month-end in range and so gets no snapshot.
month_ends = pd.date_range(date_min, date_max, freq="ME")
snapshot_dates = month_ends.strftime("%Y-%m-%d").tolist()

# Show the first and last seven snapshot dates as a sanity check.
snapshot_dates[:7], snapshot_dates[-7:]


Available range: 2007-10-15 00:00:00 → 2011-02-10 00:00:00


(['2007-10-31',
  '2007-11-30',
  '2007-12-31',
  '2008-01-31',
  '2008-02-29',
  '2008-03-31',
  '2008-04-30'],
 ['2010-07-31',
  '2010-08-31',
  '2010-09-30',
  '2010-10-31',
  '2010-11-30',
  '2010-12-31',
  '2011-01-31'])

In [52]:
# Build and save one snapshot for every date in snapshot_dates,
# recording (date, row count, column count) for each.
all_shapes = []

for as_of in snapshot_dates:
    signals = build_patient_signals(as_of)
    out_path = save_patient_signals(signals, as_of)

    n_rows, n_cols = signals.shape
    all_shapes.append((as_of, n_rows, n_cols))

    # Progress line: date, frame shape, and the written file's name.
    print(f"{as_of} → {signals.shape} → {out_path.name}")


2007-10-31 → (2, 8) → patient_signals_asof=2007-10-31.parquet
2007-11-30 → (4, 8) → patient_signals_asof=2007-11-30.parquet
2007-12-31 → (5, 8) → patient_signals_asof=2007-12-31.parquet
2008-01-31 → (15, 8) → patient_signals_asof=2008-01-31.parquet
2008-02-29 → (23, 8) → patient_signals_asof=2008-02-29.parquet
2008-03-31 → (35, 8) → patient_signals_asof=2008-03-31.parquet
2008-04-30 → (51, 8) → patient_signals_asof=2008-04-30.parquet
2008-05-31 → (72, 8) → patient_signals_asof=2008-05-31.parquet
2008-06-30 → (91, 8) → patient_signals_asof=2008-06-30.parquet
2008-07-31 → (115, 8) → patient_signals_asof=2008-07-31.parquet
2008-08-31 → (131, 8) → patient_signals_asof=2008-08-31.parquet
2008-09-30 → (141, 8) → patient_signals_asof=2008-09-30.parquet
2008-10-31 → (158, 8) → patient_signals_asof=2008-10-31.parquet
2008-11-30 → (175, 8) → patient_signals_asof=2008-11-30.parquet
2008-12-31 → (196, 8) → patient_signals_asof=2008-12-31.parquet
2009-01-31 → (210, 8) → patient_signals_asof=2009-01

In [53]:
from glob import glob

# Stack every saved snapshot into one long panel, tagging each row with its snapshot date.
# glob() returns matches in filesystem-dependent order; sorting makes the concat order
# (and therefore the index labels shown below) reproducible. ISO dates in the file names
# mean lexicographic order is also chronological order.
snapshot_pattern = "../data/processed/patient_signals_asof=*.parquet"
snapshot_paths = sorted(glob(snapshot_pattern))

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not snapshot_paths:
    raise FileNotFoundError(
        f"No snapshot files match {snapshot_pattern!r}; run the snapshot-building cell first."
    )

# NOTE(review): the pattern picks up every snapshot file in the folder, including any
# one-off snapshot saved outside the month-end loop — confirm that is intended.
panel_frames = []

for path in snapshot_paths:
    # The snapshot date is the part of the file name between "asof=" and ".parquet".
    as_of = path.split("patient_signals_asof=")[1].split(".parquet")[0]
    df_snap = pd.read_parquet(path)
    df_snap["as_of_date"] = pd.to_datetime(as_of)
    panel_frames.append(df_snap)

# Rows are ordered by mean_zscore_allowed (ascending) before the panel is written out.
signals_panel = pd.concat(panel_frames, ignore_index=True).sort_values("mean_zscore_allowed")
signals_panel.to_parquet("../data/processed/patient_signals_panel.parquet", index=False)

print("Panel shape:", signals_panel.shape)
signals_panel.head()

Panel shape: (10942, 9)


Unnamed: 0,provider_id,n_active_days,total_claims,mean_daily_claims,first_activity_dt,last_activity_dt,days_since_last,mean_zscore_allowed,as_of_date
1450,1033456526,1,1,1.0,2008-12-13,2008-12-13,77,-1.565322,2009-02-28
1676,1033456526,1,1,1.0,2008-12-13,2008-12-13,108,-1.565322,2009-03-31
1236,1033456526,1,1,1.0,2008-12-13,2008-12-13,49,-1.565322,2009-01-31
1918,1033456526,1,1,1.0,2008-12-13,2008-12-13,138,-1.565322,2009-04-30
1039,1033456526,1,1,1.0,2008-12-13,2008-12-13,18,-1.565322,2008-12-31
