# Project: Predicting Inpatient Clinical Deterioration from FHIR EHR Data
### Notebook: 01_Data_Pull.ipynb
### Purpose: Parse patients, observations, and encounters from local Synthea-generated FHIR JSON bundles

In [210]:
# imports
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

import pandas as pd

In [300]:
# Local folder of FHIR JSON bundles (the path points at a Synthea output directory).
fhir_path = Path("/Users/sanasiddiqui/synthea/output/fhir")
files = [*fhir_path.glob("*.json")]

# Show how many bundle files were found, plus the first two as a sanity check.
len(files), files[:2]

(558,
 [PosixPath('/Users/sanasiddiqui/synthea/output/fhir/Janyce124_Hermina428_Schuppe920_00a648b9-a457-d3ae-0178-b9d77f258e0e.json'),
  PosixPath('/Users/sanasiddiqui/synthea/output/fhir/Hung902_Friesen796_e5628e68-1f60-c682-a4c8-8eb9ab92ecbf.json')])

In [302]:
# Load the first bundle and tally how many resources of each type it contains.
f = files[0]
data = json.loads(Path(f).read_text())

types = {}
for entry in data.get("entry", []):
    kind = entry["resource"]["resourceType"]
    types.setdefault(kind, 0)
    types[kind] += 1


In [334]:

# Case-insensitive match for location names that indicate an intensive care unit.
_ICU_NAME_PATTERN = re.compile(r"ICU|Intensive", re.IGNORECASE)

# Fixed column order, so the returned frames keep their schema even when empty.
_PATIENT_COLUMNS = ["patient_id", "gender", "birth_date"]
_OBSERVATION_COLUMNS = ["patient_id", "code", "name", "value", "unit", "time"]
_ENCOUNTER_COLUMNS = [
    "patient_id", "start", "end", "class", "location", "location_type", "icu",
]


def _patient_id_from_subject(resource: dict) -> str:
    """Return the patient id referenced by ``resource["subject"]``, or "" if absent."""
    reference = (resource.get("subject") or {}).get("reference") or ""
    # Bundles may reference the patient as "urn:uuid:<id>" instead of
    # "Patient/<id>" (presumably the case for Synthea transaction bundles —
    # TODO confirm); strip that prefix too so ids line up with Patient.id.
    if reference.startswith("urn:uuid:"):
        return reference[len("urn:uuid:"):]
    return reference.replace("Patient/", "")


def _observation_row(resource: dict) -> dict:
    """Flatten one Observation resource into a row dict."""
    # `or [{}]` also covers an explicitly empty coding list, which would
    # otherwise raise IndexError on the [0] lookup.
    codings = (resource.get("code") or {}).get("coding") or [{}]
    first_coding = codings[0]
    quantity = resource.get("valueQuantity") or {}
    return {
        "patient_id": _patient_id_from_subject(resource),
        "code": first_coding.get("code"),
        "name": first_coding.get("display"),
        "value": quantity.get("value"),
        "unit": quantity.get("unit"),
        "time": resource.get("effectiveDateTime"),
    }


def _encounter_row(resource: dict) -> dict:
    """Flatten one Encounter resource into a row dict, deriving the ICU flag."""
    period = resource.get("period") or {}

    # Only the first listed location is used.
    locations = resource.get("location") or []
    loc = (locations[0].get("location") or {}) if locations else {}
    loc_name = loc.get("display")
    # NOTE(review): this key was None for every encounter in the notebook run;
    # confirm whether a different field was intended.
    loc_type = loc.get("type")

    # Plain regex search instead of building a one-element Series per encounter.
    icu_flag = bool(loc_name) and _ICU_NAME_PATTERN.search(loc_name) is not None

    return {
        "patient_id": _patient_id_from_subject(resource),
        "start": period.get("start"),
        "end": period.get("end"),
        "class": (resource.get("class") or {}).get("code"),
        "location": loc_name,
        "location_type": loc_type,
        "icu": icu_flag,
    }


def parse_fhir_bundles(
    fhir_dir: Union[str, Path],
    max_files: Optional[int] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Parse FHIR JSON bundles from a directory into DataFrames for patients, observations, and encounters.
    Adds location info and a derived ICU flag for encounters.

    Files are processed in sorted filename order so that results (and the subset
    chosen by ``max_files``) are reproducible across runs.

    Parameters:
        fhir_dir (str or Path): Directory containing ``*.json`` FHIR bundle files.
        max_files (int, optional): Maximum number of files to parse (expected to be
            non-negative). Default None = parse all files; 0 parses none.

    Returns:
        df_patients (pd.DataFrame): columns patient_id, gender, birth_date.
        df_observations (pd.DataFrame): columns patient_id, code, name, value, unit, time.
        df_encounters (pd.DataFrame): columns patient_id, start, end, class,
            location, location_type, icu.
        All three frames carry their columns even when no matching resources are found.
    """
    files = sorted(Path(fhir_dir).glob("*.json"))

    # `is not None` so that max_files=0 means "no files" rather than "all files".
    if max_files is not None:
        files = files[:max_files]

    patients = []
    observations = []
    encounters = []

    for bundle_file in files:
        with open(bundle_file, encoding="utf-8") as fh:
            bundle = json.load(fh)

        for entry in bundle.get("entry") or []:
            res = entry.get("resource")
            if not res:
                # Skip entries that carry no resource payload.
                continue
            rt = res.get("resourceType")

            if rt == "Patient":
                patients.append({
                    "patient_id": res.get("id"),
                    "gender": res.get("gender"),
                    "birth_date": res.get("birthDate"),
                })
            elif rt == "Observation":
                observations.append(_observation_row(res))
            elif rt == "Encounter":
                encounters.append(_encounter_row(res))

    df_patients = pd.DataFrame(patients, columns=_PATIENT_COLUMNS)
    df_observations = pd.DataFrame(observations, columns=_OBSERVATION_COLUMNS)
    df_encounters = pd.DataFrame(encounters, columns=_ENCOUNTER_COLUMNS)

    return df_patients, df_observations, df_encounters


In [336]:
# Parse every bundle in the directory into patient, observation, and encounter tables.
dfs = parse_fhir_bundles(fhir_dir=fhir_path)

In [337]:
# Unpack the returned tuple: patients, observations, encounters (in that order).
df_patients, df_observations, df_encounters = dfs

In [338]:
# Report (rows, columns) for each of the three tables.
tuple(frame.shape for frame in (df_patients, df_observations, df_encounters))

((556, 3), (246106, 6), (28828, 7))

In [346]:
# List the distinct location_type values found across all encounters.
df_encounters["location_type"].unique()

array([None], dtype=object)