# Consolidate labels (create all_labels.csv for featurization)

## Purpose
Consolidate/format labels into the unified all_labels.csv format expected by COUNT featurization and CLMBR batch creation.

## Inputs
- Per-task label files (`<PATH_TO_LABELS_DIR>/<task_name>/labeled_patients.csv`) generated in step 02

## Outputs
- all_labels.csv written per split and per task (at `<PATH_TO_LABELS_DIR>/<task_name>/all_labels.csv`)


In [None]:
import os
import datetime
import collections
from loguru import logger
from femr.labelers import LabeledPatients, Label
from ehrshot.labelers.core import load_labeled_patients

In [None]:
def consolidate_labels_for_featurization(
    PATH_TO_LABELS_DIR: str,
    overwrite: bool = True,
    *,
    labeler_type: str = "boolean",
) -> str:
    """Merge the label times of every task into one ``all_labels.csv``.

    Each immediate sub-directory of ``PATH_TO_LABELS_DIR`` that contains a
    ``labeled_patients.csv`` is treated as one task. For every patient the
    union of label times across all tasks is collected (truncated to the
    minute), sorted chronologically, and written back with the placeholder
    value ``False``; only the (patient, time) pairs carry information.

    NOTE(review): a later cell in this file defines another function with
    the same name; once that cell runs it replaces this definition.

    Args:
        PATH_TO_LABELS_DIR: Directory holding one sub-directory per task.
        overwrite: When False and the output file already exists, the file
            is left untouched and its path is returned immediately.
        labeler_type: Labeler type recorded in the output. Defaults to
            ``"boolean"`` because every written value is the boolean
            placeholder ``False``. The previous hard-coded value was the
            misspelled ``"miltilabel"``.
            NOTE(review): confirm that ``"boolean"`` is what the downstream
            featurizers expect for this file.

    Returns:
        Path of the output file, ``<PATH_TO_LABELS_DIR>/all_labels.csv``.
    """
    out_path = os.path.join(PATH_TO_LABELS_DIR, "all_labels.csv")
    if os.path.exists(out_path) and not overwrite:
        logger.info(f"[skip] {out_path} ， overwrite。")
        return out_path

    # Sorted so that the log line and the patient order in the output do not
    # depend on the arbitrary order returned by os.listdir().
    task_dirs = sorted(
        d for d in os.listdir(PATH_TO_LABELS_DIR)
        if os.path.isdir(os.path.join(PATH_TO_LABELS_DIR, d))
        and os.path.exists(os.path.join(PATH_TO_LABELS_DIR, d, "labeled_patients.csv"))
    )
    logger.info(f"Found {len(task_dirs)} task(s): {task_dirs}")

    # patient id -> set of minute-aligned label times seen in any task
    patient_2_times = collections.defaultdict(set)

    logger.info("Start | Consolidate patients")
    for td in task_dirs:
        lp_path = os.path.join(PATH_TO_LABELS_DIR, td, "labeled_patients.csv")
        lp = load_labeled_patients(lp_path)

        for pid, labels in lp.items():
            # Drop seconds/microseconds so times from different tasks that
            # fall within the same minute collapse into one entry.
            patient_2_times[pid].update(
                label.time.replace(second=0, microsecond=0) for label in labels
            )
    logger.info("Finish | Consolidate patients")

    logger.info("Start | Resort labels chronologically")
    merged = {
        pid: [Label(time=t, value=False) for t in sorted(times)]
        for pid, times in patient_2_times.items()
    }
    total = sum(len(labels) for labels in merged.values())
    logger.info(f"Finish | Resort labels chronologically. total label times = {total}")

    out = LabeledPatients(merged, labeler_type=labeler_type)
    out.save(out_path)
    logger.info(f"[ok] wrote {out_path}")
    return out_path


In [None]:
import os, collections, itertools, numpy as np
from typing import Any, Dict, Iterable, Tuple, List
from femr.labelers import Label, LabeledPatients
from ehrshot.labelers.core import load_labeled_patients

def _infer_labeler_type_from_values(values: Iterable[Any]) -> str:
    """ labeler_type。"""
    vals = list(itertools.islice((v for v in values if v is not None), 2000))  # 2000
    if not vals:
        return "boolean"
    if all(isinstance(v, (bool, np.bool_)) for v in vals):
        return "boolean"
    if all(isinstance(v, (int, np.integer)) for v in vals):
        return "integer"
    if all(isinstance(v, (float, np.floating)) for v in vals):
        return "float"
    if any(isinstance(v, (list, tuple, set)) for v in vals):
        return "multilabel"
    return "categorical"

def consolidate_labels_for_featurization(
    PATH_TO_LABELS_DIR: str,
    task_name: str,
    overwrite: bool = True,
    align_to_minute: bool = True,
) -> str:
    """Rewrite one task's ``labeled_patients.csv`` as ``all_labels.csv``.

    Reads ``<PATH_TO_LABELS_DIR>/<task_name>/labeled_patients.csv`` and
    writes ``<PATH_TO_LABELS_DIR>/<task_name>/all_labels.csv`` with the
    original label values preserved and each patient's labels sorted by time.
    The labeler type is taken from the loaded object when it exposes a
    usable ``labeler_type`` attribute; otherwise it is inferred from a sample
    of up to 2000 non-null label values.

    If several labels of one patient end up with the same (possibly
    minute-aligned) time, the one read last replaces the earlier ones.

    Args:
        PATH_TO_LABELS_DIR: Directory holding one sub-directory per task.
        task_name: Name of the task sub-directory to process.
        overwrite: When False and the output file already exists, the file
            is left untouched and its path is returned immediately.
        align_to_minute: When True, seconds and microseconds of every label
            time are set to zero before de-duplication.

    Returns:
        Path of the output file.

    Raises:
        FileNotFoundError: If the task's ``labeled_patients.csv`` is missing.
    """
    task_dir = os.path.join(PATH_TO_LABELS_DIR, task_name)
    in_path = os.path.join(task_dir, "labeled_patients.csv")
    out_path = os.path.join(task_dir, "all_labels.csv")

    if os.path.exists(out_path) and not overwrite:
        logger.info(f"[skip] {out_path} ， overwrite。")
        return out_path

    # The task directory is deliberately not created here: if the input file
    # exists the directory already does, and if it does not, a mistyped task
    # name must not leave an empty directory behind.
    if not os.path.exists(in_path):
        raise FileNotFoundError(f" {in_path}")

    lp = load_labeled_patients(in_path)
    labeler_type = getattr(lp, "labeler_type", None)
    sample_values: List[Any] = []

    # patient -> {aligned_time -> value}; a later label at the same time wins.
    patient_time2val: Dict[int, Dict[Any, Any]] = collections.defaultdict(dict)

    items_iter = lp.items() if hasattr(lp, "items") else lp
    for pid, labels in items_iter:
        for label in labels:
            t = label.time
            if align_to_minute:
                t = t.replace(second=0, microsecond=0)
            patient_time2val[pid][t] = label.value
            # Sample only non-null values: the inference helper ignores None,
            # so counting them would shrink the effective sample.
            if label.value is not None and len(sample_values) < 2000:
                sample_values.append(label.value)

    if labeler_type in (None, "", "unknown"):
        labeler_type = _infer_labeler_type_from_values(sample_values)
        logger.info(f"[auto]  labeler_type = {labeler_type}")

    merged: Dict[int, List[Label]] = {
        pid: [Label(time=t, value=t2v[t]) for t in sorted(t2v)]
        for pid, t2v in patient_time2val.items()
    }
    total = sum(len(labels) for labels in merged.values())

    logger.info(f"[ok] ：task={task_name}, total label times = {total}, labeler_type={labeler_type}")

    out = LabeledPatients(merged, labeler_type=labeler_type)
    out.save(out_path)
    logger.success(f"[ok] wrote {out_path}")
    return out_path


In [None]:
# Labels root passed to consolidate_labels_for_featurization; the task's
# files are read from and written to <PATH_TO_LABELS_DIR>/<task>/.
PATH_TO_LABELS_DIR = "/root/autodl-tmp/femr/held_out/femr_labels"
# Name of the task sub-directory to consolidate.
task = "mimic_icu_phenotyping"

In [None]:
# Consolidate the selected task's labels; the call returns the path of the
# all_labels.csv it wrote (or found already present).
ALL_LABELS = consolidate_labels_for_featurization(PATH_TO_LABELS_DIR,task)
# Bare expression so the notebook cell displays the returned path.
ALL_LABELS

In [None]:
\