In [1]:
import pandas as pd
import numpy as np
import os
import json
import gc

from dataclasses import dataclass
from typing import Optional, Dict, Tuple
from pathlib import Path
from tqdm import tqdm

static_vars = ["age", "sex", "height", "weight"]

dynamic_vars = ["alb", "alp", "alt", "ast", "be", "bicar", "bili", "bili_dir",
                  "bnd", "bun", "ca", "cai", "ck", "ckmb", "cl", "crea", "crp", 
                  "dbp", "fgn", "fio2", "glu", "hgb", "hr", "inr_pt", "k", "lact",
                  "lymph", "map", "mch", "mchc", "mcv", "methb", "mg", "na", "neut", 
                  "o2sat", "pco2", "ph", "phos", "plt", "po2", "ptt", "resp", "sbp", 
                  "temp", "tnt", "urine", "wbc"]

lab_vars = ["alb", "alp", "alt", "ast", "be", "bicar", "bili", "bili_dir",
                  "bnd", "bun", "ca", "cai", "ck", "ckmb", "cl", "crea", "crp", 
                  "fgn", "fio2", "glu", "hgb", "inr_pt", "k", "lact",
                  "lymph", "mch", "mchc", "mcv", "methb", "mg", "na", "neut", 
                  "pco2", "ph", "phos", "plt", "po2", "ptt", "tnt", "wbc"]

# Original Data Load..
MIMIC_ROOT_DIR = Path('/Users/korea/EHRTTA/data/miiv')
EICU_ROOT_DIR = Path('/Users/korea/EHRTTA/data/eicu')
HIRID_ROOT_DIR = Path('/Users/korea/EHRTTA/data/hirid')

In [8]:
# Data Load
dynamics_df = pd.read_csv(EICU_ROOT_DIR/'dynamics_df.csv.gz', compression='gzip')
static_df = pd.read_csv(EICU_ROOT_DIR/'static_df.csv.gz', compression='gzip')

mapping = {'anchor_age' : 'age', 'gender' : 'sex'}
static_df['var_name'] = static_df['var_name'].replace(mapping)

with open('/Users/korea/EHRTTA/data/concept-dict.json', 'r') as f:
    concept_dict = json.load(f)

  static_df = pd.read_csv(EICU_ROOT_DIR/'static_df.csv.gz', compression='gzip')


In [9]:
static_df.var_name.unique()

array(['age', 'sex', 'weight', 'height'], dtype=object)

In [3]:
# 해당 데이터셋마다 필요한 값들을 excel 형식으로 변환해보자.

def select_vars(db_name, var_list, base_dict):
    """
    min, max => 이상치 처리 시 사용
    cateogries => 해당 변수 category 분류 시 사용
    source_table => 어느 테이블에서 가져와야 하는지
    source_columns => 해당 테이블 어느 컬럼에서 가져와야 하는지
    source_itemid => 해당 테이블의 컬럼에서 어떤 id로 존재하는지
    """
    var_normal_min = []
    var_normal_max = []
    var_unit = []
    source_category = []
    source_table = []
    source_column = []
    source_itemid = []
    source_callback = []
    source_vars = [] # to map other lists
    for var in tqdm(var_list):

        for i in range(len(base_dict[var]['sources'][db_name])): # 여러 테이블에 분산되어 있는 경우 길이가 2가 넘을 수 있기 때문

            source_table.append(base_dict[var]['sources'][db_name][i]['table'])

            try:
                source_column.append(base_dict[var]['sources'][db_name][i]['sub_var'])
            except:
                source_column.append(base_dict[var]['sources'][db_name][i]['val_var'])

            try:
                if [223761, 224027] == base_dict[var]['sources'][db_name][i]['ids']: # temp 관련 변수에서 skin temp는 제외 
                    source_itemid.append(223761)
                
                else:
                    source_itemid.append(base_dict[var]['sources'][db_name][i]['ids'])
            except:
                source_itemid.append(None)

            try:
                source_callback.append(base_dict[var]['sources'][db_name][i]['callback'])
            except:
                source_callback.append(None)


            source_category.append(base_dict[var]['category'])

            try:
                var_unit.append(base_dict[var]['unit'])
            except:
                var_unit.append(None)

            try:
                var_normal_min.append(base_dict[var]['min'])
            except:
                var_normal_min.append(None)
            
            try:
                var_normal_max.append(base_dict[var]['max'])
            except:
                var_normal_max.append(None)

            source_vars.append(var)
        
    return var_normal_min, var_normal_max, var_unit, source_category, source_table, source_column, source_itemid, source_callback, source_vars

var_normal_min, var_normal_max, var_unit, source_category, source_table, source_column, source_itemid, source_callback, source_vars  = select_vars('miiv', static_vars + dynamic_vars, concept_dict)

MAPPING_DF = pd.DataFrame({
    'var_name' : source_vars,
    'normal_min' : var_normal_min,
    'normal_max' : var_normal_max,
    'category' : source_category,
    'table' : source_table,
    'column' : source_column,
    'itemid' : source_itemid,
    'unit' : var_unit,
    'method' : source_callback 
})

# FUll name 및 추가 
full_name_dict = {'age' : 'Age', 'sex' : 'Sex', 'height' : 'Height', 'weight' : 'Weight', 'sbp' : 'Blood pressure (systolic)', 'dbp' : 'Blood pressure (diastolic)', 'hr' : 'Heart rate', 'map' : 'Mean arterial pressure', 'o2sat' : 'Oxygen saturation',
                  'resp' : 'Respiratory rate', 'temp' : 'Temperature', 'alb' : 'Albumin', 'alp' : 'Alkaline phosphatase', 'alt' : 'Alanine aminotransferase', 'ast' : 'Aspartate aminotransferase', 'be' : 'Base excess', 'bicar' : 'Bicarbonate',
                  'bili' : 'Bilirubin (total)', 'bili_dir' : 'Bilirubin (direct)', 'bnd' : 'Band form neutrophils', 'bun' : 'Blood urea nitrogen', 'ca' : 'Calcium', 'cai' : 'Calcium ionized', 'crea' : 'Creatinine', 'ck' : 'Creatinine kinase',
                  'ckmb' : 'Creatinine kinase MB', 'cl' : 'Chloride', 'pco2' : 'CO2 partial pressure', 'crp' : 'C-reactive protein', 'fgn' : 'Fibrinogen', 'glu' : 'Glucose', 'hgb' : 'Haemoglobin', 'inr_pt' : 'International normalised ratio (INR)',
                  'lact' : 'Lactate', 'lymph' : 'Lymphocytes', 'mch' : 'Mean cell haemoglobin', 'mchc' : 'Mean corpuscular haemoglobin concentration', 'mcv' : 'Mean corpuscular volume', 'methb' : 'Methaemoglobin', 'mg' : 'Magnesium', 
                  'neut' : 'Neutrophils', 'po2' : 'O2 partial pressure', 'ptt' : 'Partial thromboplastin time', 'ph' : 'pH of blood', 'phos' : 'Phosphate', 'plt' : 'Platelets', 'k' : 'Potassium', 'na' : 'Sodium', 'tnt' : 'Troponin T', 
                  'wbc' : 'White blood cells', 'fio2' : 'Fraction of inspired oxygen', 'urine' : 'Urine output'
                  }

fixed_unit_dict = {'age' : 'Years', 'sex' : '', 'height' : 'cm', 'weight' : 'kg', 'sbp' : 'mmHg', 'dbp' : 'mmHg', 'hr' : 'beats/minute', 'map' : 'mmHg', 'o2sat' : '%',
                  'resp' : 'breaths/minute', 'temp' : '°C', 'alb' : 'g/dL', 'alp' : 'IU/L', 'alt' : 'IU/L', 'ast' : 'IU/L', 'be' : 'mmol/L', 'bicar' : 'mmol/L',
                  'bili' : 'mg/dL', 'bili_dir' : 'mg/dL', 'bnd' : '%', 'bun' : 'mg/dL', 'ca' : 'mg/dL', 'cai' : 'mmol/L', 'crea' : 'mg/dL', 'ck' : 'IU/L',
                  'ckmb' : 'ng/mL', 'cl' : 'mmol/L', 'pco2' : 'mmHg', 'crp' : 'mg/L', 'fgn' : 'mg/dL', 'glu' : 'mg/dL', 'hgb' : 'g/dL', 'inr_pt' : '',
                  'lact' : 'mmol/L', 'lymph' : '%', 'mch' : 'pg', 'mchc' : '%', 'mcv' : 'fL', 'methb' : '%', 'mg' : 'mg/dL', 
                  'neut' : '%', 'po2' : 'mmHg', 'ptt' : 'sec', 'ph' : '', 'phos' : 'mg/dL', 'plt' : '1,000 / μL', 'k' : 'mmol/L', 'na' : 'mmol/L', 'tnt' : 'ng/mL', 
                  'wbc' : '1,000 / μL', 'fio2' : '%', 'urine' : 'mL'
                  }

MAPPING_DF['full_var_name'] = MAPPING_DF['var_name'].apply(lambda x : full_name_dict[x])
MAPPING_DF['fixed_unit'] = MAPPING_DF['var_name'].apply(lambda x : fixed_unit_dict[x])
MAPPING_DF = MAPPING_DF[['var_name', 'normal_min', 'normal_max', 'full_var_name', 'fixed_unit']] # only using variables
MAPPING_DF

100%|██████████| 52/52 [00:00<00:00, 51941.85it/s]


Unnamed: 0,var_name,normal_min,normal_max,full_var_name,fixed_unit
0,age,0.0,100.0,Age,Years
1,sex,,,Sex,
2,height,10.0,230.0,Height,cm
3,weight,1.0,500.0,Weight,kg
4,alb,0.0,6.0,Albumin,g/dL
5,alp,0.0,,Alkaline phosphatase,IU/L
6,alt,0.0,,Alanine aminotransferase,IU/L
7,ast,0.0,,Aspartate aminotransferase,IU/L
8,be,-25.0,25.0,Base excess,mmol/L
9,bicar,5.0,50.0,Bicarbonate,mmol/L


In [4]:
# lab vars 변경
lab_vars = [full_name_dict[i] for i in lab_vars]
print(len(lab_vars))

40


In [5]:
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Set, Any
import numpy as np
import pandas as pd


# ============================================================
# Config
# ============================================================
@dataclass
class MarkdownSummaryConfig:
    # --- Input dataframe column names ---
    # charttime is already "minutes since admission" (int)
    time_col: str = "charttime"     # minutes since ICU admission (int)
    id_col: str = "stay_id"
    var_col: str = "full_var_name"
    val_col: str = "value"
    unit_col: str = "fixed_unit"
    normal_min : str = 'normal_min'
    normal_max : str = 'normal_max'
    
    # --- Feature definitions ---
    # slope = (Δvalue) / (Δminutes)
    # variability = abs(Δvalue) by default
    variability_mode: str = "abs_delta"  # "abs_delta" or "rolling_std"
    rolling_window: int = 5              # used when variability_mode == "rolling_std"
    min_points_for_stats: int = 2        # rolling/std 최소 관측 수

    # --- Formatting ---
    round_ndigits: int = 3


# ============================================================
# Formatting helpers
# ============================================================
def fmt_number(x: Any, ndigits: int = 3, type='measurement') -> str:
    """Format numbers; use 'Not observed' for None/NaN."""
    
    if x is None and type == 'measurement':
        return "Not observed"
    
    if pd.isna(x) and type == 'normal_max':
        return "inf"
    
    if pd.isna(x) and type == 'normal_min':
        return "-inf"

    try:
        if isinstance(x, float) and np.isnan(x):
            return "Not observed"
    except Exception:
        pass

    if isinstance(x, (int, np.integer)):
        return str(int(x))

    try:
        return f"{float(x):.{ndigits}f}"
    except Exception:
        return "Not observed"


def nan_stats(x: np.ndarray) -> Dict[str, Optional[float]]:
    """Return min/max/mean/std on numeric array, ignoring NaNs."""
    x = np.asarray(x, dtype=float)
    x = x[~np.isnan(x)]
    if x.size == 0:
        return {"min": None, "max": None, 'median' : None ,"mean": None, "std": None}

    std = float(np.std(x, ddof=1)) if x.size > 1 else 0.0
    return {
        "min": float(np.min(x)),
        "max": float(np.max(x)),
        'median' : float(np.median(x)),
        "mean": float(np.mean(x)),
        "std": std,
    }


# ============================================================
# Feature computation (per variable)
# ============================================================
def compute_variable_features(
    var_df: pd.DataFrame,
    cfg: MarkdownSummaryConfig,
) -> Dict[str, Any]:
    """
    Compute summary features for ONE variable group.
    Assumption: cfg.time_col is already minutes (int) since admission.
    """
    df_sorted = var_df.sort_values(cfg.time_col).copy()

    times_min = pd.to_numeric(df_sorted[cfg.time_col], errors="coerce").to_numpy(dtype=float)
    values = pd.to_numeric(df_sorted[cfg.val_col], errors="coerce").to_numpy(dtype=float)

    # observation count (non-NaN values)
    n_obs = int(np.sum(~np.isnan(values)))

    # first/last time (minutes)
    first_min = float(times_min[0]) if times_min.size else None
    last_min = float(times_min[-1]) if times_min.size else None

    # interval mean (unique time 기준)
    unique_times = np.unique(times_min[~np.isnan(times_min)])
    if unique_times.size >= 2:
        interval_mean_min = float(np.mean(np.diff(unique_times)))
    else:
        interval_mean_min = None

    # value stats
    value_stats = nan_stats(values)

    # slope stats: Δy / Δt (dt>0)
    slopes = np.array([], dtype=float)
    if times_min.size >= 2:
        dt = np.diff(times_min)
        dy = np.diff(values)
        mask = (~np.isnan(dt)) & (~np.isnan(dy)) & (dt > 0)
        if np.any(mask):
            slopes = dy[mask] / dt[mask]
    slope_stats = nan_stats(slopes) if slopes.size else {"min": None, "max": None, "mean": None, "std": None}

    # variability stats
    if cfg.variability_mode == "abs_delta":
        variability_seq = np.abs(np.diff(values))
        variability_seq = variability_seq[~np.isnan(variability_seq)]
    elif cfg.variability_mode == "rolling_std":
        rolling_std = (
            pd.Series(values)
            .rolling(cfg.rolling_window, min_periods=cfg.min_points_for_stats)
            .std()
            .to_numpy(dtype=float)
        )
        variability_seq = rolling_std[~np.isnan(rolling_std)]
    else:
        raise ValueError("variability_mode must be 'abs_delta' or 'rolling_std'")

    variability_stats = nan_stats(variability_seq) if variability_seq.size else {"min": None, "max": None, "mean": None, "std": None}

    return {
        "first_min": first_min,
        "last_min": last_min,
        "n_obs": n_obs,
        "interval_mean_min": interval_mean_min,
        "value_stats": value_stats,
        "slope_stats": slope_stats,
        "variability_stats": variability_stats,
    }


# ============================================================
# Markdown builder (per patient)
# ============================================================
def build_patient_markdown_summary(
    observations: pd.DataFrame,
    demographics: pd.DataFrame,
    mapping_df : pd.DataFrame,
    cfg: Optional[MarkdownSummaryConfig] = None,
    labs_title: str = "Summary of Lab Results",
    lab_var_list: Optional[list[str]] = None,
    output_var_names: Optional[Set[str]] = ['Urine output'],
) -> str:
    """
    observations: one patient's observation rows
      - must include: cfg.time_col (minutes), cfg.var_col, cfg.val_col
    demographics: {"Age":..., "Gender":..., "Weight":..., "Height":...}
    normal_ranges: {"WBC": (4,11), ...}
    output_var_names: variables to treat as "Output Events" section
    """
    cfg = cfg or MarkdownSummaryConfig()
    output_var_names = output_var_names or set()

    df = observations.copy()
    df[cfg.val_col] = pd.to_numeric(df[cfg.val_col], errors="coerce")
    df[cfg.time_col] = pd.to_numeric(df[cfg.time_col], errors="coerce")  # ensure numeric minutes

    # ----------------------------
    # Header: demographics
    # ----------------------------
    md_lines = []
    md_lines.append("# Patient Demographics at ICU Admission\n")
    md_lines.append(f"- Age : {fmt_number(demographics[demographics['full_var_name'] == 'Age']['value'].values[0])}")
    md_lines.append(f"- Gender : {demographics[demographics['full_var_name'] == 'Sex']['value'].values[0]}")

    try:
        md_lines.append(f"- Weight : {fmt_number(demographics[demographics['full_var_name'] == 'Weight']['value'].values[0])}kg")
    except:
        md_lines.append(f"- Weight : Not observed")
    
    try:
        md_lines.append(f"- Height : {fmt_number(demographics[demographics['full_var_name'] == 'Height']['value'].values[0])}cm \n")
    except:
        md_lines.append(f"- Height : Not observed")

    md_lines.append("")

    # ----------------------------
    # Split labs vs outputs
    # ----------------------------
    is_output = df[cfg.var_col].isin(output_var_names)
    labs_df = df[~is_output].copy()
    output_df = df[is_output].copy()


    mapping_key = cfg.var_col  # "full_var_name"

    meta_cols = [cfg.unit_col, cfg.normal_min, cfg.normal_max]
    meta_cols = [c for c in meta_cols if c in mapping_df.columns]

    var_meta = (
        mapping_df.drop_duplicates(subset=[mapping_key])
                .set_index(mapping_key)[meta_cols]
                .to_dict(orient="index"))
    
    def render_variable_section(title: str, section_df: pd.DataFrame, var_list: list[str]) -> list[str]:
        section_lines = [f"# {title}"]

        for var_name in var_list:
            # 1) metadata from MappingDF (unit + normal range)
            meta = var_meta.get(var_name, {})
            unit_str = meta.get(cfg.unit_col, "") if meta else ""
            ref_min = meta.get(cfg.normal_min, None) if meta else None
            ref_max = meta.get(cfg.normal_max, None) if meta else None

            # 2) patient rows for this var (may be empty)
            var_rows = section_df[section_df[cfg.var_col] == var_name]

            section_lines.append(f"- [{var_name}] ({unit_str})\n")
            section_lines.append(
                f"\t- Normal Value Range : {fmt_number(ref_min, cfg.round_ndigits, 'normal_min')} to {fmt_number(ref_max, cfg.round_ndigits, 'normal_max')}"
            )

            # 3) if no observations -> keep unit/range, but stats/time are Not observed
            n_valid = int(var_rows[cfg.val_col].notna().sum()) if not var_rows.empty else 0
            if n_valid == 0:
                section_lines.append("\t- (Not observed)")
                continue

            # 4) has observations -> compute features
            feats = compute_variable_features(var_rows, cfg)
            section_lines.append(
                f"\t- First/Last Obs : {fmt_number(feats['first_min'], 0)} / {fmt_number(feats['last_min'], 0)}, "
                f"N : {fmt_number(feats['n_obs'], 0)}, Interval : {fmt_number(feats['interval_mean_min'], 0)}"
            )

            vs = feats["value_stats"]
            section_lines.append(
                "\t- Statistics: "
                f"[{fmt_number(vs['min'], cfg.round_ndigits)}, {fmt_number(vs['max'], cfg.round_ndigits)}, "
                f"{fmt_number(vs['median'], cfg.round_ndigits)}, {fmt_number(vs['mean'], cfg.round_ndigits)}, "
                f"{fmt_number(vs['std'], cfg.round_ndigits)}]"
            )
            section_lines.append("")

        return section_lines


    # def render_variable_section(title: str, section_df: pd.DataFrame, var_list : list = None) -> list[str]:
    #     section_lines = [f"# {title}"]
    #     section_lines.append(f"## Time unit : minutes after admission, Obs : Observation Time, N : observation count, Interval : mean interval between observations, Statistics : [min, max, median, mean, std]")

    #     if section_df.empty:
    #         section_lines.append("- (Not observed)\n")
    #         return section_lines

    #     for var_name, var_df in section_df.groupby(cfg.var_col):
    #         # unit
    #         unit_str = ""
    #         if cfg.unit_col in var_df.columns and var_df[cfg.unit_col].notna().any():
    #             unit_str = str(var_df[cfg.unit_col].dropna().iloc[0])

    #         # normal range
    #         ref_min, ref_max = var_df[cfg.normal_min].unique(), var_df[cfg.normal_max].unique()

    #         # features
    #         feats = compute_variable_features(var_df, cfg)

    #         # (옵션) slope/variability level rule 넣고 싶으면 여기에 정의
    #         slope_level = "None"
    #         variability_level = "None"

    #         section_lines.append(f"- [{var_name}] ({unit_str})\n")
    #         section_lines.append(
    #             f"\t- Normal Value Range : {fmt_number(ref_min, cfg.round_ndigits)} to {fmt_number(ref_max, cfg.round_ndigits)}"
    #         )
    #         section_lines.append(f"\t- First/Last Obs : {fmt_number(feats['first_min'], 0)} / {fmt_number(feats['last_min'], 0)}," \
    #                              f" N : {fmt_number(feats['n_obs'], 0)}, Interval : {fmt_number(feats['interval_mean_min'], 0)}")

    #         vs = feats["value_stats"]
    #         section_lines.append(
    #             "\t- Statistics: "
    #             # f"Min = {fmt_number(vs['min'], cfg.round_ndigits)}, "
    #             # f"Max = {fmt_number(vs['max'], cfg.round_ndigits)}, "
    #             # f"Median = {fmt_number(vs['median'], cfg.round_ndigits)}, "
    #             # f"Mean = {fmt_number(vs['mean'], cfg.round_ndigits)}, "
    #             # f"STD = {fmt_number(vs['std'], cfg.round_ndigits)}"
    #             f"[{fmt_number(vs['min'], cfg.round_ndigits)}, {fmt_number(vs['max'], cfg.round_ndigits)}, {fmt_number(vs['median'], cfg.round_ndigits)}, {fmt_number(vs['mean'], cfg.round_ndigits)}, {fmt_number(vs['std'], cfg.round_ndigits)}]"
    #         )

    #         # ss = feats["slope_stats"]
    #         # section_lines.append(
    #         #     "\t- Statistics of Slopes : "
    #         #     f"Min = {fmt_number(ss['min'], cfg.round_ndigits)} ({slope_level}), "
    #         #     f"Max = {fmt_number(ss['max'], cfg.round_ndigits)} ({slope_level}), "
    #         #     f"Mean = {fmt_number(ss['mean'], cfg.round_ndigits)} ({slope_level}), "
    #         #     f"STD = {fmt_number(ss['std'], cfg.round_ndigits)}"
    #         # )

    #         # vb = feats["variability_stats"]
    #         # section_lines.append(
    #         #     "\t- Statistics of Variability : "
    #         #     f"Min = {fmt_number(vb['min'], cfg.round_ndigits)} ({variability_level}), "
    #         #     f"Max = {fmt_number(vb['max'], cfg.round_ndigits)} ({variability_level}), "
    #         #     f"Mean = {fmt_number(vb['mean'], cfg.round_ndigits)} ({variability_level}), "
    #         #     f"STD = {fmt_number(vb['std'], cfg.round_ndigits)}"
    #         # )
    #         section_lines.append("")  # blank line between vars

    #     return section_lines

    
    md_lines.append("## Time unit : minutes after admission, Obs : Observation Time, N : observation count, "
            "Interval : mean interval between observations, Statistics : [min, max, median, mean, std]")
    md_lines += render_variable_section(labs_title, labs_df, lab_var_list)
    md_lines.append("")  # spacer
    md_lines += render_variable_section("Summary of Output Events", output_df, ['Urine output'])

    return "\n".join(md_lines).strip()


# ============================================================
# Example Usage
# ============================================================
if __name__ == "__main__":


    cfg = MarkdownSummaryConfig(
        time_col="charttime",        # already minutes (int)
        variability_mode="abs_delta",
        rolling_window=5,
        round_ndigits=3
    )

    stay_ids = []
    texts = []

    for sid in tqdm(static_df.stay_id.unique()):
        # change
        patient_df = dynamics_df[(dynamics_df['stay_id'] == sid)]
        patient_df = pd.merge(patient_df, MAPPING_DF, on = 'var_name', how = 'left')

        demographics = static_df[static_df['stay_id'] == sid]
        demographics = pd.merge(demographics, MAPPING_DF, on = 'var_name', how = 'left')


        md_text = build_patient_markdown_summary(
            observations=patient_df,
            demographics=demographics,
            mapping_df=MAPPING_DF,
            cfg=cfg,
            lab_var_list = lab_vars
        )

        stay_ids.append(sid)
        texts.append(md_text)

100%|██████████| 13109/13109 [10:44<00:00, 20.33it/s]


In [6]:
text_df = pd.DataFrame(columns=['stay_id', 'text'])
text_df['stay_id'] = stay_ids
text_df['text'] = texts

text_df.to_csv(HIRID_ROOT_DIR/'text_df.csv.gz', compression='gzip', index = False)

In [129]:
with open("example.txt", "w", encoding="utf-8") as f:
    f.write(max_text)

In [None]:
text_df['length'] = text_df['text'].apply(lambda x : len(x))
text_df

Unnamed: 0,stay_id,text,length
0,2,# Patient Demographics at ICU Admission\n\n- A...,6326
1,4,# Patient Demographics at ICU Admission\n\n- A...,5869
2,5,# Patient Demographics at ICU Admission\n\n- A...,5997
3,7,# Patient Demographics at ICU Admission\n\n- A...,6528
4,9,# Patient Demographics at ICU Admission\n\n- A...,6588
...,...,...,...
13104,33897,# Patient Demographics at ICU Admission\n\n- A...,6614
13105,33898,# Patient Demographics at ICU Admission\n\n- A...,6145
13106,33900,# Patient Demographics at ICU Admission\n\n- A...,5146
13107,33901,# Patient Demographics at ICU Admission\n\n- A...,7002


In [10]:
max_length = text_df['length'].max()
max_text = text_df[text_df['length'] == max_length]['text'].values[0]
print(max_text)

# Patient Demographics at ICU Admission

- Age : 70.000
- Gender : M
- Weight : 85.000kg
- Height : 175.000cm 


## Time unit : minutes after admission, Obs : Observation Time, N : observation count, Interval : mean interval between observations, Statistics : [min, max, median, mean, std]
# Summary of Lab Results
- [Albumin] (g/dL)

	- Normal Value Range : 0.000 to 6.000
	- First/Last Obs : 1018 / 1018, N : 1, Interval : Not observed
	- Statistics: [2.000, 2.000, 2.000, 2.000, 0.000]

- [Alkaline phosphatase] (IU/L)

	- Normal Value Range : 0.000 to inf
	- First/Last Obs : 1018 / 1018, N : 1, Interval : Not observed
	- Statistics: [115.000, 115.000, 115.000, 115.000, 0.000]

- [Alanine aminotransferase] (IU/L)

	- Normal Value Range : 0.000 to inf
	- First/Last Obs : 1018 / 1018, N : 1, Interval : Not observed
	- Statistics: [1506.000, 1506.000, 1506.000, 1506.000, 0.000]

- [Aspartate aminotransferase] (IU/L)

	- Normal Value Range : 0.000 to inf
	- First/Last Obs : 1018 / 1018, N : 1

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Llama-3.1-8B" 

TOKEN = 'your tokens'
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=f"{TOKEN}")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)
model.eval()

enc = tokenizer(
    max_text,
    return_tensors="pt",
    truncation=True,
    max_length=8192,   # 필요 시 조절
)

# device로 이동
enc = {k: v.to(model.device) for k, v in enc.items()}

with torch.no_grad():
    # input embedding table에서 바로 lookup
    token_embeddings = model.get_input_embeddings()(enc["input_ids"])
    # shape: [batch=1, seq_len, hidden_size]

print(token_embeddings.shape)


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.41s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


torch.Size([1, 3327, 4096])


In [14]:
full_ids = tokenizer(max_text, add_special_tokens=False).input_ids
print("full length:", len(full_ids))
print("will be truncated to:", 8192, "?", len(full_ids) > 8192)

full length: 3326
will be truncated to: 8192 ? False


In [15]:
enc = tokenizer(
    md_text,
    return_tensors="pt",
    max_length=4096,
    return_overflowing_tokens=False,
)

ids = enc["input_ids"][0].tolist()                 # [seq_len]
tokens = tokenizer.convert_ids_to_tokens(ids)      # 토큰 문자열(대부분 BPE 조각)

for i, (tid, tok) in enumerate(zip(ids[:200], tokens[:200])):  # 앞 200개만 예시
    print(f"{i:4d}  {tid:8d}  {repr(tok)}")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


   0    128000  '<|begin_of_text|>'
   1         2  '#'
   2     30024  'ĠPatient'
   3      4829  'ĠDem'
   4     45245  'ographics'
   5       520  'Ġat'
   6     85015  'ĠICU'
   7     63446  'ĠAdmission'
   8       271  'ĊĊ'
   9        12  '-'
  10     13381  'ĠAge'
  11       551  'Ġ:'
  12       220  'Ġ'
  13      2031  '70'
  14        13  '.'
  15       931  '000'
  16       198  'Ċ'
  17        12  '-'
  18     29317  'ĠGender'
  19       551  'Ġ:'
  20       435  'ĠF'
  21       198  'Ċ'
  22        12  '-'
  23     16923  'ĠWeight'
  24       551  'Ġ:'
  25       220  'Ġ'
  26      1399  '60'
  27        13  '.'
  28       931  '000'
  29      7501  'kg'
  30       198  'Ċ'
  31        12  '-'
  32     22147  'ĠHeight'
  33       551  'Ġ:'
  34       220  'Ġ'
  35      8258  '170'
  36        13  '.'
  37       931  '000'
  38      6358  'cm'
  39     15073  'ĠĊĊĊ'
  40       567  '##'
  41      4212  'ĠTime'
  42      5089  'Ġunit'
  43       551  'Ġ:'
  44      4520  'Ġmi