In [44]:
import re
import hashlib
import pandas as pd
import random as py_random
from docx import Document
from pathlib import Path
from datetime import datetime, timedelta

rows = 1000
df = pd.read_csv("../Datasets/discharge.csv", nrows=rows)
df.to_json('../Datasets/test.json')

In [45]:
df

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...
...,...,...,...,...,...,...,...,...
995,10030753-DS-40,10030753,25629024,DS,40,2199-05-16 00:00:00,2199-05-22 16:49:00,\nName: ___ Unit No: ___\n \nA...
996,10030753-DS-41,10030753,27987271,DS,41,2199-06-02 00:00:00,2199-06-02 17:45:00,\nName: ___ Unit No: ___\n \nA...
997,10030753-DS-42,10030753,24506973,DS,42,2199-07-23 00:00:00,2199-07-23 17:57:00,\nName: ___ Unit No: ___\n \nA...
998,10030753-DS-43,10030753,26512817,DS,43,2199-08-02 00:00:00,2199-08-03 18:55:00,\nName: ___ Unit No: ___\n \nA...


In [46]:
patient_df = df.drop(['note_type', 'note_seq', 'storetime'], axis=1)
patient_df

Unnamed: 0,note_id,subject_id,hadm_id,charttime,text
0,10000032-DS-21,10000032,22595853,2180-05-07 00:00:00,\nName: ___ Unit No: _...
1,10000032-DS-22,10000032,22841357,2180-06-27 00:00:00,\nName: ___ Unit No: _...
2,10000032-DS-23,10000032,29079034,2180-07-25 00:00:00,\nName: ___ Unit No: _...
3,10000032-DS-24,10000032,25742920,2180-08-07 00:00:00,\nName: ___ Unit No: _...
4,10000084-DS-17,10000084,23052089,2160-11-25 00:00:00,\nName: ___ Unit No: __...
...,...,...,...,...,...
995,10030753-DS-40,10030753,25629024,2199-05-16 00:00:00,\nName: ___ Unit No: ___\n \nA...
996,10030753-DS-41,10030753,27987271,2199-06-02 00:00:00,\nName: ___ Unit No: ___\n \nA...
997,10030753-DS-42,10030753,24506973,2199-07-23 00:00:00,\nName: ___ Unit No: ___\n \nA...
998,10030753-DS-43,10030753,26512817,2199-08-02 00:00:00,\nName: ___ Unit No: ___\n \nA...


In [47]:
randoms = rnd.randint(1,len(patient_df))

test_cell = patient_df.iloc[randoms, 4]    

print(f"{random}, {test_cell}")

<module 'random' from '/home/maui/miniconda3/lib/python3.13/random.py'>,  
Name:  ___                Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   M
 
Service: MEDICINE
 
Allergies: 
codeine
 
Attending: ___.
 
Chief Complaint:
chest pain
 
Major Surgical or Invasive Procedure:
Left heart catheterization

 
History of Present Illness:
___ with history of MI x 2 (s/p LAD bare metal stent at ___ 
___ in ___), ___'s, emphysema, who presented with 
chest pain. For the past several weeks, has had intermittent 
sharp chest pain, lasting 5 seconds maximum but increasing in 
frequency, Denies any radiation up the jaw or down the left arm, 
although experiences radiation to the back. Chest pain isn't 
associated with exertion, would occur randomly. Endorses recent 
increasing dyspnea on exertion, nausea and diaphoresis for the 
past several weeks.

Extensive and severe calcification of the coronary arteries, 
mainly 
of the LAD

In [51]:
# ---------- CONFIG ----------
USE_JSON_SOURCE = False  # True -> read from JSON instead of using an existing df
JSON_PATH = "../Datasets/test.json"
CSV_PATH = "../Datasets/discharge.csv"
N_ROWS = 1000

# Pick 5 patients w/ >=3 notes, export 3 notes each
N_PATIENTS = 5
NOTES_PER_PATIENT = 3

INJECT_NAME_IN_TEXT = True
MAKE_SUBFOLDERS_PER_PATIENT = True
OUTPUT_DIR = Path("../Documents")

REQUIRED = ['note_id', 'subject_id', 'hadm_id', 'text']
POSSIBLE_DATE_COLS = ['charttime','chartdate','admittime','dischtime','date','timestamp','storetime']
DATE_FMT = "%m/%d/%Y"
TARGET_YEAR = datetime.now().year  # keep within "this year"

# ---------------- LOAD ----------------
DATA = pd.read_json(JSON_PATH) if USE_JSON_SOURCE else pd.read_csv(CSV_PATH, nrows=N_ROWS)
missing = [c for c in REQUIRED if c not in DATA.columns]
if missing:
    raise KeyError(f"Missing required columns: {missing}")

DATA = DATA.copy()
# Prefer chartdate for ordering if present
date_col = 'chartdate' if 'chartdate' in DATA.columns else None
if date_col:
    DATA[date_col] = pd.to_datetime(DATA[date_col], errors='coerce')

# ---------------- HELPERS ----------------
def stable_seed_from_values(*vals) -> int:
    s = "|".join(map(str, vals))
    h = hashlib.md5(s.encode("utf-8")).hexdigest()
    return int(h[:8], 16)  # 32-bit

def stable_rng_from_values(*vals) -> py_random.Random:
    return py_random.Random(stable_seed_from_values(*vals))

FIRST_NAMES = ["Avery","Jordan","Taylor","Riley","Quinn","Casey","Charlie","Drew","Emerson","Harper",
               "Rowan","Peyton","Alex","Sam","Jamie","Cameron","Elliot","Morgan","Skyler","Reese"]
LAST_NAMES  = ["Hayes","Reed","Parker","Brooks","Miller","Carter","Young","Bennett","Cooper","Gray",
               "Foster","Hughes","Jenkins","Wells","Holland","Bishop","Griffin","Spencer","Blake","Sullivan"]

DOC_FIRST   = ["Chris","Dana","Leslie","Robin","Jesse","Shawn","Kendall","Marin","Devin","Noel",
               "Ari","Cory","Hayden","Jules","Micah","Parker","Reagan","Rowan","Sage","Toby"]
DOC_LAST    = ["Andrews","Barnes","Chandler","Dalton","Edwards","Finch","Gallagher","Harmon","Irving","Jacobs",
               "Keaton","Lang","Maddox","Nielsen","Osborne","Patel","Quincy","Ramirez","Singh","Turner"]

def patient_name_for_subject(subject_id):
    rng = stable_rng_from_values("patient_name", subject_id)
    return f"{FIRST_NAMES[rng.randrange(len(FIRST_NAMES))]} {LAST_NAMES[rng.randrange(len(LAST_NAMES))]}"

def attending_for_subject(subject_id):
    rng = stable_rng_from_values("attending", subject_id)
    return f"Dr. {DOC_FIRST[rng.randrange(len(DOC_FIRST))]} {DOC_LAST[rng.randrange(len(DOC_LAST))]}"

def dob_for_subject(subject_id, anchor_year=TARGET_YEAR):
    """Age 18–99; stable per subject; not tied to chartdate."""
    rng = stable_rng_from_values("dob", subject_id)
    age = rng.randrange(18, 100)
    base = datetime(anchor_year, 7, 1)
    days_into_year = rng.randrange(0, 365)
    return (base - timedelta(days=age*365)) - timedelta(days=days_into_year)

def admit_discharge_for_row_in_year(row, year=TARGET_YEAR):
    """
    Random admission/discharge within the given year, 2–30 day stay.
    Independent of chartdate (used only for ordering).
    """
    rng = stable_rng_from_values("stay", row.get('note_id'), row.get('subject_id'), year)
    stay_days = rng.randrange(2, 31)  # inclusive 2..30
    start_of_year = datetime(year, 1, 1)
    start_next_year = datetime(year + 1, 1, 1)
    days_in_year = (start_next_year - start_of_year).days  # 365/366

    # Choose an admission day that leaves room for the full stay
    latest_admit_day_index = max(0, days_in_year - stay_days - 1)
    admit_index = rng.randrange(0, max(1, latest_admit_day_index + 1))
    admit_dt = start_of_year + timedelta(days=admit_index)
    discharge_dt = admit_dt + timedelta(days=stay_days)

    # Safety clamp (shouldn't be needed but just in case)
    if discharge_dt >= start_next_year:
        discharge_dt = start_next_year - timedelta(days=1)
        admit_dt = max(start_of_year, discharge_dt - timedelta(days=stay_days))
    return admit_dt, discharge_dt

def take_chronological(gdf: pd.DataFrame, k: int) -> pd.DataFrame:
    if date_col and gdf[date_col].notna().any():
        return gdf.sort_values([date_col, 'note_id'], kind='stable').head(k)
    try:
        return gdf.sort_values(['note_id'], kind='stable').head(k)
    except Exception:
        return gdf.head(k)

def safe_name(s: str) -> str:
    return "".join(ch for ch in str(s) if ch not in r'\/:*?"<>|').strip() or "unnamed"

def personalize_text(raw_text: str, patient_name: str, dob_str: str, attending_name: str,
                     admit_str: str, discharge_str: str) -> str:
    """Fill placeholders anywhere on the line; preserve rest of text; avoid 'outpatient'."""
    if raw_text is None or (isinstance(raw_text, float) and pd.isna(raw_text)):
        return ""
    text = str(raw_text)

    # Direct placeholders (flexible spaces & underscores)
    text = re.sub(r'(?i)(\bName:\s*)_{2,}', r'\1' + patient_name, text)
    text = re.sub(r'(?i)\bAdmission\s+Date:\s*_{2,}', f'Admission Date: {admit_str}', text)
    text = re.sub(r'(?i)\bDischarge\s+Date:\s*_{2,}', f'Discharge Date: {discharge_str}', text)
    text = re.sub(r'(?i)\bDate\s+of\s+Birth:\s*_{2,}', f'Date of Birth: {dob_str}', text)
    text = re.sub(r'(?i)\bAttending:\s*_{2,}\.?', f'Attending: {attending_name}.', text)

    # Naturalize "the patient"/"patient" -> name (but not "outpatient")
    text = re.sub(r'(?i)\bthe\s+patient\b', patient_name, text)
    text = re.sub(r'(?i)(?<!out)\bpatient\b', patient_name, text)

    return text

def add_text_block_preserve_newlines(doc: Document, text: str):
    for line in text.splitlines():
        doc.add_paragraph(line)

# ---------------- SELECT SUBJECTS (>=3 notes) ----------------
eligible = DATA.groupby('subject_id').filter(lambda g: len(g) >= NOTES_PER_PATIENT)
subjects = eligible['subject_id'].drop_duplicates()
if subjects.empty:
    raise ValueError("No subject_id appears the required number of times.")
chosen_subjects = subjects.sample(n=min(N_PATIENTS, len(subjects))).tolist()

# ---------------- EXPORT ----------------
OUTPUT_DIR.mkdir(exist_ok=True)
created = 0

for sid in chosen_subjects:
    patient_name = patient_name_for_subject(sid)
    attending_name = attending_for_subject(sid)
    dob_dt = dob_for_subject(sid)
    dob_str = dob_dt.strftime(DATE_FMT)

    patient_notes = eligible[eligible['subject_id'] == sid]
    top_notes = take_chronological(patient_notes, NOTES_PER_PATIENT)

    patient_dir = OUTPUT_DIR / f"{safe_name(patient_name)}_{safe_name(sid)}" if MAKE_SUBFOLDERS_PER_PATIENT else OUTPUT_DIR
    patient_dir.mkdir(parents=True, exist_ok=True)

    for _, row in top_notes.iterrows():
        admit_dt, discharge_dt = admit_discharge_for_row_in_year(row, TARGET_YEAR)
        admit_str = admit_dt.strftime(DATE_FMT)
        discharge_str = discharge_dt.strftime(DATE_FMT)

        # Build .docx
        doc = Document()
        doc.add_heading(patient_name, level=1)
        p = doc.add_paragraph(); p.add_run("Subject ID: ").bold = True; p.add_run(str(row['subject_id']))
        p = doc.add_paragraph(); p.add_run("HADM ID: ").bold = True;    p.add_run(str(row['hadm_id']))
        p = doc.add_paragraph(); p.add_run("Note ID: ").bold = True;    p.add_run(str(row['note_id']))
        if date_col and pd.notna(row.get(date_col)):
            p = doc.add_paragraph(); p.add_run("Note Timestamp: ").bold = True; p.add_run(str(row[date_col]))

        doc.add_paragraph("")
        doc.add_heading("Text", level=2)

        final_text = personalize_text(row['text'], patient_name, dob_str, attending_name, admit_str, discharge_str)
        add_text_block_preserve_newlines(doc, final_text)

        fname = patient_dir / f"{safe_name(patient_name)}_note_{safe_name(row['note_id'])}.docx"
        doc.save(fname)
        created += 1

print(f"✅ Exported {created} documents in {OUTPUT_DIR.resolve()}")
print("Patients selected (subject_id → name | attending | DOB):")
for sid in chosen_subjects:
    print(f"  {sid} → {patient_name_for_subject(sid)} | {attending_for_subject(sid)} | {dob_for_subject(sid).strftime(DATE_FMT)}")

✅ Exported 15 documents in /home/maui/Dev/Text-Analytics-Final-Project/Documents
Patients selected (subject_id → name | attending | DOB):
  10001401 → Casey Gray | Dr. Rowan Singh | 10/29/1966
  10016742 → Morgan Foster | Dr. Reagan Andrews | 07/14/2003
  10022373 → Peyton Brooks | Dr. Rowan Harmon | 11/22/1954
  10021348 → Sam Carter | Dr. Ari Andrews | 11/03/2005
  10001884 → Riley Miller | Dr. Sage Barnes | 11/22/1943
