In [1]:
import json
import os
import sys
import re
import unicodedata
import pandas as pd
from pathlib import Path
from typing import List, Dict
import argparse

In [2]:
# Readin
df = pd.read_csv('MedSynth_huggingface_final.csv')
df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc
0,**1. Subjective:**\n\n **Chief Complaint (CC...,[doctor]: Hello! It’s good to see you today. H...,M25562,PAIN IN LEFT KNEE
1,**1. Subjective:**\n\n - **Chief Complaint (...,"[doctor] Hi there, how are you today?\n\n[pati...",M25562,PAIN IN LEFT KNEE
2,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor] Good morning, how are you doing today...",M25562,PAIN IN LEFT KNEE
3,**1. Subjective:**\n\n**Chief Complaint (CC):*...,[doctor] Good morning! How are you feeling tod...,M25562,PAIN IN LEFT KNEE
4,#####\n**1. Subjective:**\n\n**Chief Complaint...,"[doctor]: Hello Mr. Doe, how are you doing tod...",M25562,PAIN IN LEFT KNEE
...,...,...,...,...
10235,#####\n**1. Subjective:**\n \n**Chief Compla...,[doctor]: Good morning. How are you doing toda...,B3781,CANDIDAL ESOPHAGITIS
10236,### Gastroenterologist Medical Note\n\n#### 1....,"**Doctor:** Hi there, how are you doing today?...",B3781,CANDIDAL ESOPHAGITIS
10237,**1. Subjective:**\n\n**Chief Complaint (CC):*...,"[doctor]: Hi Mr. Harris, how are you doing tod...",B3781,CANDIDAL ESOPHAGITIS
10238,#####\n**1. Subjective:**\n**Chief Complaint (...,"[doctor]: Good morning, Ms. Lee. How are you d...",B3781,CANDIDAL ESOPHAGITIS


In [3]:
# Harmonizing UTF characters
def clean_string(s):
    if not isinstance(s, str):
        return s

    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r"[\u200b\u200c\u200d\ufeff]", "", s)
    s = re.sub(r"[\x00-\x1F\x7F]", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

df[["Note", "Dialogue"]] = df[["Note", "Dialogue"]].map(clean_string)
df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc
0,**1. Subjective:** **Chief Complaint (CC):** -...,[doctor]: Hello! It’s good to see you today. H...,M25562,PAIN IN LEFT KNEE
1,**1. Subjective:** - **Chief Complaint (CC):**...,"[doctor] Hi there, how are you today?[patient]...",M25562,PAIN IN LEFT KNEE
2,**1. Subjective:****Chief Complaint (CC):**Sev...,"[doctor] Good morning, how are you doing today...",M25562,PAIN IN LEFT KNEE
3,**1. Subjective:****Chief Complaint (CC):** Mo...,[doctor] Good morning! How are you feeling tod...,M25562,PAIN IN LEFT KNEE
4,#####**1. Subjective:****Chief Complaint (CC):...,"[doctor]: Hello Mr. Doe, how are you doing tod...",M25562,PAIN IN LEFT KNEE
...,...,...,...,...
10235,#####**1. Subjective:** **Chief Complaint (CC)...,[doctor]: Good morning. How are you doing toda...,B3781,CANDIDAL ESOPHAGITIS
10236,### Gastroenterologist Medical Note#### 1. Sub...,"**Doctor:** Hi there, how are you doing today?...",B3781,CANDIDAL ESOPHAGITIS
10237,**1. Subjective:****Chief Complaint (CC):**Dif...,"[doctor]: Hi Mr. Harris, how are you doing tod...",B3781,CANDIDAL ESOPHAGITIS
10238,#####**1. Subjective:****Chief Complaint (CC):...,"[doctor]: Good morning, Ms. Lee. How are you d...",B3781,CANDIDAL ESOPHAGITIS


In [4]:
# Erasing leading styling characters

s = df["Note"].astype("string")
df["Note"] = s.apply(lambda x: x[x.find("**"):] if isinstance(x, str) and "**" in x else x)
df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc
0,**1. Subjective:** **Chief Complaint (CC):** -...,[doctor]: Hello! It’s good to see you today. H...,M25562,PAIN IN LEFT KNEE
1,**1. Subjective:** - **Chief Complaint (CC):**...,"[doctor] Hi there, how are you today?[patient]...",M25562,PAIN IN LEFT KNEE
2,**1. Subjective:****Chief Complaint (CC):**Sev...,"[doctor] Good morning, how are you doing today...",M25562,PAIN IN LEFT KNEE
3,**1. Subjective:****Chief Complaint (CC):** Mo...,[doctor] Good morning! How are you feeling tod...,M25562,PAIN IN LEFT KNEE
4,**1. Subjective:****Chief Complaint (CC):** Mo...,"[doctor]: Hello Mr. Doe, how are you doing tod...",M25562,PAIN IN LEFT KNEE
...,...,...,...,...
10235,**1. Subjective:** **Chief Complaint (CC):**Di...,[doctor]: Good morning. How are you doing toda...,B3781,CANDIDAL ESOPHAGITIS
10236,**Chief Complaint (CC):** Difficulty swallowin...,"**Doctor:** Hi there, how are you doing today?...",B3781,CANDIDAL ESOPHAGITIS
10237,**1. Subjective:****Chief Complaint (CC):**Dif...,"[doctor]: Hi Mr. Harris, how are you doing tod...",B3781,CANDIDAL ESOPHAGITIS
10238,**1. Subjective:****Chief Complaint (CC):** Se...,"[doctor]: Good morning, Ms. Lee. How are you d...",B3781,CANDIDAL ESOPHAGITIS


In [5]:
# Drop NA (and one abnormative) values

df.isna().sum()
df = df.drop([10236])
df = df.dropna()
df = df.sort_values(["ICD10", "Note"])
df.reset_index(inplace=True)
df = df.drop(["index"], axis=1)
df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc
0,**1. Subjective:****Chief Complaint (CC):** Se...,[doctor]: Good morning. How are you feeling to...,A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE
1,**1. Subjective:****Chief Complaint (CC):**Fre...,"[doctor]: Good morning, Mrs. Doe. How are you ...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE
2,**1. Subjective:****Chief Complaint (CC):**Mod...,"[doctor]: Hi Mr. Lee, how are you doing today?...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE
3,**1. Subjective:****Chief Complaint (CC):**Wat...,"[doctor]: Good morning, how are you today?[pat...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE
4,**Subjective:****Chief Complaint (CC)**Severe ...,"[doctor]: Hi there, how are you doing today?[p...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE
...,...,...,...,...
10232,**1. Subjective:****Chief Complaint (CC):**Fol...,[doctor]: Good morning! It's nice to see you a...,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN
10233,**1. Subjective:****Chief Complaint (CC):**Per...,"```markdown[doctor] Hello, how are you doing t...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN
10234,**1. Subjective:****Chief Complaint (CC):**Sev...,[doctor] Good morning. How are you feeling tod...,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN
10235,**1. Subjective:****Chief Complaint (CC):**Sev...,"[doctor] Hi Mr. Lee, how are you doing today?[...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN


In [6]:
# Extract SOAP sections from the note text

def extract_soap_sections(note):
    flags = re.DOTALL | re.IGNORECASE

    # Flexible section-start anchors (optional **, optional numbering, case-insensitive)
    obj_start  = r'(?:\*{0,2}\s*(?:\d+[\.\)]\s*)?Objective\s*[\*:]{0,3})'
    assess_start = r'(?:\*{0,2}\s*(?:\d+[\.\)]\s*)?Assessment\s*[\*:]{0,3})'
    plan_start = r'(?:\*{0,2}\s*(?:\d+[\.\)]\s*)?Plan\s*[\*:]{0,3})'

    # --- Subjective ---
    subj_pat = (
        r'(?:\*{0,2}\s*(?:\d+[\.\)]\s*)?'
        r'(?:Subjective|Chief\s+Complaint(?:\s*\(CC\))?)\s*[\*:]{0,3}\s*)'
        r'(.+?)(?=' + obj_start + r'|$)'
    )
    m = re.search(subj_pat, note, flags)
    if not m:
        # Fallback: everything before the Objective header is Subjective
        m = re.search(r'^(.+?)(?=' + obj_start + r')', note, flags)
    subjective = m.group(1).strip() if m else ''

    # --- Objective ---
    obj_pat = obj_start + r'\s*(.+?)(?=' + assess_start + r'|$)'
    m = re.search(obj_pat, note, flags)
    objective = m.group(1).strip() if m else ''

    # --- Assessment ---
    assess_pat = assess_start + r'\s*(.+?)(?=' + plan_start + r'|$)'
    m = re.search(assess_pat, note, flags)
    assessment = m.group(1).strip() if m else ''

    # --- Plan ---
    plan_pat = plan_start + r'\s*(.+?)$'
    m = re.search(plan_pat, note, flags)
    plan = m.group(1).strip() if m else ''

    return {
        'Subjective': subjective,
        'Objective': objective,
        'Assessment': assessment,
        'Plan': plan,
    }

soap_sections = df['Note'].apply(extract_soap_sections).apply(pd.Series)
df = pd.concat([df, soap_sections], axis=1)

# Drop rows where any SOAP dimension failed to extract
before = len(df)
df = df[(df['Subjective'] != '') & (df['Objective'] != '') & (df['Assessment'] != '') & (df['Plan'] != '')]
df = df.reset_index(drop=True)
print(f"Dropped {before - len(df)} rows with empty SOAP sections ({len(df)} remaining)")

df

Dropped 17 rows with empty SOAP sections (10220 remaining)


Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc,Subjective,Objective,Assessment,Plan
0,**1. Subjective:****Chief Complaint (CC):** Se...,[doctor]: Good morning. How are you feeling to...,A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):** Severe abdominal pai...,**Vital Signs:**- Temperature: 100.8°F- Blood ...,**Diagnosis:** Enterocolitis due to Clostridiu...,"**Medication:**- Prescribe Vancomycin 125 mg, ..."
1,**1. Subjective:****Chief Complaint (CC):**Fre...,"[doctor]: Good morning, Mrs. Doe. How are you ...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Frequent watery diarr...,**Vital Signs:**- Temperature: 100.5 °F- Blood...,- Enterocolitis due to Clostridium difficile (...,"- **Medications:** - Vancomycin 125 mg orally,..."
2,**1. Subjective:****Chief Complaint (CC):**Mod...,"[doctor]: Hi Mr. Lee, how are you doing today?...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Moderate abdominal pa...,This section is limited in a telemedicine enco...,Diagnosis: Enterocolitis due to Clostridium di...,**Treatment:**- Prescribe Fidaxomicin 200 mg o...
3,**1. Subjective:****Chief Complaint (CC):**Wat...,"[doctor]: Good morning, how are you today?[pat...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Watery diarrhea and a...,- Physical Exam: Not applicable (telemedicine ...,- Enterocolitis due to Clostridium difficile (...,**Investigations/Test Results:**- Stool sample...
4,**Subjective:****Chief Complaint (CC)**Severe ...,"[doctor]: Hi there, how are you doing today?[p...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC)**Severe abdominal cramp...,**Vitals**- Temperature: 101.2 degrees F- Bloo...,1. Enterocolitis due to Clostridium difficile-...,1. Enterocolitis due to Clostridium difficile-...
...,...,...,...,...,...,...,...,...
10215,**1. Subjective:****Chief Complaint (CC):**Fol...,[doctor]: Good morning! It's nice to see you a...,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Follow-up for managem...,**Vital Signs:**- Blood Pressure: 138/82 mmHg-...,- **Chronic Obstructive Pulmonary Disease (COP...,- **Medications:** - Continue Albuterol inhale...
10216,**1. Subjective:****Chief Complaint (CC):**Per...,"```markdown[doctor] Hello, how are you doing t...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,"**Chief Complaint (CC):**Persistent dry cough,...",**Physical Exam:**- **General:** Appears fatig...,- Dependence on Supplemental Oxygen (ICD-10 Co...,- **Treatment Recommendations:** - Initiate su...
10217,**1. Subjective:****Chief Complaint (CC):**Sev...,[doctor] Good morning. How are you feeling tod...,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Severe breathlessness...,**Vital Signs:**- Blood Pressure: Not document...,The patient presents with severe breathlessnes...,Based on the current evaluation and pending di...
10218,**1. Subjective:****Chief Complaint (CC):**Sev...,"[doctor] Hi Mr. Lee, how are you doing today?[...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Severe breathlessness...,**Vital Signs:**- Blood Pressure: 140/85 mmHg-...,- **Primary Diagnosis:** Dependence on Supplem...,s.**Review of Systems (ROS):**- **General:** R...


In [7]:
def clean_asterisks(text):
    if not isinstance(text, str):
        return text
    # Match 3 or more leading asterisks and replace with exactly 2
    return re.sub(r'^\*{3,}', '**', text)

# Apply to the 4 SOAP columns
for col in ['Subjective', 'Objective', 'Assessment', 'Plan']:
    df[col] = df[col].apply(clean_asterisks)

df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc,Subjective,Objective,Assessment,Plan
0,**1. Subjective:****Chief Complaint (CC):** Se...,[doctor]: Good morning. How are you feeling to...,A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):** Severe abdominal pai...,**Vital Signs:**- Temperature: 100.8°F- Blood ...,**Diagnosis:** Enterocolitis due to Clostridiu...,"**Medication:**- Prescribe Vancomycin 125 mg, ..."
1,**1. Subjective:****Chief Complaint (CC):**Fre...,"[doctor]: Good morning, Mrs. Doe. How are you ...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Frequent watery diarr...,**Vital Signs:**- Temperature: 100.5 °F- Blood...,- Enterocolitis due to Clostridium difficile (...,"- **Medications:** - Vancomycin 125 mg orally,..."
2,**1. Subjective:****Chief Complaint (CC):**Mod...,"[doctor]: Hi Mr. Lee, how are you doing today?...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Moderate abdominal pa...,This section is limited in a telemedicine enco...,Diagnosis: Enterocolitis due to Clostridium di...,**Treatment:**- Prescribe Fidaxomicin 200 mg o...
3,**1. Subjective:****Chief Complaint (CC):**Wat...,"[doctor]: Good morning, how are you today?[pat...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Watery diarrhea and a...,- Physical Exam: Not applicable (telemedicine ...,- Enterocolitis due to Clostridium difficile (...,**Investigations/Test Results:**- Stool sample...
4,**Subjective:****Chief Complaint (CC)**Severe ...,"[doctor]: Hi there, how are you doing today?[p...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC)**Severe abdominal cramp...,**Vitals**- Temperature: 101.2 degrees F- Bloo...,1. Enterocolitis due to Clostridium difficile-...,1. Enterocolitis due to Clostridium difficile-...
...,...,...,...,...,...,...,...,...
10215,**1. Subjective:****Chief Complaint (CC):**Fol...,[doctor]: Good morning! It's nice to see you a...,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Follow-up for managem...,**Vital Signs:**- Blood Pressure: 138/82 mmHg-...,- **Chronic Obstructive Pulmonary Disease (COP...,- **Medications:** - Continue Albuterol inhale...
10216,**1. Subjective:****Chief Complaint (CC):**Per...,"```markdown[doctor] Hello, how are you doing t...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,"**Chief Complaint (CC):**Persistent dry cough,...",**Physical Exam:**- **General:** Appears fatig...,- Dependence on Supplemental Oxygen (ICD-10 Co...,- **Treatment Recommendations:** - Initiate su...
10217,**1. Subjective:****Chief Complaint (CC):**Sev...,[doctor] Good morning. How are you feeling tod...,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Severe breathlessness...,**Vital Signs:**- Blood Pressure: Not document...,The patient presents with severe breathlessnes...,Based on the current evaluation and pending di...
10218,**1. Subjective:****Chief Complaint (CC):**Sev...,"[doctor] Hi Mr. Lee, how are you doing today?[...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Severe breathlessness...,**Vital Signs:**- Blood Pressure: 140/85 mmHg-...,- **Primary Diagnosis:** Dependence on Supplem...,s.**Review of Systems (ROS):**- **General:** R...


In [8]:
# Determining under- and overrepresented ICD codes

counts = df['ICD10'].value_counts()
filtered = counts[counts != 5]

print(filtered)

ICD10
E784       10
E7849      10
N10        10
R312       10
R3129      10
R938       10
R9389      10
R972       10
R9720      10
Z9889      10
Z98890     10
B3781       4
D473        4
D485        4
G893        4
G894        4
H4011X1     4
I6350       4
L97512      4
M3214       4
M4856XA     4
N6320       4
O621        4
R42         4
S0081XA     4
S83512A     4
S8991XA     4
T82868A     4
Z21         4
Z793        4
Z8616       4
Name: count, dtype: int64


In [9]:
# Deleting rows with under- and overrepresented ICD codes

df = df[df['ICD10'].map(df['ICD10'].value_counts()) == 5]
df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc,Subjective,Objective,Assessment,Plan
0,**1. Subjective:****Chief Complaint (CC):** Se...,[doctor]: Good morning. How are you feeling to...,A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):** Severe abdominal pai...,**Vital Signs:**- Temperature: 100.8°F- Blood ...,**Diagnosis:** Enterocolitis due to Clostridiu...,"**Medication:**- Prescribe Vancomycin 125 mg, ..."
1,**1. Subjective:****Chief Complaint (CC):**Fre...,"[doctor]: Good morning, Mrs. Doe. How are you ...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Frequent watery diarr...,**Vital Signs:**- Temperature: 100.5 °F- Blood...,- Enterocolitis due to Clostridium difficile (...,"- **Medications:** - Vancomycin 125 mg orally,..."
2,**1. Subjective:****Chief Complaint (CC):**Mod...,"[doctor]: Hi Mr. Lee, how are you doing today?...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Moderate abdominal pa...,This section is limited in a telemedicine enco...,Diagnosis: Enterocolitis due to Clostridium di...,**Treatment:**- Prescribe Fidaxomicin 200 mg o...
3,**1. Subjective:****Chief Complaint (CC):**Wat...,"[doctor]: Good morning, how are you today?[pat...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Watery diarrhea and a...,- Physical Exam: Not applicable (telemedicine ...,- Enterocolitis due to Clostridium difficile (...,**Investigations/Test Results:**- Stool sample...
4,**Subjective:****Chief Complaint (CC)**Severe ...,"[doctor]: Hi there, how are you doing today?[p...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC)**Severe abdominal cramp...,**Vitals**- Temperature: 101.2 degrees F- Bloo...,1. Enterocolitis due to Clostridium difficile-...,1. Enterocolitis due to Clostridium difficile-...
...,...,...,...,...,...,...,...,...
10215,**1. Subjective:****Chief Complaint (CC):**Fol...,[doctor]: Good morning! It's nice to see you a...,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Follow-up for managem...,**Vital Signs:**- Blood Pressure: 138/82 mmHg-...,- **Chronic Obstructive Pulmonary Disease (COP...,- **Medications:** - Continue Albuterol inhale...
10216,**1. Subjective:****Chief Complaint (CC):**Per...,"```markdown[doctor] Hello, how are you doing t...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,"**Chief Complaint (CC):**Persistent dry cough,...",**Physical Exam:**- **General:** Appears fatig...,- Dependence on Supplemental Oxygen (ICD-10 Co...,- **Treatment Recommendations:** - Initiate su...
10217,**1. Subjective:****Chief Complaint (CC):**Sev...,[doctor] Good morning. How are you feeling tod...,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Severe breathlessness...,**Vital Signs:**- Blood Pressure: Not document...,The patient presents with severe breathlessnes...,Based on the current evaluation and pending di...
10218,**1. Subjective:****Chief Complaint (CC):**Sev...,"[doctor] Hi Mr. Lee, how are you doing today?[...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Severe breathlessness...,**Vital Signs:**- Blood Pressure: 140/85 mmHg-...,- **Primary Diagnosis:** Dependence on Supplem...,s.**Review of Systems (ROS):**- **General:** R...


In [10]:
# Train test split

g = df.groupby('ICD10')

train_df = g.nth([1,2,3,4]).reset_index()
train_df = train_df.drop(["index"], axis=1)
val_df  = g.nth(0).reset_index()
val_df = val_df.drop(["index"], axis=1)

In [11]:
train_df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc,Subjective,Objective,Assessment,Plan
0,**1. Subjective:****Chief Complaint (CC):**Fre...,"[doctor]: Good morning, Mrs. Doe. How are you ...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Frequent watery diarr...,**Vital Signs:**- Temperature: 100.5 °F- Blood...,- Enterocolitis due to Clostridium difficile (...,"- **Medications:** - Vancomycin 125 mg orally,..."
1,**1. Subjective:****Chief Complaint (CC):**Mod...,"[doctor]: Hi Mr. Lee, how are you doing today?...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Moderate abdominal pa...,This section is limited in a telemedicine enco...,Diagnosis: Enterocolitis due to Clostridium di...,**Treatment:**- Prescribe Fidaxomicin 200 mg o...
2,**1. Subjective:****Chief Complaint (CC):**Wat...,"[doctor]: Good morning, how are you today?[pat...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):**Watery diarrhea and a...,- Physical Exam: Not applicable (telemedicine ...,- Enterocolitis due to Clostridium difficile (...,**Investigations/Test Results:**- Stool sample...
3,**Subjective:****Chief Complaint (CC)**Severe ...,"[doctor]: Hi there, how are you doing today?[p...",A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC)**Severe abdominal cramp...,**Vitals**- Temperature: 101.2 degrees F- Bloo...,1. Enterocolitis due to Clostridium difficile-...,1. Enterocolitis due to Clostridium difficile-...
4,**1. Subjective:****Chief Complaint (CC):**- S...,"[doctor] Hi, how are you doing today?[patient]...",A0472,"ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE, NO...",**Chief Complaint (CC):**- Severe watery diarr...,**Vital Signs:**- Blood pressure: 128/76 mmHg-...,- Diagnosis: Enterocolitis due to Clostridium ...,1. **Medication:** - Prescribe fidaxomicin 200...
...,...,...,...,...,...,...,...,...
8019,**Subjective:****Chief Complaint (CC):**Fatigu...,"[doctor]: Hi there, I see you're here today fo...",Z992,DEPENDENCE ON RENAL DIALYSIS,**Chief Complaint (CC):**Fatigue and muscle cr...,- **Vital Signs:** Blood pressure: 140/85 mmHg...,1. **Dependence on Renal Dialysis (ICD-10 Z99....,1. **Dependence on Renal Dialysis:** - Continu...
8020,**1. Subjective:****Chief Complaint (CC):**Per...,"```markdown[doctor] Hello, how are you doing t...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,"**Chief Complaint (CC):**Persistent dry cough,...",**Physical Exam:**- **General:** Appears fatig...,- Dependence on Supplemental Oxygen (ICD-10 Co...,- **Treatment Recommendations:** - Initiate su...
8021,**1. Subjective:****Chief Complaint (CC):**Sev...,[doctor] Good morning. How are you feeling tod...,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Severe breathlessness...,**Vital Signs:**- Blood Pressure: Not document...,The patient presents with severe breathlessnes...,Based on the current evaluation and pending di...
8022,**1. Subjective:****Chief Complaint (CC):**Sev...,"[doctor] Hi Mr. Lee, how are you doing today?[...",Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN,**Chief Complaint (CC):**Severe breathlessness...,**Vital Signs:**- Blood Pressure: 140/85 mmHg-...,- **Primary Diagnosis:** Dependence on Supplem...,s.**Review of Systems (ROS):**- **General:** R...


In [12]:
val_df

Unnamed: 0,Note,Dialogue,ICD10,ICD10_desc,Subjective,Objective,Assessment,Plan
0,**1. Subjective:****Chief Complaint (CC):** Se...,[doctor]: Good morning. How are you feeling to...,A047,ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE,**Chief Complaint (CC):** Severe abdominal pai...,**Vital Signs:**- Temperature: 100.8°F- Blood ...,**Diagnosis:** Enterocolitis due to Clostridiu...,"**Medication:**- Prescribe Vancomycin 125 mg, ..."
1,**1. Subjective:****Chief Complaint (CC):** Wa...,"[doctor]: Hi Jane, I see you’re here today bec...",A0472,"ENTEROCOLITIS DUE TO CLOSTRIDIUM DIFFICILE, NO...","**Chief Complaint (CC):** Watery diarrhea, abd...",**VITALS:**- BP: 125/82 mmHg- HR: 90 bpm- Temp...,"Enterocolitis due to Clostridium difficile, no...","- **Medications:** - Prescribed Vancomycin, 12..."
2,**1. Subjective:****Chief Complaint (CC):** Ab...,"[doctor]: Hi there, I see you're not feeling w...",A084,"VIRAL INTESTINAL INFECTION, UNSPECIFIED",**Chief Complaint (CC):** Abdominal cramps and...,**Vital Signs:**- Temperature: 98.6°F- Blood P...,"Primary Diagnosis: Viral Intestinal Infection,...",1. **Treatment:** - Oral Rehydration Solution ...
3,**1. Subjective:****Chief Complaint (CC):** Se...,"[doctor]: Hi, how are you feeling today?[patie...",A09,"INFECTIOUS GASTROENTERITIS AND COLITIS, UNSPEC...",**Chief Complaint (CC):** Severe cramping abdo...,**Vital Signs:** - Temperature: 103°F - Blood ...,"- Infectious gastroenteritis and colitis, unsp...",**Treatment Recommendations:**- **Hydration:**...
4,**1. Subjective:** **Chief Complaint (CC):** S...,[doctor]: Good morning. How are you feeling to...,A4101,SEPSIS DUE TO METHICILLIN SUSCEPTIBLE STAPHYLO...,**Chief Complaint (CC):** Severe symptoms incl...,**General Appearance:** The patient appears ac...,**Primary Diagnosis:** - Sepsis due to methici...,**Treatment:** - Initiate Vancomycin 1g IV eve...
...,...,...,...,...,...,...,...,...
2001,**1. Subjective:** **Chief Complaint (CC):** -...,"[doctor]: Hello, it's nice to meet you. I see ...",Z9861,CORONARY ANGIOPLASTY STATUS,**Chief Complaint (CC):** - The patient report...,**Vital Signs:** - Blood Pressure: 135/85 mmHg...,- The patient is a 72-year-old male with a rec...,s to start light aerobic exercise; consumes al...
2002,**1. Subjective:** - Chief Complaint (CC): Pos...,"[doctor]: Hello, how are you today?[patient]: ...",Z9884,BARIATRIC SURGERY STATUS,- Chief Complaint (CC): Post-operative follow-...,- Vital Signs: - Blood Pressure: 128/82 mmHg -...,- Post-operative status following bariatric su...,- Continue adherence to post-surgical nutritio...
2003,**1. Subjective:**##### Chief Complaint (CC)De...,"[doctor]: Hello, how are you doing today?[pati...",Z9911,DEPENDENCE ON RESPIRATOR [VENTILATOR] STATUS,##### Chief Complaint (CC)Dependence on ventil...,"- Vital Signs: HR 80 bpm, BP 120/70 mmHg, SpO2...",- Primary diagnosis: Dependence on respirator ...,- Prescribed home mechanical ventilation with ...
2004,**1. Subjective:****Chief Complaint (CC):** In...,"[doctor]: Good morning, Ms. Doe. How are you f...",Z992,DEPENDENCE ON RENAL DIALYSIS,**Chief Complaint (CC):** Increased fatigue an...,**Vitals:**- Blood Pressure: 145/85 mmHg- Hear...,1. Chronic kidney disease (Stage 4)2. Polycyst...,**Initiate Treatment:**- Begin peritoneal dial...


In [None]:
# Preparing data for Single LLM finetuning

def format_training_example(dialogue: str, note: str, icd10: str, icd10_desc: str):
    system_prompt = """You are a medical documentation assistant. Your task is to convert patient-doctor consultation dialogues into structured SOAP notes (Subjective, Objective, Assessment, Plan) with appropriate ICD-10 diagnosis codes.

Generate a comprehensive SOAP note that includes:
1. Subjective: Chief complaint, history of present illness, review of systems
2. Objective: Vital signs, physical examination findings
3. Assessment: Diagnosis with ICD-10 code and description, differential diagnoses
4. Plan: Management, referrals, further testing, patient education

Stick to the following rules with absolute authority:
- Do not include anything into the SOAP note that is not present in the presented dialogue.
- Do not assume anything. Be deterministic and only take what is named in the text.
- If you cannot fill out something in the SOAP notes, write only [UNKNOWN] to the corresponding dimension or subdimension.
"""

    user_prompt = f"""Convert the following patient-doctor consultation dialogue into a structured SOAP note:

{dialogue}"""

    # Include ICD10 in the assistant response
    assistant_response = f"""{note}

**ICD-10 Code:** {icd10}
**Diagnosis:** {icd10_desc}"""

    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_response}
        ]
    }


def process_json_file(input_dataframe: object, output_file_name: str):
    data_records = []
    for idx in range(len(input_dataframe)):
        dialogue = input_dataframe['Dialogue'][idx]
        note = input_dataframe['Note'][idx]
        icd10 = input_dataframe['ICD10'][idx]
        icd10_desc = input_dataframe['ICD10_desc'][idx]

        formatted_example = format_training_example(dialogue, note, icd10, icd10_desc)
        data_records.append(formatted_example)

    # Save to JSONL format
    with open(output_file_name, 'w', encoding='utf-8') as f:
        for record in data_records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

single_train_df = train_df.drop(["Subjective", "Objective", "Assessment", "Plan"], axis=1)
single_val_df = val_df.drop(["Subjective", "Objective", "Assessment", "Plan"], axis=1)

process_json_file(single_train_df, "single_agent/data/training_single.jsonl")
process_json_file(single_val_df, "single_agent/data/validation_single.jsonl")

# Format fits Multi-Agent benchmark concept
process_json_file(single_val_df, "multi_agents/data/benchmark_multi_agents.jsonl")

# Format fits Swarm-Agent benchmark concept
process_json_file(single_val_df, "swarm_agents/data/benchmark_swarm_agents.jsonl")

# Format fits Base-Data benchmark
process_json_file(single_val_df, "base_data/benchmark_base_data.jsonl")

In [14]:
# Preparing data for Multi-Agent finetuning (per SOAP dimension)

SOAP_CONFIGS = {
    "subjective": {
        "column": "Subjective",
        "system_prompt": """You are a medical documentation assistant specializing in the Subjective section of SOAP notes. Your task is to extract the Subjective component from a patient-doctor consultation dialogue.

Generate a Subjective section that includes:
- Chief Complaint (CC)
- History of Present Illness (HPI)
- Review of Systems (ROS)
- Past Medical History, Family History, Social History where mentioned

Stick to the following rules with absolute authority:
- Do not include anything that is not present in the presented dialogue.
- Do not assume anything. Be deterministic and only take what is named in the text.
- If you cannot fill out a subdimension, write only [UNKNOWN] for that subdimension."""
    },
    "objective": {
        "column": "Objective",
        "system_prompt": """You are a medical documentation assistant specializing in the Objective section of SOAP notes. Your task is to extract the Objective component from a patient-doctor consultation dialogue.

Generate an Objective section that includes:
- Vital Signs
- Physical Examination findings
- Laboratory and diagnostic test results where mentioned

Stick to the following rules with absolute authority:
- Do not include anything that is not present in the presented dialogue.
- Do not assume anything. Be deterministic and only take what is named in the text.
- If you cannot fill out a subdimension, write only [UNKNOWN] for that subdimension."""
    },
    "assessment": {
        "column": "Assessment",
        "system_prompt": """You are a medical documentation assistant specializing in the Assessment section of SOAP notes. Your task is to extract the Assessment component from a patient-doctor consultation dialogue.

Generate an Assessment section that includes:
- Primary diagnosis with ICD-10 code and description
- Differential diagnoses where discussed
- Clinical reasoning and justification

Stick to the following rules with absolute authority:
- Do not include anything that is not present in the presented dialogue.
- Do not assume anything. Be deterministic and only take what is named in the text.
- If you cannot fill out a subdimension, write only [UNKNOWN] for that subdimension."""
    },
    "plan": {
        "column": "Plan",
        "system_prompt": """You are a medical documentation assistant specializing in the Plan section of SOAP notes. Your task is to extract the Plan component from a patient-doctor consultation dialogue.

Generate a Plan section that includes:
- Medications and treatments
- Referrals and follow-up appointments
- Further testing or investigations
- Patient education and instructions

Stick to the following rules with absolute authority:
- Do not include anything that is not present in the presented dialogue.
- Do not assume anything. Be deterministic and only take what is named in the text.
- If you cannot fill out a subdimension, write only [UNKNOWN] for that subdimension."""
    },
}


def format_soap_example(dialogue: str, soap_text: str, dimension: str):
    config = SOAP_CONFIGS[dimension]
    return {
        "messages": [
            {"role": "system", "content": config["system_prompt"]},
            {"role": "user", "content": f"Extract the {dimension.capitalize()} section from the following patient-doctor consultation dialogue:\n\n{dialogue}"},
            {"role": "assistant", "content": soap_text},
        ]
    }


def process_soap_dimension(df: pd.DataFrame, dimension: str, output_file: str):
    col = SOAP_CONFIGS[dimension]["column"]
    records = []
    for idx in range(len(df)):
        dialogue = df["Dialogue"].iloc[idx]
        soap_text = df[col].iloc[idx]
        if not isinstance(soap_text, str) or soap_text.strip() == "":
            continue
        records.append(format_soap_example(dialogue, soap_text, dimension))

    with open(output_file, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"{dimension}: wrote {len(records)} examples to {output_file}")


multi_train_df = train_df.drop(["Note"], axis=1)
multi_val_df = val_df.drop(["Note"], axis=1)

for dim in SOAP_CONFIGS:
    process_soap_dimension(multi_train_df, dim, f"multi_agents/data/training/training_{dim}.jsonl")
    process_soap_dimension(multi_val_df, dim, f"multi_agents/data/validation/validation_{dim}.jsonl")

subjective: wrote 8024 examples to multi_agents/data/training/training_subjective.jsonl
subjective: wrote 2006 examples to multi_agents/data/validation/validation_subjective.jsonl
objective: wrote 8024 examples to multi_agents/data/training/training_objective.jsonl
objective: wrote 2006 examples to multi_agents/data/validation/validation_objective.jsonl
assessment: wrote 8024 examples to multi_agents/data/training/training_assessment.jsonl
assessment: wrote 2006 examples to multi_agents/data/validation/validation_assessment.jsonl
plan: wrote 8024 examples to multi_agents/data/training/training_plan.jsonl
plan: wrote 2006 examples to multi_agents/data/validation/validation_plan.jsonl


In [13]:
"""
Preparing data for Swarm Agent finetuning (Draft-Critique-Refine per SOAP dimension)

Architecture: 3 agents per SOAP dimension x 4 dimensions = 12 fine-tuned adapters
  Drafter  ->  extracts initial SOAP section from the consultation dialogue
  Critic   ->  reviews the draft against the source dialogue, identifies issues
  Refiner  ->  integrates critique feedback into a definitive final section

Training data for the Critic and Refiner is bootstrapped via ICD-code
cross-pairing: each sample is paired with a different example that shares the
same ICD code, whose SOAP section serves as a synthetic "imperfect draft".
Programmatic critiques are then generated by comparing draft tokens against
the gold standard and the source dialogue.  These synthetic critiques are
approximate -- their primary purpose is to teach the model the critique *format*
and *task*; the fine-tuned model's pre-trained reasoning will refine quality.
"""

import random
from collections import defaultdict

# --- Dimension configurations --------------------------------------------------

SWARM_DIMS = {
    "subjective": {
        "column": "Subjective",
        "includes": (
            "- Chief Complaint (CC)\n"
            "- History of Present Illness (HPI)\n"
            "- Review of Systems (ROS)\n"
            "- Past Medical History, Family History, Social History where mentioned"
        ),
    },
    "objective": {
        "column": "Objective",
        "includes": (
            "- Vital Signs\n"
            "- Physical Examination findings\n"
            "- Laboratory and diagnostic test results where mentioned"
        ),
    },
    "assessment": {
        "column": "Assessment",
        "includes": (
            "- Primary diagnosis with ICD-10 code and description\n"
            "- Differential diagnoses where discussed\n"
            "- Clinical reasoning and justification"
        ),
    },
    "plan": {
        "column": "Plan",
        "includes": (
            "- Medications and treatments\n"
            "- Referrals and follow-up appointments\n"
            "- Further testing or investigations\n"
            "- Patient education and instructions"
        ),
    },
}

RULES_BLOCK = (
    "Stick to the following rules with absolute authority:\n"
    "- Do not include anything that is not present in the presented dialogue.\n"
    "- Do not assume anything. Be deterministic and only take what is named in the text.\n"
    "- If you cannot fill out a subdimension, write only [UNKNOWN] for that subdimension."
)


# --- System-prompt builders ----------------------------------------------------

def drafter_system(dim):
    d = SWARM_DIMS[dim]
    return (
        f"You are a medical documentation specialist responsible for drafting "
        f"the {dim.capitalize()} section of SOAP notes. Your task is to extract "
        f"and draft the {dim.capitalize()} component from a patient-doctor "
        f"consultation dialogue.\n\n"
        f"Generate a {dim.capitalize()} section that includes:\n"
        f"{d['includes']}\n\n"
        f"{RULES_BLOCK}"
    )


def critic_system(dim):
    return (
        f"You are a clinical documentation quality reviewer specializing in the "
        f"{dim.capitalize()} section of SOAP notes. Your task is to review a "
        f"draft against the source patient-doctor consultation dialogue and "
        f"provide a structured critique.\n\n"
        f"Evaluate the draft for:\n"
        f"1. HALLUCINATIONS: Information in the draft NOT supported by the dialogue\n"
        f"2. OMISSIONS: Key information from the dialogue MISSING in the draft\n"
        f"3. CLINICAL ACCURACY: Incorrect medical terminology or reasoning\n"
        f"4. FORMATTING: Structure and completeness issues\n\n"
        f"For each category, list specific findings. If no issues are found in a "
        f"category, state \"None identified.\"\n"
        f"End with VERDICT: GOOD | NEEDS_REVISION | POOR"
    )


def refiner_system(dim):
    return (
        f"You are a clinical documentation specialist performing final revision "
        f"of the {dim.capitalize()} section of SOAP notes. Given the source "
        f"consultation dialogue, an initial draft, and a peer critique, produce "
        f"the definitive {dim.capitalize()} section.\n\n"
        f"Incorporate the critique feedback to:\n"
        f"- Remove any hallucinated information not supported by the dialogue\n"
        f"- Add identified omissions from the dialogue\n"
        f"- Correct clinical terminology where needed\n"
        f"- Ensure proper structure and formatting\n\n"
        f"{RULES_BLOCK}"
    )


# --- Cross-pairing for synthetic imperfect drafts -----------------------------

def build_cross_pairs(df):
    """Pair each row with another row sharing the same ICD-10 code (circular
    shift within each code group).  Singletons are randomly paired with one
    another as a fallback."""
    icd_groups = defaultdict(list)
    for idx in range(len(df)):
        icd_groups[df["ICD10"].iloc[idx]].append(idx)

    pairs = []
    singletons = []

    for indices in icd_groups.values():
        if len(indices) < 2:
            singletons.extend(indices)
            continue
        for i, idx in enumerate(indices):
            pairs.append((idx, indices[(i + 1) % len(indices)]))

    # Fallback: pair singletons with each other
    random.shuffle(singletons)
    for i in range(0, len(singletons) - 1, 2):
        pairs.append((singletons[i], singletons[i + 1]))
        pairs.append((singletons[i + 1], singletons[i]))

    return pairs


# --- Synthetic critique generation ---------------------------------------------

def _medical_tokens(text):
    """Extract tokens likely to carry clinical or factual meaning."""
    toks = set(re.findall(r"\b[\w./-]+\b", text.lower()))
    return {t for t in toks if len(t) > 3 or any(c.isdigit() for c in t)}


def generate_synthetic_critique(dialogue, draft, gold, dim):
    draft_tok = _medical_tokens(draft)
    gold_tok  = _medical_tokens(gold)
    dial_tok  = _medical_tokens(dialogue)

    hallucinations = sorted(draft_tok - dial_tok)[:8]
    omissions      = sorted((gold_tok & dial_tok) - draft_tok)[:8]

    overlap = len(draft_tok & gold_tok) / max(len(gold_tok), 1)
    verdict = "GOOD" if overlap > 0.7 else "NEEDS_REVISION" if overlap > 0.4 else "POOR"

    hall_str = ", ".join(hallucinations) if hallucinations else "None identified."
    omit_str = ", ".join(omissions) if omissions else "None identified."
    acc_str  = (
        "Clinical terminology and reasoning are consistent with the dialogue."
        if verdict == "GOOD" else
        "Terminology and clinical reasoning require review given the "
        "discrepancies noted above."
    )

    return (
        f"1. HALLUCINATIONS: {hall_str}\n\n"
        f"2. OMISSIONS: {omit_str}\n\n"
        f"3. CLINICAL ACCURACY: {acc_str}\n\n"
        f"4. FORMATTING: Structure follows standard "
        f"{dim.capitalize()} section format.\n\n"
        f"VERDICT: {verdict}"
    )


# --- Formatting helpers --------------------------------------------------------

def format_drafter(dialogue, soap_text, dim):
    return {"messages": [
        {"role": "system",    "content": drafter_system(dim)},
        {"role": "user",      "content": f"Draft the {dim.capitalize()} section from the following consultation dialogue:\n\n{dialogue}"},
        {"role": "assistant", "content": soap_text},
    ]}


def format_critic(dialogue, draft, critique, dim):
    return {"messages": [
        {"role": "system", "content": critic_system(dim)},
        {"role": "user",   "content": (
            f"Review the following draft {dim.capitalize()} section against "
            f"the source consultation dialogue.\n\n"
            f"Source consultation dialogue:\n{dialogue}\n\n"
            f"Draft {dim.capitalize()} section to review:\n{draft}"
        )},
        {"role": "assistant", "content": critique},
    ]}


def format_refiner(dialogue, draft, critique, gold, dim):
    return {"messages": [
        {"role": "system", "content": refiner_system(dim)},
        {"role": "user",   "content": (
            f"Produce the final revised {dim.capitalize()} section.\n\n"
            f"Source consultation dialogue:\n{dialogue}\n\n"
            f"Initial draft:\n{draft}\n\n"
            f"Peer critique:\n{critique}"
        )},
        {"role": "assistant", "content": gold},
    ]}


# --- Write JSONL ---------------------------------------------------------------

def _write_jsonl(records, path):
    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")


def process_swarm_data(df, split, out_dir):
    df = df.reset_index(drop=True)
    cross_pairs = build_cross_pairs(df)

    for dim, cfg in SWARM_DIMS.items():
        col = cfg["column"]

        # -- Drafter --
        drafter_recs = []
        for idx in range(len(df)):
            soap = df[col].iloc[idx]
            if not isinstance(soap, str) or soap.strip() == "":
                continue
            drafter_recs.append(format_drafter(df["Dialogue"].iloc[idx], soap, dim))

        _write_jsonl(drafter_recs, f"{out_dir}/{split}_drafter_{dim}.jsonl")

        # -- Critic + Refiner (from cross-pairs) --
        critic_recs, refiner_recs = [], []
        for idx, cross_idx in cross_pairs:
            dialogue = df["Dialogue"].iloc[idx]
            gold     = df[col].iloc[idx]
            draft    = df[col].iloc[cross_idx]
            if not isinstance(gold, str) or not isinstance(draft, str):
                continue
            if gold.strip() == "" or draft.strip() == "":
                continue

            critique = generate_synthetic_critique(dialogue, draft, gold, dim)
            critic_recs.append(format_critic(dialogue, draft, critique, dim))
            refiner_recs.append(format_refiner(dialogue, draft, critique, gold, dim))

        _write_jsonl(critic_recs,  f"{out_dir}/{split}_critic_{dim}.jsonl")
        _write_jsonl(refiner_recs, f"{out_dir}/{split}_refiner_{dim}.jsonl")

        print(f"  {dim:<12s}  drafter={len(drafter_recs):>5}  "
              f"critic={len(critic_recs):>5}  refiner={len(refiner_recs):>5}")


# --- Generate all swarm training and validation data ---------------------------

random.seed(42)

swarm_train_df = train_df.drop(["Note"], axis=1)
swarm_val_df = val_df.drop(["Note"], axis=1)

os.makedirs("swarm_agents/data/training",  exist_ok=True)
os.makedirs("swarm_agents/data/validation", exist_ok=True)

print("Training data:")
process_swarm_data(swarm_train_df, "training", "swarm_agents/data/training")

print("\nValidation data:")
process_swarm_data(swarm_val_df, "validation", "swarm_agents/data/validation")

Training data:
  subjective    drafter= 8024  critic= 8024  refiner= 8024
  objective     drafter= 8024  critic= 8024  refiner= 8024
  assessment    drafter= 8024  critic= 8024  refiner= 8024
  plan          drafter= 8024  critic= 8024  refiner= 8024

Validation data:
  subjective    drafter= 2006  critic= 2006  refiner= 2006
  objective     drafter= 2006  critic= 2006  refiner= 2006
  assessment    drafter= 2006  critic= 2006  refiner= 2006
  plan          drafter= 2006  critic= 2006  refiner= 2006


In [None]:
# Export as CSV
# train_df.to_csv("training_data_llm.csv")
# val_df.to_csv("validation_data_llm.csv")

In [None]:
# Export as JSON
# train_df.to_json("training_data_llm.json", orient="records")
# val_df.to_json("validation_data_llm.json", orient="records")