In [1]:
from langsmith import Client
import os
import getpass
from langsmith_utils import extract_and_save_all_child_runs_by_project
from src.settings import get_settings

# Project-wide configuration used by every cell below.
settings = get_settings()
BASE_DIR = settings.base_dir
EHRXQA_CFG = settings.ehrxqa
BASE_NAME = "xmode-vqa-m3ae-star-100-en-gpt_4o-with-intent"


def _set_if_undefined(var: str):
    """Prompt (without echo) for environment variable ``var`` when it is unset or empty.

    Args:
        var: Name of the environment variable to populate.
    """
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"Please provide your {var}")


# Credentials have to be exported *before* the LangSmith client is created:
# Client() picks up its API key from the environment at construction time, so
# building it first would leave it without a key on a fresh kernel.
_set_if_undefined("OPENAI_API_KEY")
_set_if_undefined("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"

client = Client()



In [2]:
from langsmith_utils import extract_plan_and_details
from pathlib import Path
import json
from tqdm import tqdm

def save_reduced_details(project_name, data_path=None):
    """Reduce every extracted run of a project and save the results as JSON.

    Each ``<n>.json`` file in ``<data_path>/<project_name>-details`` is loaded,
    passed through ``extract_plan_and_details`` and written to
    ``<data_path>/<project_name>-details-reduced/<n>.json``. The list of all
    reduced results is additionally written to
    ``<data_path>/<project_name>-details-reduced.json``.

    Args:
        project_name: Name prefix of the input folder and of both outputs.
        data_path: Directory containing the ``-details`` folder. Falls back to
            ``EHRXQA_CFG.output_dir`` when omitted or falsy.

    Raises:
        AssertionError: If the ``-details`` input folder does not exist.
    """
    data_path = Path(data_path) if data_path else EHRXQA_CFG.output_dir
    test_run_folder = data_path / f"{project_name}-details"
    assert test_run_folder.exists(), f"{test_run_folder} does not exist"
    reduced_run_folder = data_path / f"{project_name}-details-reduced"
    reduced_run_folder.mkdir(parents=True, exist_ok=True)

    # glob() yields files in no guaranteed order; process them by run number so
    # the combined output file is identical from one execution to the next.
    numbered_files = []
    for test_run_file in test_run_folder.glob("*.json"):
        try:
            numbered_files.append((int(test_run_file.stem), test_run_file))
        except ValueError:
            # The file name is not "<integer>.json", so it is not a run dump.
            print(f"Skipping {test_run_file.name}")
    numbered_files.sort(key=lambda item: (item[0], item[1].name))

    all_plans = []
    for i, test_run_file in tqdm(numbered_files):
        try:
            with test_run_file.open("r") as f:
                data = json.load(f)
            _result = extract_plan_and_details(data)
            reduced_run_path = reduced_run_folder / f"{i}.json"
            with reduced_run_path.open("w") as f:
                json.dump(_result, f, indent=2)
        except ValueError as exc:
            # Still best-effort, but say why: json.JSONDecodeError is a
            # ValueError, so a corrupt dump would otherwise look exactly like
            # an ignorable non-numeric file name.
            print(f"Skipping {test_run_file.name}: {exc}")
            continue
        all_plans.append(_result)

    output_path = reduced_run_folder.parent / f"{project_name}-details-reduced.json"
    with output_path.open("w") as f:
        json.dump(all_plans, f, indent=2)
    print(f"Saved to {output_path}")



In [3]:
# Select the project to process; the same name is exported as LANGCHAIN_PROJECT.
project_name = "xmode-vqa-gpt_4o-english-13"
os.environ["LANGCHAIN_PROJECT"] = project_name

In [4]:
# Extraction step, left disabled here:
# extract_and_save_all_child_runs_by_project(project_name)
# Reduce the already extracted runs of the selected project.
save_reduced_details(project_name=project_name)



  0%|          | 0/13 [00:00<?, ?it/s]

100%|██████████| 13/13 [00:00<00:00, 40.22it/s]

Saved to experiments/xmode/en/xmode-vqa-gpt_4o-english-13-details-reduced.json





In [5]:
# extract_and_save_all_child_runs_by_project("xmode-vqa-gpt_4o-english-52")
# Name the 52-question project explicitly: the recorded output of this cell was
# produced for it, while `project_name` as defined above still refers to the
# 13-question project, so a fresh "Run All" would otherwise just redo that one
# and never create the "-52" file that is loaded further down.
save_reduced_details("xmode-vqa-gpt_4o-english-52")



100%|██████████| 52/52 [00:01<00:00, 42.37it/s]

Saved to experiments/xmode/en/xmode-vqa-gpt_4o-english-52-details-reduced.json





In [6]:
import pandas as pd
import json
from pathlib import Path

# Combine the reduced records of both projects and load the evaluation sheet.
data_path = EHRXQA_CFG.output_dir
json_file_1 = data_path / "xmode-vqa-gpt_4o-english-13-details-reduced.json"
json_file_2 = data_path / "xmode-vqa-gpt_4o-english-52-details-reduced.json"
data_1, data_2 = (
    json.loads(reduced_file.read_text())
    for reduced_file in (json_file_1, json_file_2)
)
data = data_1 + data_2

source_xlxs = BASE_DIR / "eval_ehr_100_samples.xlsx"
df = pd.read_excel(source_xlxs)


In [17]:
# Spot-check: question text stored at row position 60 of the evaluation sheet.
df["question"].iloc[60]

'was patient 12724975 diagnosed with hypoxemia until 1 year ago and did a chest x-ray reveal any tubes/lines in the abdomen during the same period?'

In [18]:
# Spot-check: question of the first extracted record, to compare with the sheet row above.
data[0]["question"]

'was patient 12724975 diagnosed with hypoxemia until 1 year ago and did a chest x-ray reveal any tubes/lines in the abdomen during the same period?'

In [19]:
# Spot-check: prediction stored in the evaluation sheet at row position 60.
df["prediction"].iloc[60]

"[{'Summary': 'Patient 12724975 was diagnosed with hypoxemia until about two years ago and no tubes/lines were detected in the abdomen during chest x-rays in the last year.', 'details': 'The patient had a diagnosis of hypoxemia recorded on 2103-12-27. Chest x-ray analysis from the last year (2104-12-31 to 2105-12-31) revealed no tubes or lines in the abdomen.', 'source': 'Diagnosis records and chest x-ray image analysis.', 'inference': 'no', 'extra explanation': 'The diagnosis occurred two years ago, not within the last year. No tubes or lines were observed in the x-rays.'}]"

In [7]:
# Tabulate the extracted records and give their columns the names used in the
# evaluation sheet.
json_df = pd.DataFrame(data).rename(
    columns={
        "plans": "Plans (updated with SQL)",
        "predictions": "prediction",
    }
)
json_df.columns


Index(['Plans (updated with SQL)', 'prediction', 'question'], dtype='object')

In [8]:
# Copy the extracted plans and predictions into the evaluation sheet, matching
# rows on the exact question text.
unmatched_questions = []
for index, row in json_df.iterrows():
    question = row["question"]
    # Compute the row mask once and reuse it for both columns.
    matching_rows = df["question"] == question
    if not matching_rows.any():
        # Remember questions that update nothing so the mismatch is visible
        # instead of silently leaving stale values in the sheet.
        unmatched_questions.append(question)

    df.loc[matching_rows, "Plans (updated with SQL)"] = str(row["Plans (updated with SQL)"])
    df.loc[matching_rows, "prediction"] = str(row["prediction"])

if unmatched_questions:
    print(f"{len(unmatched_questions)} extracted question(s) not found in the sheet:")
    for question in unmatched_questions:
        print(f"  - {question}")

output_xlsx = BASE_DIR / "eval_ehr_100_samples_updated.xlsx"
df.to_excel(output_xlsx, index=False)


In [9]:
# Tag every extracted record with the sheet id of its question and write one
# JSON file per id.

# One lookup for the whole frame; the first sheet row wins for a repeated
# question, which is what taking `.values[0]` of the matches would select.
first_id_by_question = df.drop_duplicates(subset="question").set_index("question")["id"]
json_df["id"] = json_df["question"].map(first_id_by_question)

# Fail with the offending questions rather than with a bare IndexError.
missing_id = json_df["id"].isna()
if missing_id.any():
    raise LookupError(
        "No matching row in df for question(s): "
        f"{json_df.loc[missing_id, 'question'].tolist()}"
    )
json_df["id"] = json_df["id"].astype(int)

json_data = json_df.to_dict(orient="records")

json_folder = EHRXQA_CFG.output_dir / "xmode-vqa-gpt_4o-english-13-details-reduced-tagged"
json_folder.mkdir(parents=True, exist_ok=True)
for d in json_data:
    i = d["id"]
    with (json_folder / f"{i}.json").open("w") as f:
        json.dump(d, f, indent=2)


In [55]:
# The records built from json_df store the question id under "id"; there is no
# "question_id" key, so that lookup raises KeyError on a fresh run.
json_data[0]["id"]

0

In [7]:
from pathlib import Path
import json
from tqdm import tqdm
import pandas as pd

# Tag every reduced langsmith extract with the dataset id of its question and
# store it as "<id>.json" in the "-tagged" folder.
data_path = EHRXQA_CFG.dataset_file
eval_folder = EHRXQA_CFG.output_dir / f"{BASE_NAME}-langsmith-extract-details-reduced"
assert eval_folder.exists(), f"{eval_folder} does not exist"
output_folder = EHRXQA_CFG.output_dir / f"{BASE_NAME}-langsmith-extract-details-reduced-tagged"
output_folder.mkdir(parents=True, exist_ok=True)

with data_path.open("r") as f:
    data = json.load(f)
df = pd.DataFrame(data)

# Build the question -> id lookup once instead of scanning the frame for every
# file. setdefault keeps the first occurrence of a repeated question, matching
# what `.values[0]` of the filtered frame would return.
id_by_question = {}
for known_question, known_id in zip(df["question"], df["id"]):
    id_by_question.setdefault(known_question, known_id)

# Sorted so that the processing order does not depend on the file system.
for eval_file in tqdm(sorted(eval_folder.glob("*.json"))):
    with eval_file.open("r") as f:
        eval_data = json.load(f)
    question = eval_data["question"]
    if question not in id_by_question:
        # Name the file and question instead of failing with a bare IndexError.
        raise LookupError(
            f"{eval_file.name}: question not found in {data_path}: {question!r}"
        )
    question_id = id_by_question[question]
    eval_data["id"] = int(question_id)
    with (output_folder / f"{question_id}.json").open("w") as f:
        json.dump(eval_data, f, indent=2)


  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:00<00:00, 269.33it/s]
