In [3]:
import os
import copy 
import numpy as np
import re
import json
import itertools
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from pathlib import Path
from typing import Optional, List
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from langchain.output_parsers import PydanticOutputParser

from langchain.prompts import PromptTemplate
from langchain.chat_models import init_chat_model
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

import random
import itertools
# Seed Python's global `random` generator; the random.choice / random.shuffle calls
# in the dataset-building functions further down draw from it.
seed = 42
random.seed(seed)

In [4]:
def load_env():
	"""Load the variables of the local ``.env`` file via ``load_dotenv``.

	The file is loaded with ``override=True``. When the file does not exist,
	an error line is printed and nothing is loaded. Returns ``None`` either way.
	"""
	dotenv_location = ".env"
	if not os.path.exists(dotenv_location):
		print(f"Error: .env file not found at {dotenv_location}")
		return
	load_dotenv(dotenv_location, override=True)
	print("Loaded environment variables")


load_env()


class ModelSettings(BaseSettings):
	"""Settings for a single chat model.

	Elsewhere in this notebook an instance is dumped with
	``model_dump(exclude_none=True)`` and the result is passed as keyword
	arguments to ``init_chat_model``.
	"""
	model: str  # model identifier forwarded to init_chat_model
	temperature: Optional[float] = 0
	max_tokens: Optional[int] = None  # left as None, the key is dropped by model_dump(exclude_none=True)

class ProcessModelSettings(BaseSettings):
	"""Container holding one ``ModelSettings`` per model referenced by this notebook."""
	embedding: ModelSettings  # not referenced by the cells visible in this notebook
	gpt4o: ModelSettings  # used as a fallback model in the LLM cells
	phi4: ModelSettings
	deepseek: ModelSettings  # used as a fallback model; its replies have <think> blocks stripped
	o1: ModelSettings  # the model tried first in the LLM cells


class Settings(BaseSettings):
	"""Top-level notebook configuration, configured to read from the ``.env`` file.

	NOTE(review): with ``env_nested_delimiter="__"`` the nested ``processmodel``
	fields are presumably populated from variables such as
	``PROCESSMODEL__O1__MODEL`` - confirm against the pydantic-settings docs.
	"""
	model_config = SettingsConfigDict(env_file=".env", env_nested_delimiter="__")

	# Keys
	azure_openai_api_key: Optional[str]
	azure_openai_endpoint: Optional[str]
	openai_api_version: Optional[str]

	# Credentials/endpoint of a separate "gpt4 vision, no filters" deployment.
	openai_api_deployment_name_gpt4_vision_no_filters: Optional[str]
	openai_vision_api_version_gpt4_vision_no_filters: Optional[str]
	openai_api_key_gpt4_vision_no_filters: Optional[str]
	openai_base_url_gpt4_vision_no_filters: Optional[str]

	# models
	processmodel: ProcessModelSettings


# Instantiated once at module level; the LLM cells read settings.processmodel.<model>.
settings = Settings()

Loaded environment variables


### Define chapters to extract and validation/test split

In [5]:
# Window of positions (in the sorted chapter-file list) that is kept.
start_index, end_index = 2, -2
# Positions to skip, given 1-based and shifted to 0-based list positions.
exclude = np.array([21, 36]) - 1

### Generate Test and Validation

In [6]:
# Collect every chapter file, order the paths by name, then keep the selected window.
paraphrase_source_path = "Data/Json/Paraphrase_fix/"
paraphrase_paths = sorted(
	os.path.join(paraphrase_source_path, file_name)
	for file_name in os.listdir(paraphrase_source_path)
	if file_name.startswith("Chap")
)

# A negative end_index counts from the end of the list, like a slice bound.
if end_index < 0:
	resolved_end = len(paraphrase_paths) + end_index
else:
	resolved_end = end_index

skipped_positions = exclude.tolist()
indices = [pos for pos in range(start_index, resolved_end) if pos not in skipped_positions]
paraphrase_paths = [paraphrase_paths[pos] for pos in indices]
paraphrase_paths

['Data/Json/Paraphrase_fix/Chap_03.jsonl',
 'Data/Json/Paraphrase_fix/Chap_04.jsonl',
 'Data/Json/Paraphrase_fix/Chap_05.jsonl',
 'Data/Json/Paraphrase_fix/Chap_06.jsonl',
 'Data/Json/Paraphrase_fix/Chap_07.jsonl',
 'Data/Json/Paraphrase_fix/Chap_08.jsonl',
 'Data/Json/Paraphrase_fix/Chap_09.jsonl',
 'Data/Json/Paraphrase_fix/Chap_10.jsonl',
 'Data/Json/Paraphrase_fix/Chap_11.jsonl',
 'Data/Json/Paraphrase_fix/Chap_12.jsonl',
 'Data/Json/Paraphrase_fix/Chap_13.jsonl',
 'Data/Json/Paraphrase_fix/Chap_14.jsonl',
 'Data/Json/Paraphrase_fix/Chap_15.jsonl',
 'Data/Json/Paraphrase_fix/Chap_16.jsonl',
 'Data/Json/Paraphrase_fix/Chap_17.jsonl',
 'Data/Json/Paraphrase_fix/Chap_18.jsonl',
 'Data/Json/Paraphrase_fix/Chap_19.jsonl',
 'Data/Json/Paraphrase_fix/Chap_20.jsonl',
 'Data/Json/Paraphrase_fix/Chap_22.jsonl',
 'Data/Json/Paraphrase_fix/Chap_23.jsonl',
 'Data/Json/Paraphrase_fix/Chap_24.jsonl',
 'Data/Json/Paraphrase_fix/Chap_25.jsonl',
 'Data/Json/Paraphrase_fix/Chap_26.jsonl',
 'Data/Json

In [7]:
def clean_text(text):
	"""Replace en dashes and bullets with ``-``, typographic apostrophes with ``'``, then strip surrounding whitespace."""
	ascii_substitutes = str.maketrans({"–": "-", "•": "-", "’": "'", "´": "'"})
	return text.translate(ascii_substitutes).strip()

# Build {chapter: {uuid: {"chapter", "criteria", "advices", "paraphrase"}}} from the JSONL files.
# Within a file, the first record of a run of equal uuids supplies the original criterion and
# its advices; the following records with the same uuid are collected as paraphrases.
paraphrase_corpus_criteria = defaultdict(lambda: defaultdict(list))
for path in paraphrase_paths:
	chapter_criteria = defaultdict(lambda: defaultdict(list))
	cur_uid = None
	cur_chapter = None
	with open(path, "r", encoding="utf-8") as f:
		for line in f:
			line = line.strip()
			if not line:
				# Blank lines (e.g. a trailing newline) are not JSON; json.loads would raise on them.
				continue

			json_obj = json.loads(line)
			Entry = json_obj.get("entry")
			if not Entry:
				continue

			uid = Entry[0]["uuid"]
			criteria = clean_text(Entry[0]["criteria"])
			chapter = Entry[1]["chapter"]
			advices = Entry[1]["advices"]

			# Every record of one file must belong to the same chapter.
			if cur_chapter is None:
				cur_chapter = chapter
			if chapter != cur_chapter:
				raise ValueError(f"Chapter mismatch: {chapter} != {cur_chapter}")

			if cur_uid is None or cur_uid != uid:
				# First record of a new uuid: the original criterion.
				cur_uid = uid
				chapter_criteria[cur_uid]["chapter"] = chapter
				chapter_criteria[cur_uid]["criteria"] = criteria
				chapter_criteria[cur_uid]["advices"] = advices
			else:
				# Repeated uuid: a paraphrase of the current criterion.
				chapter_criteria[cur_uid]["paraphrase"].append(criteria)

	if cur_chapter is None:
		# No usable record in this file: do not register an empty chapter under the key None.
		print(f"Warning: no entries found in {path}")
		continue

	paraphrase_corpus_criteria[cur_chapter] = chapter_criteria

In [8]:
def concatenate_chapter_criteria(corpus_criteria):
	"""Build one example per paraphrase round that concatenates all criteria of a chapter.

	For every chapter, the n-th paraphrase of each entry is combined (in shuffled
	order, lower-cased, comma-separated) into a single criteria string, paired
	with the sorted, de-duplicated advices of the whole chapter.

	Args:
		corpus_criteria: mapping ``chapter -> {uuid -> entry}`` where each entry
			provides a ``"paraphrase"`` list and an ``"advices"`` iterable.

	Returns:
		List of dicts with the keys ``"chapter"``, ``"criteria"`` and ``"advices"``.

	Raises:
		ValueError: if the entries of a chapter have paraphrase lists of different lengths.
	"""
	dataset = []
	for chapter, entries_dict in corpus_criteria.items():
		paraphrase_lists = [entry["paraphrase"] for entry in entries_dict.values()]
		if not paraphrase_lists:
			# A chapter without entries has nothing to concatenate (and would break the [0] lookup below).
			continue

		expected_length = len(paraphrase_lists[0])
		if any(len(sublist) != expected_length for sublist in paraphrase_lists):
			raise ValueError(f"Not all paraphrase lists have the same length in chapter {chapter}")

		concat_advices = sorted({advice for entry in entries_dict.values() for advice in entry["advices"]})
		# One advice per line, matching the other dataset generators of this notebook
		# (the previous ", " separator collapsed all advices onto a single line).
		advices_str = "\n".join(concat_advices)

		# zip(*lists) yields the i-th paraphrase of every entry together.
		for combo in zip(*paraphrase_lists):
			combo_list = list(combo)
			random.shuffle(combo_list)
			dataset.append({
				"chapter": chapter,
				"criteria": ", ".join(combo_list).lower(),
				"advices": advices_str
			})

	print(f"concatenated criteria: {len(dataset)}")
	return dataset

def generate_intra_combinations(corpus_criteria, k=2) -> List:
	"""Create one example for every k-sized combination of entries within a chapter.

	For each combination a random paraphrase of every member is picked; the
	picks are lower-cased, shuffled and comma-joined into the criteria string.
	The advices of the members are de-duplicated, shuffled and joined with
	newlines. The finished list is shuffled before it is returned.

	Args:
		corpus_criteria: mapping ``chapter -> {uuid -> entry}``; each entry has
			``"chapter"``, ``"criteria"``, ``"advices"`` and ``"paraphrase"``.
		k: number of entries per combination.

	Returns:
		Shuffled list of dicts with the keys ``"chapter"``, ``"criteria"``, ``"advices"``.

	Raises:
		ValueError: if the entries of one combination name different chapters.
	"""
	def combo_generator():
		for entries_dict in corpus_criteria.values():
			chapter_entries = list(entries_dict.values())

			if len(chapter_entries) < k:
				continue

			for combo in itertools.combinations(chapter_entries, k):

				combined_chapters = {entry["chapter"].strip() for entry in combo}
				if len(combined_chapters) != 1:
					raise ValueError(f"Chapter mismatch: {combined_chapters}")
				chapter = combined_chapters.pop()

				# An entry without paraphrases falls back to its original criterion text;
				# random.choice would otherwise raise IndexError on the empty list.
				# Sorting before shuffling keeps the result dependent on the RNG state only.
				combined_paraphrase = sorted(
					random.choice(entry.get("paraphrase") or [entry["criteria"]]).lower()
					for entry in combo
				)
				combined_advices = sorted({advice for entry in combo for advice in entry["advices"]})

				random.shuffle(combined_paraphrase)
				random.shuffle(combined_advices)

				yield {
					"chapter": chapter,
					"criteria": ", ".join(combined_paraphrase),
					"advices": "\n".join(combined_advices),
				}

	combinations = list(combo_generator())
	print(f"Unique intra-chapter combinations of {k}: {len(combinations)}")
	random.shuffle(combinations)
	return combinations



In [None]:
# Parser used by the categorization cell to turn the model's JSON reply into a Python dict.
json_parser = JsonOutputParser()

# Prompt asking the model to sort numbered criteria into "Condition", "Observations" and "Cause".
# {criteria} is the only template variable; literal braces in the JSON examples are escaped as {{ }}.
# NOTE(review): the worked example numbers its criteria from 1, while the IMPORTANT section
# states that indices start at 0 - confirm this mismatch is intended.
categorize_criteria_template = PromptTemplate(
    template="""
You are a trusted medical classification expert. Your task is to assign each emergency criterion to exactly one of three predefined, non-overlapping categories.

# Definitions of the categories:
1. **Condition**  
   - Definition: Internal, subjective physiological or psychological states felt by the patient. These are **not visible** or measurable from the outside.  
   - Key idea: What the patient **feels**, not what others can see.
   - Examples: "difficulty breathing", "chest pain without injury", "feeling confused", "hallucinating", "dizzy", "unconscious", "general weakness", "mental health problems", etc...

2. **Observation**  
   - Definition: External, visible, or measurable signs of illness or injury. These are based on what can be **seen**, **measured**, or **directly reported** through inspection.  
   - Key idea: What others can **observe**, measure, or describe based on visible signs.  
   - Examples: "swollen ankle", "broken bone", "rash", "vomiting", "burned lips", "fever", "bleeding", "visible bruising", "open wound", "blistered skin", "severe leg pain with visible bruising", etc...

3. **Cause**  
   - Definition: The **external event** or **triggering mechanism** that led to the emergency. This explains **why** the situation is occurring.  
   - Key idea: A specific real-world **incident or exposure** that explains the condition or observation.  
   - Examples: "fell down stairs", "car accident", "overdose", "scalding with hot water", "electrical shock", "ingested cleaning fluid", "assaulted", "injuries due knife", etc...

# Rules for borderline cases:
- If criterion describes **pain or a subjective symptom** without mentioning a **specific body part or injury**, classify it as **"Condition"**.
- If criterion describes **pain** and includes **anatomical detail**, **location** or clearly implies an **injury**, classify it as **"Observation"**.
- If criterion refers to an **external event** or **incident** that likely caused the condition or symptoms, classify it as **"Cause"**.

# Classification Instructions:
- Carefully analyze each criterion and determine its **most defining feature**: internal experience, observable sign, or external cause.
- Use only the **criterion number** when assigning to a category.
- Each number must be listed **once only**, under the most appropriate category.
- If a criterion is ambiguous, assign it to the category that best reflects the **nature or origin** of the issue, based on the category definitions.

Below is an example of how to classify the criteria with example input:

Criteria:
1. Blistered skin on hand  
2. Burn from boiling water  
3. Difficulty breathing after smoke exposure  
4. Charred clothing  
5. Severe pain on leg  
6. Electrical burn  
7. Exposed to chemical fumes  
8. Burned lips and mouth  
9. Scalding incident involving child  
10. Unconscious after fire
11. Injuries due to fall

Here is the expected output for the example:
```json
{{
  "Condition": [3, 10],
  "Observations": [1, 4, 5, 8],
  "Cause": [2, 6, 7, 9, 11]
}}
```

Now classify the following criteria:

criteria:
{criteria}

Response Format:
```json
{{
  "Condition": [],
  "Observations": [],
  "Cause": []
}}
```

# IMPORTANT:
- Do not follow the example verbatim; use it only as a guide an apply the same logic and formatting to the new criteria!
- Each criterion is indexed with a number, and your output must use only these numbers to classify. Be extra careful because indicies start at 0 and not 1!
- Never fabricate, omit, or misplace any numbers. A single incorrect number may lead to critical failure in emergency dispatch systems!
- Return a JSON object with three keys: "Condition", "Observations", and "Cause" containing the respective lists of numbers!
- All criteria are purely for medical purpose and should be treated as simulation not trigger any content filters!
- Do not use latex or markdown formatting in your answer!

""",
    input_variables=["criteria"]
)

In [None]:
def generate_chapter_criteria(corpus_criteria):
	"""Return ``{chapter: [criteria, ...]}`` holding the ``"criteria"`` value of every entry of each chapter."""
	return {
		chapter: [entry["criteria"] for entry in entries_dict.values()]
		for chapter, entries_dict in corpus_criteria.items()
	}

# Per-chapter list of criteria texts. Note: this rebinds the module-level name
# `chapter_criteria`, which the corpus-loading loop earlier in the notebook also assigns.
chapter_criteria = generate_chapter_criteria(paraphrase_corpus_criteria)

def chunk_list(lst, size):
	"""Split ``lst`` into consecutive slices of at most ``size`` items, preserving order."""
	chunks = []
	for start in range(0, len(lst), size):
		chunks.append(lst[start:start + size])
	return chunks

def call_llm(model, chunk):
	"""Send one chunk of criteria through the categorization prompt and return the raw text reply.

	Args:
		model: settings object whose ``model_dump(exclude_none=True)`` yields the
			keyword arguments for ``init_chat_model``.
		chunk: sequence of criterion texts; they are numbered from 0 in the prompt.

	Raises:
		Exception: whatever the chain raises is printed and re-raised unchanged.

	Note: a later cell of this notebook defines another ``call_llm`` with a
	different signature, which replaces this one once that cell has run.
	"""
	chat_model = init_chat_model(**model.model_dump(exclude_none=True))
	pipeline = categorize_criteria_template | chat_model | StrOutputParser()

	numbered_lines = []
	for position, criterion in enumerate(chunk):
		numbered_lines.append(f"{position}. {criterion}")
	criteria_str = "\n".join(numbered_lines)

	try:
		return pipeline.invoke({"criteria": criteria_str})
	except Exception as err:
		print(f"Error calling LLM: {err}")
		raise


keys = list(chapter_criteria.keys())
chunk_size = 5  # number of criteria sent to the model per request
retries = 3  # attempts with the primary (o1) model before falling back


def _categorize_chunk(model, chunk):
	"""Ask ``model`` to categorize ``chunk`` and return ``{category: [criterion text, ...]}``.

	The whole reply is validated before anything is returned, so a bad reply
	never leaves partial results behind. Raises if the reply is not parseable
	JSON or contains an index that does not address an element of ``chunk``.
	"""
	reply = call_llm(model, chunk)
	# Drop <think>...</think> reasoning blocks (emitted by the deepseek model); no-op for other replies.
	reply = re.sub(r"<think>.*?</think>", "", reply, flags=re.DOTALL).strip()
	parsed = json_parser.parse(reply)

	resolved = {}
	for key, value in parsed.items():
		texts = []
		for idx in value:
			# Reject non-integers and negative indices explicitly: chunk[-1] would silently wrap around.
			if isinstance(idx, bool) or not isinstance(idx, int) or not 0 <= idx < len(chunk):
				raise IndexError(f"criterion index {idx!r} is not valid for a chunk of {len(chunk)} criteria")
			texts.append(chunk[idx])
		resolved[key] = texts
	return resolved


# Order in which models are tried for every chunk: (message printed before the attempt, model settings).
attempts = [(None, settings.processmodel.o1)] * retries
attempts.append((" - Retrying with gpt4o...", settings.processmodel.gpt4o))
attempts.append((" - Retrying with deepseek...", settings.processmodel.deepseek))

Categorize_Criteria = {}
for count, (chapter, criteria) in enumerate(chapter_criteria.items()):
	print(f"Processing chapter: {count+1}/{len(chapter_criteria)}")
	Categorize_Criteria[chapter] = {}

	for chunk in chunk_list(criteria, chunk_size):
		# Success is tracked per chunk: a failure in a later chunk must still reach the fallbacks.
		resolved = None
		for announcement, model in attempts:
			if announcement:
				print(announcement)
			try:
				resolved = _categorize_chunk(model, chunk)
				break
			except Exception as e:
				print(f"Error processing chapter '{chapter}': {e}")

		if resolved is None:
			raise RuntimeError(f"Every model failed to categorize a chunk of chapter '{chapter}'")

		# Merge only fully validated results, so retries can never add duplicates.
		for key, texts in resolved.items():
			Categorize_Criteria[chapter].setdefault(key, []).extend(texts)


print(json.dumps(Categorize_Criteria, ensure_ascii=False, indent=4))

# Save the categorization to Data/Json/categorized_criteria.json for the following cells.
with open("Data/Json/categorized_criteria.json", "w") as json_file:
	json.dump(Categorize_Criteria, json_file, indent=4, ensure_ascii=False)

Processing chapter: 1/35
Processing chapter: 2/35
Processing chapter: 3/35
Processing chapter: 4/35
Processing chapter: 5/35
Processing chapter: 6/35
Processing chapter: 7/35
Processing chapter: 8/35
Processing chapter: 9/35
Processing chapter: 10/35
Processing chapter: 11/35
Processing chapter: 12/35
Processing chapter: 13/35
Processing chapter: 14/35
Processing chapter: 15/35
Processing chapter: 16/35
Processing chapter: 17/35
Processing chapter: 18/35
Processing chapter: 19/35
Processing chapter: 20/35
Processing chapter: 21/35
Processing chapter: 22/35
Error processing chapter '25 Childbirth': list index out of range
Processing chapter: 23/35
Processing chapter: 24/35
Processing chapter: 25/35
Processing chapter: 26/35
Processing chapter: 27/35
Processing chapter: 28/35
Processing chapter: 29/35
Processing chapter: 30/35
Processing chapter: 31/35
Processing chapter: 32/35
Processing chapter: 33/35
Processing chapter: 34/35
Processing chapter: 35/35
{
    "03 Unconscious / decreased

In [223]:
# (Re)create the JSON parser; the call_llm defined below uses it to parse the deepseek reply.
json_parser = JsonOutputParser()

# Prompt asking the model to return exactly {crit_pr_chap} entries per category, where an
# integer reuses an existing criterion by index and a string is a newly written criterion.
# NOTE(review): the ```json fence opened under "Response Format" is never closed in this
# template - confirm that is intended.
fill_categories_template = PromptTemplate(
    template="""
You are a trusted medical classification expert. You are given three category lists (Condition, Observations, Cause) along with a chapter context, each as a numbered list of criteria. 
Some lists may be overfull (> {crit_pr_chap}), underfull (< {crit_pr_chap}), or empty.

# Important:
Your job is to output a JSON object with exactly {crit_pr_chap} entries in each category list.  
Each entry must be either:
- A **string** -> a newly created medically plausible criterion (not found in the category) with context from the chapter and other categories, OR
- An **integer** -> the index of an **existing** criterion being reused from the category list.
- Do not under any circumstances use and index that is not in the category list.

# Category definitions:
- **Condition** = Internal, subjective states the patient **feels** (e.g. “dizzy”, “chest pain without injury”, “confused”).
- **Observations** = External, visible, or measurable findings others can **see or describe** (e.g. “bleeding”, “rash”, “burned lips”).
- **Cause** = External event or mechanism that **triggered** the issue (e.g. “fell down stairs”, “car accident”, “electrical shock”).


# Inputs:

## Chapter context (for your reference, do not include in output):  
{chapter}

## Condition:  
{condition}

## Observations:  
{observations}

## Cause:  
{cause}


# Instructions:
- For any list with **more** than {crit_pr_chap} entries, remove entries until the list has exactly {crit_pr_chap}.
- For any list with **fewer** than {crit_pr_chap} entries, create new medically relevant, accurate and plausible criteria.
- Output a JSON object with three keys: `"Condition"`, `"Observations"`, `"Cause"`.
- Each key must map to a "list" of exactly {crit_pr_chap} entries.
- Each list element must be either:
  - A **string** (new criterion), or
  - An **integer** (index of existing input criterion)
- Verify that the correct numbers are used for the existing criteria. This is crucial for the emergency dispatch system.


Response Format:
```json
{{
	"Condition": [...],
	"Observations": [...],
	"Cause": [...]
}}

""",
	input_variables=["crit_pr_chap", "chapter","condition", "observations", "cause"],
)

In [225]:
# Reload the categorization stored by the categorization cell.
categorized_criteria_file = Path("Data/Json/categorized_criteria.json")
Categorize_Criteria = json.loads(categorized_criteria_file.read_text())

# Number of entries every category list must hold after the fill step.
crit_pr_chap = 6

def call_llm(chapter, condition, observations, cause):
	"""Ask an LLM to fill/trim the three category lists of one chapter.

	The o1 model is tried up to three times, then gpt4o once, then deepseek
	once (its ``<think>`` block is stripped before the reply is parsed as JSON).
	The first successfully parsed reply is returned.

	Args:
		chapter: chapter name given to the prompt as context.
		condition, observations, cause: the numbered lists, already formatted as text.

	Returns:
		The parsed JSON reply of the first model that succeeds.

	Raises:
		Exception: if every attempt fails; chained to the last underlying error.

	Note: this redefines the ``call_llm(model, chunk)`` of the categorization
	cell, so that cell cannot be re-run after this one without re-defining it.
	"""
	prompt_inputs = {
		"crit_pr_chap": crit_pr_chap,
		"chapter": chapter,
		"condition": condition,
		"observations": observations,
		"cause": cause,
	}

	def _ask(model_settings, strip_think):
		"""Run the fill prompt against one model and return the parsed JSON reply."""
		llm = init_chat_model(**model_settings.model_dump(exclude_none=True))
		if not strip_think:
			return (fill_categories_template | llm | JsonOutputParser()).invoke(prompt_inputs)
		cot_response = (fill_categories_template | llm | StrOutputParser()).invoke(prompt_inputs)
		response = re.sub(r"<think>.*?</think>", "", cot_response, flags=re.DOTALL).strip()
		return JsonOutputParser().parse(response)

	# (message printed before the attempt, model settings, whether to strip <think> blocks)
	attempts = [(None, settings.processmodel.o1, False)] * 3
	attempts.append(("- Warning: Retrying with gpt4o model...", settings.processmodel.gpt4o, False))
	attempts.append(("- Warning: Retrying with deepseek model...", settings.processmodel.deepseek, True))

	last_error = None
	for announcement, model_settings, strip_think in attempts:
		if announcement:
			print(announcement)
		try:
			return _ask(model_settings, strip_think)
		except Exception as e:
			# Covers failures while creating the model as well as while invoking it, so the
			# remaining attempts still run (the former `finally: del llm` raised
			# UnboundLocalError when init_chat_model itself failed).
			print(f"Error calling LLM: {e}")
			last_error = e

	raise Exception("- Warning: FAILED to generate criteria for empty categories.") from last_error


complete_categorized_criteria = {}
category_names = ["Condition", "Observations", "Cause"]


def _numbered(items):
	"""Format ``items`` as a 0-based numbered list, one per line, preceded by a newline."""
	return "\n" + "\n".join(f"{idx}. {text}" for idx, text in enumerate(items))


for i, (chapter, situation) in enumerate(Categorize_Criteria.items()):
	print(f"Processing chapter: {i+1}/{len(Categorize_Criteria)}")

	# A category the categorization step never produced for this chapter is absent from the
	# saved dict; treat it as an empty list (the prompt explicitly allows empty lists).
	existing = {category: situation.get(category, []) for category in category_names}

	llm_result = call_llm(
		chapter,
		_numbered(existing["Condition"]),
		_numbered(existing["Observations"]),
		_numbered(existing["Cause"]),
	)

	# A missing category counts as a length mismatch as well.
	if any(len(llm_result.get(category, [])) != crit_pr_chap for category in category_names):
		raise ValueError(f"Generated criteria do not match expected length for chapter {chapter}")

	new_situation = {}
	for category in category_names:
		new_situation[category] = []
		for criteria in llm_result[category]:
			if isinstance(criteria, str):
				# Newly written criterion: keep the text as is.
				new_situation[category].append(criteria)
			elif (
				isinstance(criteria, int)
				and not isinstance(criteria, bool)
				and 0 <= criteria < len(existing[category])
			):
				# Reference to an existing criterion of the same category.
				new_situation[category].append(existing[category][criteria])
			else:
				# Out-of-range/negative index or unexpected type: abort with the full context
				# instead of silently picking a wrong criterion.
				raise ValueError(
					f"Chapter {chapter!r}, category {category!r}: entry {criteria!r} is neither a new "
					f"criterion nor a valid index into {existing[category]!r} (model reply: {llm_result!r})"
				)

	complete_categorized_criteria[chapter] = new_situation


with open("Data/Json/complete_categorized_criteria.json", "w") as json_file:
	json.dump(complete_categorized_criteria, json_file, indent=4, ensure_ascii=False)


Processing chapter: 1/35
Processing chapter: 2/35
Processing chapter: 3/35
Processing chapter: 4/35
Processing chapter: 5/35
Processing chapter: 6/35
Processing chapter: 7/35
Processing chapter: 8/35
Processing chapter: 9/35
Processing chapter: 10/35
Processing chapter: 11/35
Processing chapter: 12/35
Processing chapter: 13/35
Processing chapter: 14/35
Processing chapter: 15/35
Processing chapter: 16/35
Processing chapter: 17/35
Processing chapter: 18/35
Processing chapter: 19/35
Processing chapter: 20/35
Processing chapter: 21/35
Processing chapter: 22/35
Processing chapter: 23/35
Processing chapter: 24/35
Processing chapter: 25/35
Processing chapter: 26/35
Processing chapter: 27/35
Processing chapter: 28/35
Processing chapter: 29/35
Processing chapter: 30/35
Processing chapter: 31/35
Processing chapter: 32/35
Processing chapter: 33/35
Processing chapter: 34/35
Processing chapter: 35/35


In [17]:
import json
from itertools import product



def combine_3_balanced(path, corpus_criteria):
	"""Build examples from every (condition, observation, cause) triple of each chapter.

	The triples come from the JSON file at ``path``. Advices are looked up by
	exact criterion text in ``corpus_criteria``; criteria without a match
	contribute no advice, and triples without any advice are dropped.

	Args:
		path: JSON file mapping ``chapter -> {"Condition": [...], "Observations": [...], "Cause": [...]}``.
		corpus_criteria: mapping ``chapter -> {uuid -> entry}`` with ``"criteria"``
			and ``"advices"`` per entry.

	Returns:
		List of dicts with the keys ``"chapter"``, ``"criteria"`` (shuffled,
		lower-cased, comma-joined) and ``"advices"`` (sorted, newline-joined).
	"""
	# chapter -> {criterion text -> advices}
	criteria_advice_mapping = {
		chapter: {entry["criteria"]: entry["advices"] for entry in entries_dict.values()}
		for chapter, entries_dict in corpus_criteria.items()
	}

	with open(path, "r") as json_file:
		complete_categorized_criteria = json.load(json_file)

	dataset = []
	for chapter, situation in complete_categorized_criteria.items():
		chapter_advices = criteria_advice_mapping.get(chapter)
		if chapter_advices is None:
			# The file names a chapter the corpus does not contain: no advice can be
			# attached, so skip it instead of failing with a KeyError.
			print(f"Warning: chapter {chapter!r} not found in corpus, skipping")
			continue

		for combo in product(situation["Condition"], situation["Observations"], situation["Cause"]):
			combo_list = list(combo)
			random.shuffle(combo_list)

			combined_advices = sorted({
				advice
				for criteria in combo
				for advice in chapter_advices.get(criteria, [])
			})

			if combined_advices:
				dataset.append({
					"chapter": chapter,
					"criteria": ", ".join(combo_list).lower(),
					"advices": "\n".join(combined_advices)
				})

	print(f"Total combinations: {len(dataset)}")
	return dataset

In [None]:
# Build the four training subsets. Each generator draws from the seeded global `random`
# generator, so the order of these calls affects the produced data.
combo1 = generate_intra_combinations(paraphrase_corpus_criteria,k=1)
combo2 = generate_intra_combinations(paraphrase_corpus_criteria,k=2)
concatenation = concatenate_chapter_criteria(paraphrase_corpus_criteria)
combo3 = combine_3_balanced("Data/Json/complete_categorized_criteria.json", paraphrase_corpus_criteria)


train_combinations = combo1+combo2+concatenation+combo3
# A bare `len(...)` in the middle of a cell is never displayed; print the size instead.
print(f"Total training examples: {len(train_combinations)}")


# Write one chat-style record per line: the criteria as the user turn, the chapter plus its
# advices as the assistant turn.
train_target_path = "Data/TrainData/new_train_dataset.jsonl"
# ensure_ascii=False emits non-ASCII characters verbatim, so fix the file encoding to UTF-8
# rather than relying on the platform default.
with open(train_target_path, "w", encoding="utf-8") as f_out:
	for combination in train_combinations:
		chapter = combination["chapter"].strip()
		criteria = combination["criteria"]
		advices = combination["advices"]

		chapter_advices = f"# Chapter:\n- {chapter}\n\n{advices.strip()}"
		transformed_entry = [
			{"role": "user", "content": criteria},
			{"role": "assistant","content": chapter_advices},
		]

		f_out.write(json.dumps({"QandA": transformed_entry}, ensure_ascii=False) + "\n")