In [1]:
import os
import re
import uuid
import json
import numpy as np
from typing import List
from copy import deepcopy
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from pathlib import Path
from collections import defaultdict
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from typing import Optional, Any, defaultdict
from langchain.chat_models import init_chat_model
from pydantic import BaseModel, Field, field_validator
from langchain.output_parsers import PydanticOutputParser
from pydantic_settings import BaseSettings, SettingsConfigDict
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser

In [2]:
def load_env():
	env_file_path = ".env"
	if os.path.exists(env_file_path):
		load_dotenv(env_file_path, override=True)
		print("Loaded environment variables")
	else:
		print(f"Error: .env file not found at {env_file_path}")

load_env()

class ModelSettings(BaseSettings):
	model: str
	temperature: Optional[float] = 0
	max_tokens: Optional[int] = None

class ProcessModelSettings(BaseSettings):
	gpt41: ModelSettings
	gpt4o_mini: ModelSettings
	gpt41_mini: ModelSettings
	o1_mini: ModelSettings
	gpt41_nano: ModelSettings
	deepseek: ModelSettings
	phi4: ModelSettings
	embedding: ModelSettings


class Settings(BaseSettings):
	model_config = SettingsConfigDict(env_file=".env", env_nested_delimiter="__")

	# Keys
	azure_openai_api_key: Optional[str]
	azure_openai_endpoint: Optional[str]
	openai_api_version: Optional[str]

	openai_api_deployment_name_gpt4_vision_no_filters: Optional[str]
	openai_vision_api_version_gpt4_vision_no_filters: Optional[str]
	openai_api_key_gpt4_vision_no_filters: Optional[str]
	openai_base_url_gpt4_vision_no_filters: Optional[str]

	# models
	processmodel: ProcessModelSettings


settings = Settings()


Loaded environment variables


Define which documents to process

In [3]:
start_index = 2
end_index = -3
exclude = np.array([21])
exclude-=1

In [4]:
def extract_chapter_title(content):
	pattern = r"^#\s*(.+)$"
	for i, line in enumerate(content.split("\n"), start=1):
		match = re.match(pattern, line)
		if match:
			return match.group(1).strip()
	return ""

In [5]:
file_paths = ["Data/LabelWork/" + file for file in os.listdir("Data/LabelWork")]
file_paths.sort()

# Resolve end index if negative
resolved_end = len(file_paths) + end_index if end_index < 0 else end_index
indices = [i for i in range(start_index, resolved_end) if i not in exclude.tolist()]
file_paths = [file_paths[i] for i in indices]

Chapters = []
ChapterContent = {}
Content = {}
for file_path in file_paths:
	chapter_text = Path(file_path).read_text(encoding="utf-8")
	chapter = extract_chapter_title(chapter_text).strip()
	Content[chapter] = chapter_text
	Chapters.append(chapter)
	ChapterContent[chapter] = chapter_text


chapter_str = "\n- " + "\n- ".join(Chapters)
print(chapter_str)


- 03 Unconscious / decreased level of consciousness, breathing normally
- 04 Choking / foreign object
- 05 Mental health issue
- 06 Mental health crisis - suicide risk
- 07 Burns
- 08 Drowning
- 09 Scuba diving accident
- 10 Chemicals / gasses / CBRN
- 11 Accident / injury
- 12 Major incident
- 13 Road traffic accident - RTA
- 14 Violence / abuse
- 15 Allergic reaction
- 16 Child / infant - illness
- 17 Bite / insect sting
- 18 Bleeding - non-traumatic
- 19 Chest pain / cardiac disease
- 20 Diabetes
- 22 Fever / infection / sepsis
- 23 Poisoning - not related to alcohol or drugs
- 24 Functional decline
- 25 Childbirth
- 26 Gynecology / pregnancy
- 27 Stroke symptoms
- 28 Headache
- 29 Skin / rash
- 30 Hyperthermia / heat stroke / heat exhaustion
- 31 Hypothermia / cold person
- 32 Seizures (convulsions, fits)
- 33 Breathing problems
- 34 Drug abuse / intoxication / overdose
- 35 Pain in the abdomen / back / extremities
- 36 Uncertain / unidentified problem
- 37 Urinary tract
- 38 Ear,

In [None]:
summarize_template = PromptTemplate(
    template="""
You are a summarization assistant for a medical retrieval (RAG) system, tasked with producing one focused paragraph per chapter.

You are given:
- A “document” mixing relevant material for one chapter with unrelated or overlapping content.
- A “chapter_list” of all other chapters (to avoid overlap).

Here is the list of other chapters (do not include any of their content):  
{chapter_list}

Here is the source document:  
{document}

# Instructions  
1. Only include signs, symptoms, observations, causes, or clinical features that are uniquely and exclusively related to the target chapter.  
2. Do not include anything that conceptually, terminologically, or contextually fits any other chapter. Avoid general or overlapping criteria.  
3. Exclude any procedural, treatment, or management details. Focus solely on defining clinical features.  
4. Avoid restating or paraphrasing content that can belong to other chapters, even if it appears in the source document.  
5. Write a clear, professional, self-contained paragraph. Include concrete examples of the chapter's defining symptoms or scenarios.  

Important:  
- This summary will feed into an embedding-based search system; precision and exclusivity are critical.  
- Strictly avoid any conceptual or terminological overlap with other chapters.

# Output:
""",
    input_variables=["chapter_list", "document"],
)



short_summary_template = PromptTemplate(
    template="""
You are a summarization assistant for a medical retrieval (RAG) system. Your task is to generate a **strictly isolated, short summary** based **only on the provided document**, focusing on **chapter-specific clinical criteria**.

You are given:
- A “document” with content related to one medical chapter, mixed with potential overlaps.
- A “chapter_list” of other chapters to avoid conceptually or terminologically.

## source document:
{document}
---

## chapter_list (do not include or hint at these):
{chapter_list}
---

# Instructions  
1. Identify only the signs, symptoms, causes, or observable criteria that are **exclusively** tied to this chapter.
2. Do **not** include any clinical features, terms, or observations that may also belong to the chapters in `chapter_list`.  
3. Avoid procedural, treatment-related, or diagnostic advice. Focus **only** on defining features.  
4. If you are unsure whether something is exclusive to this chapter, leave it out. Prioritize precision by excluding anything that could possibly relate to other chapters.
5. The summary must be **factual, concise (max 50 words)**, and **self-contained**.  
6. **Use only the source document. Do not use external knowledge.**
7. Write the summary in key-word format, avoiding any markdown or formatting symbols

# Write the output as a single paragraph with exactly 50 words:
""",
    input_variables=["chapter_list", "document"],
)



filter_criteria_template = PromptTemplate(
    template="""
You are a medical assistant helping to curate a clean, non-overlapping list of clinical criteria for a retrieval system.

You are given:
- A list of "criteria" that may contain noise or ambiguity.
- A list of "other chapters" representing distinct medical topics to avoid.

Your task is to:
1. Identify which criteria are clearly relevant only to the intended topic.
2. Exclude any criteria that overlap conceptually, terminologically, or contextually with any of the chapters in the list.
3. Return only the clean, chapter-specific criteria — without modification or rephrasing.

# List of other chapters (to avoid):
{chapter_list}

# Input criteria:
{criteria_list}

# Output: Return a list of criteria that do NOT overlap with other chapters.
Respond only with a clean list of the accepted criteria:
""",
    input_variables=["criteria_list", "chapter_list"],
)




def call_llm_template(template, input_data):

	try:
		llm = init_chat_model(**settings.processmodel.gpt41.model_dump(exclude_none=True))
		chain = template | llm | StrOutputParser()
		return chain.invoke(input_data)
	except Exception as e:
		print(f"Error calling LLM: {e}")
	
	
	print("- Warning: Retrying with gpt4o-mini...")
	try:
		llm = init_chat_model(**settings.processmodel.gpt4o_mini.model_dump(exclude_none=True))
		chain = template | llm | StrOutputParser()
		return chain.invoke(input_data)
	except Exception as e:
		print(f"Error calling LLM: {e}")


	print("- Warning: Retrying with gpt41-mini...")
	try:
		llm = init_chat_model(**settings.processmodel.gpt41_mini.model_dump(exclude_none=True))
		chain = template | llm | StrOutputParser()
		return chain.invoke(input_data)
	except Exception as e:
		print(f"Error calling LLM: {e}")
	

	print("- Warning: Retrying with o1_mini...")
	try:
		llm = init_chat_model(**settings.processmodel.o1_mini.model_dump(exclude_none=True))
		chain = template | llm | StrOutputParser()
		return chain.invoke(input_data)
	except Exception as e:
		print(f"Error calling LLM: {e}")
	
	print("- Warning: Retrying with gpt41_nano...")
	try:
		llm = init_chat_model(**settings.processmodel.gpt41_nano.model_dump(exclude_none=True))
		chain = template | llm | StrOutputParser()
		return chain.invoke(input_data)
	except Exception as e:
		print(f"Error calling LLM: {e}")
	

	print("- Warning: Retrying with deepseek model...")
	try:
		llm = init_chat_model(**settings.processmodel.deepseek.model_dump(exclude_none=True))
		chain = template | llm | StrOutputParser()
		cot_response = chain.invoke(input_data)
		response =  re.sub(r"<think>.*?</think>", "", cot_response, flags=re.DOTALL).strip()
		return response.strip()
	except Exception as e:
		print(f"Error calling LLM: {e}")
		
	
	display("- Warning: FAILED to generate criteria for empty categories.")

def retry_llm_generation(template, input_data):
	response = None
	while True:
		response = call_llm_template(template, input_data)
		if not response:
			print("Error: None response from the model")
			continue

		response = response.strip().replace("\n\n", "\n")
		words = len(response.split(" "))
		print(f"Response length: {words} words")
		if words < 40 or words > 50:
			print(f"Warning: Response length {words} is outside the expected range (40-50). Retrying...")
			continue

		break
	
		
	return response

In [6]:
from collections import defaultdict

def longest_common_word_prefix(a, b):
	a_words = a.split()
	b_words = b.split()
	min_len = min(len(a_words), len(b_words))
	prefix_words = []
	for i in range(min_len):
		if a_words[i] == b_words[i]:
			prefix_words.append(a_words[i])
		else:
			break
	return ' '.join(prefix_words)


def format_text(criteria_list):

	# Step 1: Find common word prefixes between all pairs
	prefix_counter = defaultdict(set)

	for i, c1 in enumerate(criteria_list):
		for j in range(i+1, len(criteria_list)):
			c2 = criteria_list[j]
			prefix = longest_common_word_prefix(c1, c2)
			if len(prefix.split()) >= 3:  # threshold to avoid noise
				prefix_counter[prefix].add(c1)
				prefix_counter[prefix].add(c2)

	# Step 2: Keep only those used by 2+ unique lines
	valid_prefixes = {p for p, v in prefix_counter.items() if len(v) >= 2}

	# Step 3: Remove shorter prefixes that are sub-prefixes of longer ones
	final_prefixes = []
	for p in sorted(valid_prefixes, key=lambda x: -len(x)):
		if not any(p != other and p.startswith(other) for other in final_prefixes):
			final_prefixes.append(p)

	# Step 4: Group lines
	grouped = defaultdict(list)

	for line in criteria_list:
		matched = False
		for prefix in final_prefixes:
			if line.startswith(prefix):
				grouped[prefix].append(line[len(prefix):].lstrip(" :"))
				matched = True
				break
		if not matched:
			grouped["Other"].append(line)

	prefix = "" #"This chapter is relevant when the patient shows one or more of these symptoms:\n\n"

	# Step 5: Print
	result = ""
	for group, items in grouped.items():
		if group != "Other":
			result += f"\n\n{group}"
		for item in items:
			if item.strip() != "":
				result += f"\n- {item}"
	return prefix + result.strip()

def extract_sections_list(content):
	pattern = r'(^## (?!#).*?(?=^## (?!#)|\Z))'
	sections = re.findall(pattern, content, flags=re.MULTILINE | re.DOTALL)
	section_list = []
	for sec in sections:
		sec = sec.strip()
		lines = sec.splitlines()
		if lines:
			header = lines[0][3:].strip()  # Remove "## " prefix
			body = "\n".join(lines[1:]).strip()
			section_list.append((header, body))  # Store as a tuple
	return section_list

def get_criteria(text):
	criteria_pattern = r"^-\s*(Critical|Urgent|Normal)\s*\|\s*([^|\n]+)(?:\|\s*([^|\n]+))?$"
	matches = re.findall(criteria_pattern, text, flags=re.MULTILINE)
	criteria_list = []
	for _, criteria, _ in matches:
		criteria = criteria.strip()
		# Remove useless criteria/advices
		if "other, not urgent" in criteria.lower() or "other symptoms related to this page" in criteria.lower():
			continue
		criteria_list.append(criteria.strip())
	return criteria_list


def get_scenario(text):
	scenario_match = re.search(r"### SCENARIO\s*(.*?)(?=\n###|\n##|\n#|$)", text, re.DOTALL)
	scenario_content = scenario_match.group(1).strip() if scenario_match else None
	scenario_content = scenario_content.lower().replace("\n", ", ").replace("-", "").strip()
	scenario_content = " ".join(scenario_content.split()).strip()
	if not scenario_content:
		raise Exception("No scenario content found")
	return scenario_content

In [7]:
short_summary_template = PromptTemplate(
    template="""
You are a summarization assistant for a medical retrieval (RAG) system. Your job is to write a **concise, keyword-style paragraph** summarizing a single medical chapter.

## Your Inputs:
- `document`: The full text of one medical chapter's criteria.
- `chapter_list`: A list of **other chapters** that may contain overlapping or similar symptoms.
---

## chapter_list (STRICTLY EXCLUDE anything overlapping with these):
{chapter_list}
---

## Current chapter (for context):
{chapter}
---

## document (your ONLY source of truth):
{document}

## TASK INSTRUCTIONS:

1. Extract only symptoms, conditions, causes, or observable signs that are **distinctively associated with the current chapter**.
2. **Omit any information** (symptoms, words, phrases) that may also apply to chapters listed in `chapter_list`.
3. Ignore treatments, tests, procedures, advice, or questions. Include only observable, clinical features.
4. Write a **single flowing paragraph** using **keywords or short phrases** only — no complete sentences, no bullet points, no lists.
5. Do **not** paraphrase, infer, or invent. Use terms exactly as found in the document.
6. The summary must contain **exactly between 20 and 30 words**. Count and verify this before returning your output.

---

## FORMAT:
Return only the summary paragraph (no title, no formatting, no quotation marks). It must be between 20-30 words.
""",
    input_variables=["chapter_list", "document", "chapter"]
)



In [29]:
with open("Data/Chapter_Short_Summaries2.jsonl", "w", encoding="utf-8") as f_out:
	for i, (chapter, document) in enumerate(Content.items()):
		section_list = extract_sections_list(document)
		relevant_list = []
		for index, (header, body) in enumerate(section_list):
			header = header.strip().lower()
			body = body.strip()

			if header == "criteria":
				criteria = get_criteria(body)
				relevant_list.extend(criteria)
			elif header == "emergency response":
				scenario = get_scenario(body)
				relevant_list.append(scenario)

		relevant_text = f"Chapter: {chapter}\n\n" + format_text(list(set(relevant_list)))
		print(f"\nProcessing {i+1}/{len(Content)}: {chapter}")
		chapter_list = [cur_chapter for cur_chapter in Chapters if cur_chapter != chapter]
		chapter_str = "\n- " + "\n- ".join(chapter_list)

		while True:
			response = call_llm_template(short_summary_template, {"document": relevant_text, "chapter_list": chapter_str, "chapter": chapter})
			if not response:
				print("Error: None response from the model")
				break

			response = response.strip().replace("\n\n", "\n")
			words = len(response.split(" "))
			print(f"Response length: {words} words")
			if words < 20 or words > 30:
				print(f"Warning: Response length {words} is outside the expected range (15-20). Retrying...")
				continue
			break

		if not response:
			response = ""


		entry = {
			"chapter": chapter,
			"summary": response
		}

		print("relevant_text:\n",relevant_text)
		print("-" * 50)
		print("chapter:", chapter)
		print("summary:", response)
		print("=" * 50)

		f_out.write(
			json.dumps(entry, ensure_ascii=False) + "\n"
		)



Processing 1/36: 03 Unconscious / decreased level of consciousness, breathing normally
Response length: 25 words
relevant_text:
 Chapter: 03 Unconscious / decreased level of consciousness, breathing normally

Criteria:

Temporary loss of consciousness (fainting, passing out)
- after taking GTN

free airway child, is the
- infant (under 1 year) breathing normally?
- child breathing normally?
- Drowsy (decreased level of consciousness), breathing normally
- Loss of consciousness (fainting) more than once, alert now
- Has had a seizure (convulsions, fit), drowsy (decreased level of consciousness) and/or confused after 20 minutes
- Unconscious adult, breathing normally
- Has epilepsy. Gradually waking up after a seizure (convulsion, fit)
- free airway adult, is the adult breathing normally?
- Unconscious child, breathing normally
--------------------------------------------------
chapter: 03 Unconscious / decreased level of consciousness, breathing normally
summary: Temporary loss of cons



Error: None response from the model
relevant_text:
 Chapter: 13 Road traffic accident - RTA

Criteria:

- Severe pain in the head, neck, chest, pelvis or thigh
- Suspect a serious injury
- Involved in a traffic accident, no symptoms at this time
- Weakness or loss of sensation in the legs (suspect neck or back injury) Look out for numbness or loss of sensation.
- Injury to the face or throat, possible threat to airway
- Slight pain or discomfort in the neck, no additional symptoms
- Pain in the neck or back, no loss of sensation in the arms and legs
- Head injury - remembers little about what happened. May also be nauseous or dizzy
- Danger og hypothermia / low body temperature Se 31 Hypotermi
- Pale, clammy skin (cold sweating)
- Has an injury, high risk patient
- Gaping wound, may need stitches
- Shortness of breath or difficulty breathing
- Major blood loss
- Severe pain or deformity in the face, shoulder, ribs, arm, hand, neck of femur, knee, ankle, foot (possible fracture or joint



Error: None response from the model
relevant_text:
 Chapter: 14 Violence / abuse

Criteria:

- Sexual violence
- Danger of serious violence
- Possible abuse
- Shortness of breath or difficulty breathing
- Severe pain or deformity in the face, shoulder, ribs, arm, hand, neck of femur, knee, ankle, foot (possible fracture or joint injury)
- Severe pain
- Has been unconscious, alert now
- Aggressive person threatening violence
- Minor injuries (cuts and bruises) Se 11 Skade
- Blunt force to the throat
- Bleeding that has stopped or can be stopped
- Unconscious (unresponsive)

Violence, has been or is being abused -
- Pale, clammy skin (cold sweating)
- Weakness or loss of sensation in the legs, newly observed neurological symptoms (suspect neck or back injury)
- Major blood loss, still bleeding
- Alert but decreasing consciousness (fainting)
- Injury due to knifing, shooting or blunt weapon

Victim of violence
- without injury
- with acute psychological reaction
--------------------------



Error: None response from the model
relevant_text:
 Chapter: 18 Bleeding - non-traumatic

Criteria:

- Vomiting small amounts of fresh blood
- Post-operative bleeding in the throat
- Pale, clammy skin (cold sweating), still bleeding
- Major rectal bleed
- Bleeding after an operation or an endoscopy
- Coughing up large amounts of fresh blood
- Vomiting dark blood, fine otherwise
- Moderate rectal bleed without other symptoms
- Shortness of breath or difficulty breathing
- Vomiting dark blood similar to coffee grounds, seems weak and tired
- Alert, but decreasing consciousness
- Major nosebleed lasting more than 20 mins.
- Skin sores bleeding a little all the time
- Coughing up small smounts of blood mixed with phlegm
- Small amounts of blood on stools or on toilet paper
- Vomiting large amounts of fresh blood
- Unconscious (unresponsive)
- Black, tarry stools without other symptoms

Large amount of black, tarry
- stools
- stools, feels unwell, sudden onset
------------------------------

In [8]:
def extract_chapter_title(content):
    pattern = r"^#\s*(.+)$"
    for i, line in enumerate(content.split("\n"), start=1):
        match = re.match(pattern, line)
        if match:
            return match.group(1).strip()
    return ""

def extract_sections_list(content):
	pattern = r'(^## (?!#).*?(?=^## (?!#)|\Z))'
	sections = re.findall(pattern, content, flags=re.MULTILINE | re.DOTALL)
	section_list = []
	for sec in sections:
		sec = sec.strip()
		lines = sec.splitlines()
		if lines:
			header = lines[0][3:].strip()  # Remove "## " prefix
			body = "\n".join(lines[1:]).strip()
			section_list.append((header, body))  # Store as a tuple
	return section_list

In [9]:
file_paths_json = ["Data/Json/Unique_Criteria_Scenario/" + file for file in os.listdir("Data/Json/Unique_Criteria_Scenario")]
file_paths_json.sort()

# Resolve end index if negative
resolved_end = len(file_paths_json) + end_index if end_index < 0 else end_index
indices = [i for i in range(start_index, resolved_end) if i not in exclude.tolist()]
file_paths_json = [file_paths_json[i] for i in indices]


criteria_chapter_map = defaultdict(list)
for file_path in file_paths_json:
	with open(file_path, "r", encoding="utf-8") as f:
		for line in f:
			entry = json.loads(line.strip()) 
			if not entry:
				continue
			chapter = entry.get("chapter")
			criteria = entry.get("criteria")
			criteria_chapter_map[chapter].append(criteria)
		
		

print(json.dumps(criteria_chapter_map, indent=2, ensure_ascii=False))

{
  "03 Unconscious / decreased level of consciousness, breathing normally": [
    "Unconscious adult, breathing normally",
    "Unconscious child, breathing normally",
    "Drowsy (decreased level of consciousness), breathing normally",
    "Has had a seizure (convulsions, fit), drowsy (decreased level of consciousness) and/or confused after 20 minutes",
    "Loss of consciousness (fainting) more than once, alert now",
    "Temporary loss of consciousness (fainting, passing out)",
    "Temporary loss of consciousness (fainting, passing out) after taking GTN",
    "Has epilepsy. Gradually waking up after a seizure (convulsion, fit)",
    "free airway child, is the infant (under 1 year) breathing normally?",
    "free airway child, is the child breathing normally?",
    "free airway adult, is the adult breathing normally?"
  ],
  "04 Choking / foreign object": [
    "May have an airway blockage, unconscious or losing consciousness",
    "May have an airway blockage, difficulty breathing

### Generate criteria advice pairs

In [None]:
fp_json = ["Data/Json/Unique_Criteria/" + file for file in os.listdir("Data/Json/Unique_Criteria")]
fp_json.sort()

# Resolve end index if negative
resolved_end = len(fp_json) + end_index if end_index < 0 else end_index
indices = [i for i in range(start_index, resolved_end) if i not in exclude.tolist()]
fp_json = [fp_json[i] for i in indices]


criteria_advice_mapping = defaultdict(list)
for file_path in fp_json:
	with open(file_path, "r", encoding="utf-8") as f:
		for line in f:
			entry = json.loads(line.strip()) 
			if not entry:
				continue

			data = entry["entry"]

			criteria = data[0]["content"]
			chapter = data[1]["content"]
			advices = data[1]["advices"]

			criteria_advice_mapping[chapter].append({
				"criteria": criteria,
				"advices": advices
			})

print(json.dumps(criteria_advice_mapping, indent=2, ensure_ascii=False))

{
  "03 Unconscious / decreased level of consciousness, breathing normally": [
    {
      "criteria": "Unconscious adult, breathing normally",
      "advices": [
        "\nIMPORTANT INFORMATION TO THE CALLER\n- Help is on the way. I may need to phone you back, so keep this phone free until the medics arrive.\n- Watch the person all the time. Tell me immediately if anything changes.",
        "\nMEDICATION\n- Any medication being used by the person must accompany them at all times."
      ]
    },
    {
      "criteria": "Unconscious child, breathing normally",
      "advices": [
        "\nIMPORTANT INFORMATION TO THE CALLER\n- Help is on the way. I may need to phone you back, so keep this phone free until the medics arrive.\n- Watch the person all the time. Tell me immediately if anything changes.",
        "\nMEDICATION\n- Any medication being used by the person must accompany them at all times."
      ]
    },
    {
      "criteria": "Drowsy (decreased level of consciousness), bre

In [42]:
import itertools
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain.embeddings import init_embeddings
embed_model = init_embeddings("ollama:nomic-embed-text:latest")


def clean_chapter(text):
	# text = text.replace("/", ",")
	text = re.sub(r'#', '', text) # Remove hashtags
	text = re.sub(r'\d+', '', text)  # Remove numbers
	return text.strip()



#docs_summary = []

docs_criteria_advices = []
compiled_criteria = []
docs_criteria = []
docs_chapter = []


for i, (chapter, criteria_list) in enumerate(criteria_chapter_map.items()):

	content = ChapterContent[chapter]
	data_list = criteria_advice_mapping[chapter]
	advices_list = list(set(list(itertools.chain.from_iterable([entry["advices"] for entry in data_list]))))
	

	# full document embedding
	embed_str_doc = "search_document:\n" + content.replace("\n\n", "\n")
	docs_chapter.append(
		Document(page_content=embed_str_doc, metadata={"chapter": chapter})
	)

	# criteria and advices embedding
	criteria_advice_str = f'search_document:\nCHAPTER: "{clean_chapter(chapter)}"\n\nCRITERIA:\n\n' + format_text(criteria_list) + "\n\nADVICES:\n" + "\n\n- ".join(advices_list)
	criteria_advice_str = "\n".join(
		line for line in criteria_advice_str.splitlines()
		if not (line.strip().startswith("-") and line.strip() == "-")
	)
	docs_criteria_advices.append(
		Document(page_content=criteria_advice_str, metadata={"chapter": chapter})
	)
	

	# compiled criteria embedding
	compiled_criteria_str = f'search_document:\nCHAPTER: "{clean_chapter(chapter)}"\n\nCRITERIA:\n' + format_text(criteria_list)
	compiled_criteria.append(
		Document(page_content=compiled_criteria_str, metadata={"chapter": chapter})
	)
	

	# criteria embedding
	criteria_str = f'search_document:\nChapter: "{clean_chapter(chapter)}"\n\nCriteria:\n- ' + "\n- ".join(criteria_list)
	docs_criteria.append(
		Document(page_content=criteria_str, metadata={"chapter": chapter})
	)

	# print(embed_str_summary)

	# print(embed_str_doc)
	#print("==========================")


# vectorstore_criteria = FAISS.from_documents(docs_summary, embed_model)
# vectorstore_criteria.save_local("VectorStores/Summary_Chapter_FAISS")

# vectorstore_criteria = FAISS.from_documents(docs_chapter, embed_model)
# vectorstore_criteria.save_local("VectorStores/Summary_Criteria_Chapter_FAISS")

# with open("Chapter_Criteria.jsonl", "w", encoding="utf-8") as f_out:
# 	for chapter, criteria in save_test.items():
# 		entry = {
# 			"chapter": chapter,
# 			"criteria": criteria
# 		}
# 		f_out.write(
# 			json.dumps(entry, ensure_ascii=False) + "\n"
# 		)


FAISS.from_documents(docs_chapter, embed_model).save_local("VectorStores2/Chapter_FAISS")
FAISS.from_documents(docs_criteria_advices, embed_model).save_local("VectorStores2/Criteria_Advices_FAISS")
FAISS.from_documents(docs_criteria, embed_model).save_local("VectorStores2/Criteria_FAISS")
FAISS.from_documents(compiled_criteria, embed_model).save_local("VectorStores2/Compiled_Criteria_FAISS")

In [106]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import init_embeddings
embed_model = init_embeddings("ollama:nomic-embed-text:latest")
criteria_vectorstore = FAISS.load_local(folder_path="VectorStores/Criteria_Chapter_FAISS", embeddings=embed_model, allow_dangerous_deserialization=True)
summary_criteria_vectorstore = FAISS.load_local(folder_path="VectorStores/Summary_Criteria_Chapter_FAISS", embeddings=embed_model, allow_dangerous_deserialization=True)
summary_vectorstore = FAISS.load_local(folder_path="VectorStores/Summary_Chapter_FAISS", embeddings=embed_model, allow_dangerous_deserialization=True)

def get_chapter_vectorstore(query, top_k=6):
    prefix = "search_query: "
    query = prefix + query
    ntotal = summary_criteria_vectorstore.index.ntotal

    def get_rank(query, k):
        summary_results = summary_criteria_vectorstore.similarity_search(query, k=k)
        seen_chapters = set()
        rank = []
        for res in summary_results:
            chapter = res.metadata.get("chapter")
            if chapter not in seen_chapters:
                seen_chapters.add(chapter)
                rank.append(chapter)
        return rank

    if top_k >= ntotal:
        return get_rank(query, k=ntotal)

    for k in range(top_k, ntotal):
        rank = get_rank(query, k=k)
        if len(rank) >= top_k:
            return rank

    # Fallback if unique chapters never reach top_k
    return get_rank(query, k=ntotal)


def get_criteria_vectorstore(query, top_k=6):
	prefix = "search_query: "
	criteria_results = criteria_vectorstore.similarity_search(prefix + query, k=top_k)
	response = []
	for res in criteria_results: 
		chap = res.metadata.get("chapter")
		response.append({"content": res.page_content, "chapter": chap})
	return response

def get_summary_vectorstore(query, top_k=6):
	prefix = "search_query: "
	criteria_results = summary_vectorstore.similarity_search(prefix + query, k=top_k)
	response = []
	for res in criteria_results: 
		chap = res.metadata.get("chapter")
		response.append({"content": res.page_content, "chapter": chap})
	return response
	

In [None]:
semantic_prompt_template = PromptTemplate(
    template="""
You are an assistant for a medical triage system. Identify the 5 most relevant medical chapters based on the user query. Each chapter has a numeric ID and a summary.

## Chapter Summaries:
{chapter_summaries}

## User Query:
{query}

# Instructions:
- Only select from the provided chapters.
- Do not hallucinate or invent any chapter names.
- Return exactly 5 chapter IDs in descending order of relevance.

# Output Format (example):
```json
{{
  "retrieved": [5,2,3,1,4]
}}
```
""",
    input_variables=["query", "chapter_summaries"]
)



def get_criteria_vectorstore(query, top_k=6):
	prefix = "search_query: "
	criteria_results = criteria_vectorstore.similarity_search(prefix + query, k=top_k)
	response = []
	for res in criteria_results: 
		chap = res.metadata.get("chapter")
		response.append({"content": res.page_content, "chapter": chap})
	return response


In [None]:
query =  {
	"condition": "Unconscious, not breathing normally, difficulty breathing",
	"observations": "Complaining about feeling ill for a couple of days",
	"cause": "Unknown medical condition",
	"callers_actions": "Helping patient"
}
query = json.dumps(query, indent=2, ensure_ascii=False)
retrieval = get_criteria_vectorstore(query, top_k=15)
rag_candidates = [result["chapter"].lower() for result in retrieval]
display("🔐 Retrieved Candidate Chapters (Criteria):",rag_candidates)

In [None]:
summary =  {
	"condition": "conscious, not breathing normally, difficulty breathing",
	"observations": "fire and Burning injuries",
	"cause": "Unknown medical condition",
}


summary = json.dumps(summary, indent=2, ensure_ascii=False)
retrieval = get_summary_vectorstore(summary, top_k=10)
candidates = [result["chapter"] for result in retrieval]
display("🔐 Retrieved Candidate Chapters (Summary of Criteria):",candidates)
print("-"*50)
retrieval = get_chapter_vectorstore(summary, top_k=5)
display("🔐 Retrieved Candidate Chapters (Summary + Criteria):",retrieval)
print("-"*50)
retrieval = get_criteria_vectorstore(summary, top_k=5)
candidates = [result["chapter"] for result in retrieval]
display("🔐 Retrieved Candidate Chapters (Criteria):",candidates)
print("-"*50)

'🔐 Retrieved Candidate Chapters (Summary of Criteria):'

['36 Uncertain / unidentified problem',
 '22 Fever / infection / sepsis',
 '33 Breathing problems',
 '12 Major incident',
 '10 Chemicals / gasses / CBRN',
 '08 Drowning',
 '30 Hyperthermia / heat stroke / heat exhaustion',
 '04 Choking / foreign object',
 '16 Child / infant - illness',
 '11 Accident / injury']

--------------------------------------------------


'🔐 Retrieved Candidate Chapters (Summary + Criteria):'

['07 Burns',
 '11 Accident / injury',
 '33 Breathing problems',
 '10 Chemicals / gasses / CBRN',
 '03 Unconscious / decreased level of consciousness, breathing normally']

--------------------------------------------------


'🔐 Retrieved Candidate Chapters (Criteria):'

['07 Burns',
 '33 Breathing problems',
 '11 Accident / injury',
 '36 Uncertain / unidentified problem',
 '22 Fever / infection / sepsis']

--------------------------------------------------
