In [None]:
import os
import json
import pandas as pd
from tqdm import tqdm
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset folders below are reachable.
# `drive` comes from google.colab, so this cell only runs inside Colab.
drive.mount('/content/drive')

# Dataset locations on the mounted Drive:
#   diagnostic_kg_path - directory listed (non-recursively) for knowledge-graph *.json files
#   samples_path       - directory walked recursively for annotated sample *.json files
diagnostic_kg_path = "/content/drive/My Drive/mimic-iv-ext-direct-1.0.0/Diagnosis_flowchart"
samples_path = "/content/drive/My Drive/mimic-iv-ext-direct-1.0.0/Finished"

# Function to load JSON files
def load_json(file_path):
    """Read the UTF-8 encoded JSON document at ``file_path`` and return the parsed object."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

# Load knowledge graphs
# Names of the *.json files sitting directly inside the knowledge-graph directory.
kg_files = list(filter(lambda name: name.endswith(".json"), os.listdir(diagnostic_kg_path)))

# Map each file name to its parsed JSON content.
knowledge_graphs = dict(
    (kg_name, load_json(os.path.join(diagnostic_kg_path, kg_name)))
    for kg_name in kg_files
)

# Load annotated samples
# Walk the sample tree in sorted order. Directory listing order is filesystem-dependent,
# and positions in `samples_data` are used later as document ids (FAISS row i maps to
# samples_data[i]), so the order must be reproducible across runs.
samples_data = []
for root, dirs, files in os.walk(samples_path):
    dirs.sort()  # in-place sort controls the order os.walk descends into subfolders
    for file in sorted(files):
        if file.endswith(".json"):
            samples_data.append(load_json(os.path.join(root, file)))

print(f"✅ Loaded {len(knowledge_graphs)} knowledge graphs and {len(samples_data)} clinical notes.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Loaded 24 knowledge graphs and 466 clinical notes.


In [None]:
!pip uninstall -y spacy


Found existing installation: spacy 3.7.5
Uninstalling spacy-3.7.5:
  Successfully uninstalled spacy-3.7.5


In [None]:
!python -m spacy download en_core_sci_md


/usr/bin/python3: No module named spacy


In [None]:
import re
import scispacy
import en_core_sci_sm

# Load the scispaCy pipeline once at module level; preprocess_text() reuses this `nlp` object.
nlp = en_core_sci_sm.load()

def preprocess_text(text):
    """
    Clean and normalise clinical text with the module-level spaCy pipeline ``nlp``.

    Steps:
    - lowercase
    - remove digit runs
    - replace punctuation and underscores with a space, so neighbouring words such
      as "chest-pain" or "n/v" are separated instead of being glued together
    - collapse runs of whitespace
    - tokenize, lemmatize, and drop stop words and whitespace-only tokens

    Args:
        text: raw text (str).

    Returns:
        A single string of space-separated lemmas ("" if nothing survives).
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove numbers
    # `\w` also matches "_", so underscores are listed explicitly.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    text = " ".join(text.split())  # collapse the gaps left by the removals above

    doc = nlp(text)
    # is_space guards against whitespace tokens leaking into the output as extra blanks.
    tokens = [token.lemma_ for token in doc if not (token.is_stop or token.is_space)]
    return " ".join(tokens)

def extract_input_content(record):
    """
    Collect the observation text stored in dict keys that carry a ``$Input`` marker.

    The record is traversed depth-first. For every dict key containing "$Input",
    the part of the key before the first "$Input" is collected; the key's value is
    then searched as well. Lists and tuples are traversed too, so dicts nested
    inside arrays are not missed.

    Args:
        record: parsed JSON value (dict, list, or scalar).

    Returns:
        The collected texts joined by single spaces, in traversal order, or
        ``None`` when no "$Input" key was found.
    """
    extracted_text = []

    def recursive_extract(data):
        if isinstance(data, dict):
            for key, value in data.items():
                if isinstance(key, str) and "$Input" in key:
                    # The observation text is the key itself, up to the marker.
                    extracted_text.append(key.split("$Input")[0])
                recursive_extract(value)  # keep searching below this key
        elif isinstance(data, (list, tuple)):
            for item in data:
                recursive_extract(item)

    recursive_extract(record)
    return " ".join(extracted_text) if extracted_text else None

# Apply extraction and preprocessing.
# Every sample receives a "processed_text" key (empty string when the record has no
# "$Input" observations) so later cells can read it for any sample without a KeyError.
for i, sample in enumerate(samples_data):
    raw_text = extract_input_content(sample)  # Extract clinical note text
    samples_data[i]["processed_text"] = preprocess_text(raw_text) if raw_text else ""

print("✅ Preprocessing complete! Sample output:")
if samples_data:  # nothing to preview when no samples were loaded
    print(samples_data[0]["processed_text"])


ModuleNotFoundError: No module named 'en_core_sci_sm'

In [None]:
import faiss
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np

# Load BioClinicalBERT for embeddings
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the GPU when the runtime has one; otherwise fall back to CPU instead of crashing.
embed_device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(embed_device)

# Function to generate embeddings
def get_embedding(text):
    """
    Embed text with the module-level ``tokenizer`` / ``model`` pair.

    Args:
        text: input accepted by the tokenizer (a string in this notebook); it is
            truncated to at most 512 tokens.

    Returns:
        NumPy array with the last-layer hidden state at sequence position 0
        (the CLS token) for each input row.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Follow whatever device the model lives on rather than assuming CUDA.
    inputs = inputs.to(model.device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()  # CLS token embedding

# Generate embeddings for all clinical notes.
# .get(..., "") keeps exactly one row per sample (row i <-> samples_data[i]) even if a
# sample never received "processed_text". FAISS expects a C-contiguous float32 matrix.
embeddings = np.ascontiguousarray(
    [get_embedding(sample.get("processed_text", ""))[0] for sample in tqdm(samples_data)],
    dtype=np.float32,
)

# Create FAISS index
d = embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(d)
index.add(embeddings)  # Add embeddings to FAISS index

print(f"✅ FAISS Indexing Complete! {len(embeddings)} documents indexed.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
100%|██████████| 466/466 [00:05<00:00, 90.21it/s] 

✅ FAISS Indexing Complete! 466 documents indexed.





In [None]:
def search_faiss(query, k=5):
    """
    Return the indexed notes nearest to ``query``.

    Args:
        query: query text; embedded with ``get_embedding``.
        k: maximum number of neighbours to return.

    Returns:
        List of ``(processed_text, sample_index)`` tuples in the order FAISS
        returns them. Holds fewer than ``k`` items when the index contains
        fewer than ``k`` vectors.
    """
    query_embedding = get_embedding(query)  # Convert query to embedding
    _, indices = index.search(query_embedding, k)  # Search in FAISS index
    # FAISS pads missing neighbours with -1; without this filter samples_data[-1]
    # would silently return the last sample as if it were a hit.
    hits = [int(i) for i in indices[0] if i >= 0]
    return [(samples_data[i]["processed_text"], i) for i in hits]

# Example Query
# Demo: retrieve the nearest notes for one question and preview 200 characters of each.
query = "What are the risk factors for stroke?"
faiss_results = search_faiss(query)

print("🔍 FAISS Results:")
for hit in faiss_results:
    doc, idx = hit
    preview = doc[:200]
    print(f"Index: {idx}\nText: {preview}...\n")


🔍 FAISS Results:
Index: 52
Text: vs severe blood pressure headache...

Index: 70
Text: bp   elevated blood pressure   headache...

Index: 213
Text: iaa   glucose poor glycemic control...

Index: 113
Text: upper endoscopy duodenal ulcer bleeding present melanous stool evening x day total approx   dark bloody bowel movement lab notable hct     patient sister diagnose colon cancer...

Index: 216
Text: asthma little bad control recently need use albuterol pretty day significant improvement fev   chest pressure feel like bandlike pressure sensation episode chest discomfort recur hypertriglyceridemia...



In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load Flan-T5
import gc

gen_model_name = "google/flan-t5-large"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name)

# Re-running this cell would otherwise keep the previous copy of the model alive on the
# GPU while the new one loads. Drop it and release PyTorch's cached blocks first.
gen_model = None
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Use the GPU when the runtime has one; otherwise fall back to CPU instead of crashing.
gen_device = "cuda" if torch.cuda.is_available() else "cpu"
gen_model = AutoModelForSeq2SeqLM.from_pretrained(gen_model_name).to(gen_device)

def generate_answer(query, retrieval_method="faiss", k=3):
    """
    Answer ``query`` with the generation model, using retrieved notes as context.

    Args:
        query: the question text.
        retrieval_method: retrieval backend; only "faiss" is implemented.
        k: number of notes to retrieve.

    Returns:
        The decoded model output as a string.

    Raises:
        ValueError: if ``retrieval_method`` is not supported.
    """
    if retrieval_method != "faiss":
        # Previously this fell through and crashed later on the unbound `retrieved_docs`.
        raise ValueError(
            f"Unsupported retrieval_method: {retrieval_method!r} (expected 'faiss')"
        )
    retrieved_docs = search_faiss(query, k)

    # Each note is capped at 500 characters before being placed in the prompt.
    context = "\n".join([doc[:500] for doc, _ in retrieved_docs])
    prompt = f"Patient Query: {query}\n\nRelevant Information:\n{context}\n\nProvide a concise, clinically relevant response."

    inputs = gen_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Follow whatever device the model lives on rather than assuming CUDA.
    inputs = inputs.to(gen_model.device)
    with torch.no_grad():
        # The former temperature=0.7 had no effect because sampling (do_sample=True)
        # was never enabled; it is dropped so the call reflects the decoding actually used.
        output_tokens = gen_model.generate(**inputs, max_new_tokens=150)

    return gen_tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Example Query
# Demo: run the retrieval-augmented generator on one question and show its answer.
query = "What are the risk factors for stroke?"
response = generate_answer(query)

print("🔍 Generated Response:", response, sep="\n")


OutOfMemoryError: CUDA out of memory. Tried to allocate 126.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 12.12 MiB is free. Process 148738 has 14.73 GiB memory in use. Of the allocated memory 14.60 GiB is allocated by PyTorch, and 7.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=272f78f9c8dcfe182f94b7417a9fffd01fe20c2f62a7a900d5ae5a16023f9b75
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu

# Example reference & generated response
reference = "High blood pressure, diabetes, and smoking are major risk factors for stroke."
generated = "The risk factors for stroke include hypertension, diabetes, and tobacco use."

# BLEU over whitespace-split tokens, single reference, no smoothing.
reference_tokens = reference.split()
candidate_tokens = generated.split()
bleu = sentence_bleu([reference_tokens], candidate_tokens)
print(f"🔹 BLEU Score: {bleu:.4f}")

# ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)
rouge_scores = scorer.score(reference, generated)
print(f"🔹 ROUGE Scores: {rouge_scores}")


🔹 BLEU Score: 0.0000
🔹 ROUGE Scores: {'rouge1': Score(precision=0.5454545454545454, recall=0.5, fmeasure=0.5217391304347826), 'rouge2': Score(precision=0.4, recall=0.36363636363636365, fmeasure=0.380952380952381), 'rougeL': Score(precision=0.36363636363636365, recall=0.3333333333333333, fmeasure=0.34782608695652173)}


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Apply smoothing
# Smoothing keeps BLEU from collapsing to 0 when a higher-order n-gram has no overlap.
smooth_fn = SmoothingFunction().method1
reference_tokens = reference.split()
candidate_tokens = generated.split()
bleu = sentence_bleu(
    [reference_tokens],
    candidate_tokens,
    smoothing_function=smooth_fn,
)

print(f"🔹 Smoothed BLEU Score: {bleu:.4f}")


🔹 Smoothed BLEU Score: 0.1071
