In [1]:
import os

# Adjust the dataset name based on what you called it when uploading
input_path = "/kaggle/input/mimic-iv-ext-direct/"  # Replace 'mimic-iv-ext-direct' with your dataset slug

# List contents
print("Diagnostic KG files:", os.listdir(os.path.join(input_path, "diagnostic_kg/Diagnosis_flowchart")))
print("Sample folders:", os.listdir(os.path.join(input_path, "samples/Finished")))

Diagnostic KG files: ['Migraine.json', 'Gastro-oesophageal Reflux Disease.json', 'Peptic Ulcer Disease.json', 'Stroke.json', 'Multiple Sclerosis.json', 'Atrial Fibrillation.json', 'Pituitary Disease.json', 'Hypertension.json', 'COPD.json', 'Aortic Dissection.json', 'Cardiomyopathy.json', 'Thyroid Disease.json', 'Upper Gastrointestinal Bleeding.json', 'Asthma.json', 'Adrenal Insufficiency.json', 'Pneumonia.json', 'Tuberculosis.json', 'Acute Coronary Syndrome.json', 'Epilepsy.json', 'Heart Failure.json', 'Alzheimer.json', 'Diabetes.json', 'Pulmonary Embolism.json', 'Hyperlipidemia.json']
Sample folders: ['Pneumonia', 'Stroke', 'Peptic Ulcer Disease', 'Asthma', 'Pituitary Disease', 'Migraine', 'Diabetes', 'Acute Coronary Syndrome', 'Tuberculosis', 'Hypertension', 'Hyperlipidemia', 'Pulmonary Embolism', 'Heart Failure', 'Alzheimer', 'Aortic Dissection', 'Thyroid Disease', 'Atrial Fibrillation', 'Adrenal Insufficiency', 'Upper Gastrointestinal Bleeding', 'COPD', 'Cardiomyopathy', 'Multiple 

In [2]:
import json

# Load a sample knowledge graph
kg_path = os.path.join(input_path, "diagnostic_kg/Diagnosis_flowchart", "Heart Failure.json")
with open(kg_path, 'r') as f:
    kg_data = json.load(f)

# Explore
print("Diagnostic Tree:")
print(json.dumps(kg_data["diagnostic"], indent=2))
print("\nKnowledge Premises:")
print(json.dumps(kg_data["knowledge"], indent=2))

Diagnostic Tree:
{
  "Suspected Heart Failure": {
    "Strongly Suspected Heart Failure": {
      "Heart Failure": {
        "HFrEF": [],
        "HFmrEF": [],
        "HFpEF": []
      }
    }
  }
}

Knowledge Premises:
{
  "Suspected Heart Failure": {
    "Risk Factors": "CAD, Hypertension, Valve disease, Arrhythmias, CMPs, Congenital heart disease, Infective, Drug-induced, Infiltrative, Storage disorders, Endomyocardial disease, Pericardial disease, Metabolic, Neuromuscular disease; etc.",
    "Symptoms": "Typical: Breathlessness, Orthopnoea, Paroxysmal nocturnal dyspnoea, Reduced exercise tolerance, Fatigue, tiredness, increased time to recover after exercise, Ankle swelling. Less typical: Nocturnal cough, Wheezing, Bloated feeling, Loss of appetite, Confusion (especially in the elderly), Depression, Palpitation, Dizziness, Syncope.; etc.",
    "Signs": "More specific: Elevated jugular venous pressure, Hepatojugular reflux, Third heart sound (gallop rhythm), Laterally displaced api

In [3]:
# Load a sample annotated note
sample_path = os.path.join(input_path, "samples/Finished/Stroke/Hemorrhagic Stroke/stroke_sample1.json")
with open(sample_path, 'r') as f:
    sample_data = json.load(f)

# Explore
print("Sample Note:")
print(json.dumps(sample_data, indent=2))

Sample Note:
{
  "Hemorrhagic Stroke$Intermedia_3": {
    "Intracerebral hemorrhage is a direct diagnostic criterion for hemorrhagic stroke$Cause_1": {
      "have cerebellar hemorrhage$Input2": {}
    },
    "There was a 10 mm intraparenchymal hemorrhage in the right cerebellar hemisphere, which caused a mild compression effect on the fourth ventricle. Intraparenchymal hemorrhage is a typical manifestation of hemorrhagic stroke.$Cause_1": {
      "10 mm intraparenchymal hemorrhage in the right cerebellar hemisphere$Input6": {}
    },
    "MRI results showed a large susceptibility artifact in the right cerebellar hemisphere, corresponding to mild hyperintensity on T1-weighted imaging and peripheral hyperintensity on T2/FLAIR, consistent with subacute hemorrhage in the right cerebellar hemisphere seen on previous CT.$Cause_1": {
      "large focus of susceptibility artifact in the right cerebellar hemisphere, with corresponding mild intrinsic T1 hyperintensity and peripheral T2/FLAIR hy

In [6]:
import re

# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Replace newlines and tabs with spaces
    text = text.replace('\n', ' ').replace('\t', ' ')
    # Remove excessive punctuation, keep clinically relevant ones
    text = re.sub(r'[^\w\s.,;:>-]', ' ', text)  # Keep letters, numbers, spaces, and basic punctuation
    # Collapse multiple spaces into one and strip
    text = ' '.join(text.split()).strip()
    return text

In [7]:
import pandas as pd

# Preprocess Knowledge Graphs
kg_docs = []
for kg_file in os.listdir(os.path.join(input_path, "diagnostic_kg/Diagnosis_flowchart")):
    if kg_file.endswith(".json"):
        with open(os.path.join(input_path, "diagnostic_kg/Diagnosis_flowchart", kg_file), 'r') as f:
            kg = json.load(f)
        label = kg_file.replace('.json', '')
        knowledge_text = "; ".join([f"{k}: {v}" for k, v in kg["knowledge"].items()])
        knowledge_text = preprocess_text(knowledge_text)
        kg_docs.append({"label": label, "text": knowledge_text})

kg_df = pd.DataFrame(kg_docs)
kg_df['source'] = 'knowledge_graph'
print("Knowledge Graph DataFrame:")
print(kg_df.head())

Knowledge Graph DataFrame:
                               label  \
0                           Migraine   
1  Gastro-oesophageal Reflux Disease   
2               Peptic Ulcer Disease   
3                             Stroke   
4                 Multiple Sclerosis   

                                                text           source  
0  suspected epilepsy: risk factors : genetic pre...  knowledge_graph  
1  suspected gastro-oesophageal reflux disease: r...  knowledge_graph  
2  suspected peptic ulcer disease: risk factors :...  knowledge_graph  
3  suspected stroke: risk factors : hypertension,...  knowledge_graph  
4  suspected multiple sclerosis: risk factors : g...  knowledge_graph  


In [8]:
# Preprocess Annotated Notes
sample_docs = []
for root, dirs, files in os.walk(os.path.join(input_path, "samples/Finished")):
    for file in files:
        if file.endswith(".json"):
            file_path = os.path.join(root, file)
            with open(file_path, 'r') as f:
                data = json.load(f)
            note_text = " ".join([data.get(f"input{i}", "") for i in range(1, 7) if f"input{i}" in data])
            note_text = preprocess_text(note_text)
            label = os.path.basename(os.path.dirname(file_path))
            sample_docs.append({"label": label, "text": note_text})

sample_df = pd.DataFrame(sample_docs)
sample_df['source'] = 'annotated_note'
print("Sample Notes DataFrame:")
print(sample_df.head(20))

Sample Notes DataFrame:
                  label                                               text  \
0   Bacterial Pneumonia  lethargy the day prior he was noticably confus...   
1   Bacterial Pneumonia  chills, fatigue, cough he is 61 yo retired col...   
2   Bacterial Pneumonia  shortness of breath her symptoms started about...   
3   Bacterial Pneumonia  fever her symptoms began approximately 10 days...   
4   Bacterial Pneumonia  fever ms. ___ is a ___ with history of copd un...   
5   Bacterial Pneumonia  bilateral ear pain, cough, shortness of breath...   
6   Bacterial Pneumonia  neutropenic fever he is 61 year-old caucasian ...   
7   Bacterial Pneumonia  none ___ woman with aml, s p sct ___ years ago...   
8   Bacterial Pneumonia  fever and cough he is with a history of hcv an...   
9   Bacterial Pneumonia  fever, cough she is with history of copd uncer...   
10  Bacterial Pneumonia  none the patient presents with a cough produci...   
11  Bacterial Pneumonia  cough she is wi

In [9]:
# Add source column
kg_df['source'] = 'knowledge_graph'
sample_df['source'] = 'annotated_note'

# Combine
docs_df = pd.concat([kg_df, sample_df], ignore_index=True)
docs_df.columns = ['label', 'text', 'source']  # Rename for clarity
print("Combined Corpus:")
print(docs_df.head(20))

# Save for later use
docs_df.to_csv("/kaggle/working/docs.csv", index=False)

Combined Corpus:
                                label  \
0                            Migraine   
1   Gastro-oesophageal Reflux Disease   
2                Peptic Ulcer Disease   
3                              Stroke   
4                  Multiple Sclerosis   
5                 Atrial Fibrillation   
6                   Pituitary Disease   
7                        Hypertension   
8                                COPD   
9                   Aortic Dissection   
10                     Cardiomyopathy   
11                    Thyroid Disease   
12    Upper Gastrointestinal Bleeding   
13                             Asthma   
14              Adrenal Insufficiency   
15                          Pneumonia   
16                       Tuberculosis   
17            Acute Coronary Syndrome   
18                           Epilepsy   
19                      Heart Failure   

                                                 text           source  
0   suspected epilepsy: risk factors : genetic p

In [10]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
embeddings = model.encode(docs_df['text'].tolist(), show_progress_bar=True)
docs_df['embeddings'] = embeddings.tolist()

# Save with embeddings
docs_df.to_pickle("/kaggle/working/docs_with_embeddings.pkl")  # Use pickle to preserve embeddings

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]