In [1]:
import re
import spacy
nlp_name = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from rapidfuzz import process, fuzz
import json

In [3]:
demo_script = """
Physician: Hello Sarah, thank you for coming in. How can I help you today?

Patient: Hi, Dr. Lee. I'm not really sure where to start. I just feel... on edge all the time. I'm constantly worried, and it's starting to wear me down.

Physician: I'm sorry to hear that. Can you tell me more about this 'worry'? What sort of things are on your mind?

Patient: Honestly, everything. It can be small things, like being late for an appointment, or big things, like my parents' health or job security. My mind just seems to find something to fixate on, and then it spirals. I imagine the worst-case scenario for everything.

Physician: And how long has this been going on?

Patient: It's been there in the background for years, I think, but it's gotten really bad over the last six months. It feels like I can't switch my brain off.

Physician: Are you experiencing any physical symptoms along with the mental anxiety?

Patient: Yes, and that's what really prompted me to come in. I've been having moments where my heart starts racing for no reason, my palms get sweaty, and I feel a bit dizzy. Sometimes I have trouble falling asleep because my thoughts are racing. I'm also clenching my jaw constantly without realizing it.

Physician: Those sound like panic attacks or at least significant physical manifestations of anxiety. How is this affecting your day-to-day life? Your work, your relationships?

Patient: At work, I have trouble concentrating. I'll read the same email three times. I'm more irritable with my partner, and I've been turning down invitations to see friends because I just don't have the energy. I feel like I'm trapped in my own head.

Physician: Thank you for sharing that, Sarah. It takes courage to talk about this. What you're describing sounds very much like Generalized Anxiety Disorder, or GAD. It's a very common and, importantly, a very treatable condition. You are not alone in this.

Patient: A disorder? That sounds so serious. I thought I was just a 'worrier'.

Physician: It's a clinical term, but it just means that the worry has become persistent and severe enough to impact your quality of life. The good news is that we have excellent, evidence-based treatments. The two main pillars are therapy and medication.

Patient: I'm a bit nervous about medication.

Physician: That's a very normal concern. A common and effective approach is Cognitive-Behavioral Therapy, or CBT. It's a type of talk therapy that helps you identify and challenge those negative thought patterns. For medication, a class of drugs called SSRIs, like Sertraline, can be very effective at 'turning down the volume' on the anxiety, which can make therapy more effective. They are non-addictive.

Patient: So I don't have to decide on medication today?

Physician: Not at all. A great first step would be a referral to a therapist who specializes in CBT. I can give you a list of trusted colleagues. We can also schedule a follow-up with me in a month to see how you feel and to revisit the medication option then if you're still struggling.

Patient: Okay. I like the idea of starting with therapy. It feels like a proactive step. Just talking about it has made me feel a little lighter, actually.

Physician: That's often the case. Recognizing the problem is the first and most important step. We'll get you the support you need.

Patient: Thank you, doctor. I really appreciate it.
"""

In [4]:
def extract_patient_name(raw_text:str):
    lines = demo_script.splitlines()
    for x in lines:
        if x == '':
            lines.remove(x)
    names = []
    for line in lines[:6]:
        doc = nlp_name(line)
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                names.append(ent.text)
    return names[0] if names else "Not Specified"

In [5]:
def clean_transcript(text: str) -> str:
    cleaned = re.sub(r"(?:Physician|Doctor|Patient):", "", text)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned

In [6]:
nlp_med_si = spacy.load("en_core_sci_lg")
nlp_med_av = spacy.load("en_ner_bc5cdr_md")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [7]:
SYMPTOM_DICT = {
    "headache", "migraine", "throbbing head pain", "sharp pain", "dull ache", "burning sensation", "stabbing pain",
    "nausea", "vomiting", "queasiness", "upset stomach",
    "dizziness", "lightheadedness", "vertigo", "spinning sensation",
    "chest pain", "pressure in chest", "tightness", "heartburn",
    "shortness of breath", "dyspnea", "breathlessness", "wheezing", "chest tightness", "difficulty breathing",
    "fatigue", "tiredness", "lack of energy", "exhaustion",
    "fever", "high temperature", "chills", "night sweats", "hot flashes", "cold sweats",
    "cough", "persistent cough", "dry cough", "productive cough", "hacking cough",
    "sore throat", "throat pain", "scratchy throat", "hoarseness",
    "runny nose", "nasal discharge", "stuffy nose", "congestion",
    "back pain", "lumbar pain", "lower back ache", "upper back pain",
    "neck pain", "cervical pain", "stiff neck",
    "abdominal pain", "stomach ache", "tummy ache", "cramping", "bloating", "gas",
    "muscle ache", "myalgia", "joint pain", "arthralgia",
    "rash", "skin eruption", "hives", "redness", "itching", "pruritus",
    "swelling", "edema", "inflammation", "puffiness",
    "tingling", "paresthesia", "pins and needles", "numbness",
    "bleeding", "hemorrhage", "nosebleed", "bruising",
    "insomnia", "sleep disturbance", "difficulty sleeping", "restlessness",
    "palpitations", "racing heart", "irregular heartbeat",
    "diarrhea", "loose stools", "constipation", "hard stools",
    "blurred vision", "double vision", "eye pain", "visual disturbance",
    "earache", "ringing in ears", "tinnitus", "hearing loss",
    "urinary frequency", "painful urination", "dysuria", "blood in urine",
    "weight loss", "weight gain", "loss of appetite", "increased appetite",
    "confusion", "memory loss", "difficulty concentrating", "brain fog"
}


TREATMENT_DICT = {
    "rest", "bed rest", "take it easy",
    "ice pack", "cold compress", "heat pack", "warm compress",
    "physical therapy", "physiotherapy sessions", "rehab", "exercise program",
    "surgery", "operative repair", "procedure", "surgical intervention", "minimally invasive surgery",
    "antibiotics", "amoxicillin", "doxycycline", "penicillin",
    "painkillers", "analgesics", "ibuprofen", "acetaminophen", "paracetamol", "naproxen",
    "steroids", "corticosteroids", "prednisone", "dexamethasone",
    "insulin therapy", "oral hypoglycemics", "metformin", "glipizide",
    "antihypertensives", "lisinopril", "amlodipine", "losartan",
    "chemotherapy", "radiation therapy", "immunotherapy",
    "inhaler", "bronchodilator", "albuterol", "salmeterol",
    "antidepressants", "SSRIs", "sertraline", "fluoxetine", "venlafaxine",
    "anticoagulants", "warfarin", "heparin", "rivaroxaban",
    "dietary changes", "low-salt diet", "gluten-free diet", "low-carb diet", "Mediterranean diet",
    "vaccination", "immunization", "flu shot", "tetanus shot",
    "wound dressing", "bandaging", "stitches", "sutures",
    "counseling", "psychotherapy", "cognitive behavioral therapy", "CBT",
    "hydration", "fluid intake", "electrolyte replacement", "IV fluids",
    "elevating the limb", "compression stockings", "massage",
    "applying ointment", "topical creams", "steroid creams", "antibiotic ointment",
    "breathing exercises", "pulmonary rehabilitation", "oxygen therapy",
    "diet modification", "exercise regimen", "weight management", "lifestyle changes",
    "mindfulness", "relaxation techniques", "meditation",
    "laparoscopic surgery", "endoscopy", "colonoscopy"
}


DIAGNOSIS_DICT = {
    "hypertension", "high blood pressure",
    "diabetes mellitus", "type 2 diabetes", "type II diabetes", "sugar diabetes",
    "myocardial infarction", "heart attack",
    "stroke", "cerebrovascular accident", "brain attack",
    "whiplash injury", "cervical strain", "neck sprain",
    "concussion", "mild traumatic brain injury", "head injury",
    "pneumonia", "lung infection", "chest infection",
    "urinary tract infection", "UTI", "bladder infection",
    "appendicitis", "inflammation of the appendix",
    "fracture", "broken bone", "hairline fracture",
    "sprain", "ligament tear", "twisted ankle",
    "gastroenteritis", "stomach flu", "food poisoning",
    "anemia", "low hemoglobin", "iron deficiency",
    "migraine disorder", "recurrent headaches",
    "osteoarthritis", "degenerative joint disease", "wear and tear arthritis",
    "depression", "major depressive disorder", "clinical depression",
    "anxiety disorder", "generalized anxiety", "panic disorder",
    "asthma", "reactive airway disease",
    "chronic obstructive pulmonary disease", "COPD", "emphysema",
    "bronchitis", "chest cold",
    "sinusitis", "sinus infection",
    "allergic rhinitis", "hay fever",
    "otitis media", "middle ear infection",
    "gastroesophageal reflux disease", "GERD", "acid reflux",
    "peptic ulcer disease", "stomach ulcer",
    "irritable bowel syndrome", "IBS",
    "kidney stones", "renal calculi",
    "prostatitis", "prostate infection",
    "arthritis", "rheumatoid arthritis", "gout",
    "osteoporosis", "brittle bones",
    "thyroid disorder", "hypothyroidism", "hyperthyroidism",
    "bipolar disorder", "manic depression",
    "schizophrenia", "psychotic disorder",
    "post-traumatic stress disorder", "PTSD",
    "skin cancer", "melanoma", "basal cell carcinoma",
    "hepatitis", "liver inflammation",
    "cirrhosis", "liver scarring",
    "fatty liver disease", "hepatic steatosis",
    "multiple sclerosis", "MS",
    "Parkinson's disease", "tremor disorder",
    "Alzheimer's disease", "dementia"
}


PROGNOSIS_DICT = {
    "full recovery expected", "complete recovery likely", "should make a full recovery",
    "partial recovery", "some residual symptoms possible", "may not fully resolve",
    "guarded prognosis", "uncertain outcome", "wait and see",
    "good prognosis", "favorable outcome", "positive outlook",
    "poor prognosis", "unfavorable outcome", "serious condition",
    "recovery expected within six months", "reconstructive healing time", "recovery in 2-3 weeks", "healing time of 4-6 weeks",
    "likely recurrence", "risk of relapse", "may come back",
    "chronic condition", "long-term management needed", "ongoing treatment required",
    "acute condition", "short-lived course", "temporary issue",
    "stable", "condition stable", "no change expected",
    "progressive", "worsening over time", "may deteriorate",
    "in remission", "disease in remission", "currently under control",
    "monitor regularly", "follow-up recommended", "keep an eye on it",
    "palliative care", "supportive management", "focus on comfort",
    "physiotherapy", "rehabilitation program", "exercise therapy",
    "expected to improve", "likely to get better", "should see improvement",
    "may require ongoing treatment", "long-term follow-up needed", "continued care necessary",
    "risk of complications", "potential for recurrence", "may need further intervention",
    "stable condition", "no immediate concerns", "holding steady",
    "full recovery anticipated", "partial recovery possible", "residual symptoms likely",
    "prognosis is excellent", "very good outlook", "should do well",
    "recovery timeline", "healing process", "expected duration",
    "short-term issue", "long-term condition", "permanent damage",
    "no long-term impact", "should not affect daily life", "minimal lasting effects",
    "10 weeks", "weeks", "months", "days", "years"
}

In [8]:
def extract_medical_spans(text: str):
    entities = []
    doc_si = nlp_med_si(text)
    doc_av = nlp_med_av(text)
    for ent in doc_si.ents:
        entities.append(ent.text)
    for ent in doc_av.ents:
        entities.append(ent.text)
    return entities

In [9]:
def map_to_bucket(span: str):
   
    buckets = {
        "Symptoms": SYMPTOM_DICT,
        "Treatment": TREATMENT_DICT,
        "Diagnosis": DIAGNOSIS_DICT,
        "Prognosis": PROGNOSIS_DICT,
    }
    best_bucket, best_match, best_score = None, None, 0
    for bucket, terms in buckets.items():
        match, score, _ = process.extractOne(span, terms, scorer=fuzz.token_sort_ratio)
        if score > best_score:
            best_bucket, best_match, best_score = bucket, match, score
   
    if best_score >= 70:
        return best_bucket, best_match, best_score
    return None, None, 0


In [10]:
def extract_medical_json(raw_text: str) -> dict:
   
    cleaned = clean_transcript(raw_text)
    patient_name = extract_patient_name(raw_text)
    spans = extract_medical_spans(cleaned)
  
    results = {
        "Patient_Name": patient_name,
        "Symptoms": set(),
        "Treatment": set(),
        "Diagnosis": set(),
        "Prognosis": set(),
    }

    for span in spans:
        bucket, match, score = map_to_bucket(span)
        if bucket:
            results[bucket].add(span)

    for k in ["Symptoms", "Treatment", "Diagnosis", "Prognosis"]:
        results[k] = list(results[k])
    for key,value in results.items():
        if key == "Patient_Name":
            continue
        else:
            if len(value) == 0:
                results[key] = "Not Specified"

    return results


In [11]:
result = extract_medical_json(demo_script)
print(json.dumps(result, indent=2))

{
  "Patient_Name": "Sarah",
  "Symptoms": [
    "condition",
    "persistent",
    "heart",
    "schedule",
    "concentrating",
    "severe",
    "everything",
    "racing"
  ],
  "Treatment": [
    "CBT",
    "SSRIs",
    "appointment",
    "talk therapy",
    "medication",
    "therapy",
    "Sertraline"
  ],
  "Diagnosis": [
    "mental anxiety",
    "Generalized Anxiety Disorder",
    "brain",
    "panic attacks",
    "disorder"
  ],
  "Prognosis": [
    "month",
    "moments",
    "months",
    "years"
  ]
}
