In [None]:
!pip install transformers sentence-transformers langchain langchain-community langchain-groq langchain-openai spacy yake

In [None]:
import re
import spacy
import yake

from langchain import LLMChain, PromptTemplate
from langchain.llms.base import LLM
from langchain.chains import SequentialChain
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType
from langchain_core.output_parsers import JsonOutputParser
from transformers import pipeline
from langchain_openai import OpenAI
import json
import os
from google.colab import userdata
os.environ['OPENAI_API_KEY']=userdata.get('OPENAI_API_KEY')

In [2]:
nlp = spacy.load("en_core_web_sm")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
sentiment_pipeline = pipeline("sentiment-analysis")
kw_extractor = yake.KeywordExtractor(lan="en", n=3, top=10)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


In [18]:
transcript = """
Physician: Good morning, Ms. Jones. How are you feeling today?
Patient: Good morning, doctor. I’m doing better, but I still have some discomfort now and then.

Physician: I understand you were in a car accident last September. Can you walk me through what happened?
Patient: Yes, it was on September 1st, around 12:30 in the afternoon. I was driving from Cheadle Hulme to Manchester when I had to stop in traffic. Out of nowhere, another car hit me from behind, which pushed my car into the one in front.

Physician: That sounds like a strong impact. Were you wearing your seatbelt?
Patient: Yes, I always do.

Physician: What did you feel immediately after the accident?
Patient: At first, I was just shocked. But then I realized I had hit my head on the steering wheel, and I could feel pain in my neck and back almost right away.

Physician: Did you seek medical attention at that time?
Patient: Yes, I went to Moss Bank Accident and Emergency. They checked me over and said it was a whiplash injury, but they didn’t do any X-rays. They just gave me some advice and sent me home.

Physician: How did things progress after that?
Patient: The first four weeks were rough. My neck and back pain were really bad—I had trouble sleeping and had to take painkillers regularly. It started improving after that, but I had to go through ten sessions of physiotherapy to help with the stiffness and discomfort.

Physician: That makes sense. Are you still experiencing pain now?
Patient: It’s not constant, but I do get occasional backaches. It’s nothing like before, though.

Physician: That’s good to hear. Have you noticed any other effects, like anxiety while driving or difficulty concentrating?
Patient: No, nothing like that. I don’t feel nervous driving, and I haven’t had any emotional issues from the accident.

Physician: And how has this impacted your daily life? Work, hobbies, anything like that?
Patient: I had to take a week off work, but after that, I was back to my usual routine. It hasn’t really stopped me from doing anything.

Physician: That’s encouraging. Let’s go ahead and do a physical examination to check your mobility and any lingering pain.
[Physical Examination Conducted]
Physician: Everything looks good. Your neck and back have a full range of movement, and there’s no tenderness or signs of lasting damage. Your muscles and spine seem to be in good condition.

Patient: That’s a relief!
Physician: Yes, your recovery so far has been quite positive. Given your progress, I’d expect you to make a full recovery within six months of the accident. There are no signs of long-term damage or degeneration.

Patient: That’s great to hear. So, I don’t need to worry about this affecting me in the future?
Physician: That’s right. I don’t foresee any long-term impact on your work or daily life. If anything changes or you experience worsening symptoms, you can always come back for a follow-up. But at this point, you’re on track for a full recovery.

Patient: Thank you, doctor. I appreciate it.
Physician: You’re very welcome, Ms. Jones. Take care, and don’t hesitate to reach out if you need anything.
"""

In [None]:
def extract_medical_entities(text):
    doc = nlp(text)
    out = {"Symptoms": [], "Diagnosis": [], "Treatment": [], "Prognosis": []}
    patterns = {
        "Symptoms": ["neck pain", "back pain", "headache", "stiffness", "difficulty sleeping"],
        "Diagnosis": ["whiplash", "concussion"],
        "Treatment": ["physiotherapy", "painkillers", "analgesics"],
        "Prognosis": ["full recovery", "no long-term damage"]
    }
    lower = text.lower()
    for k, pats in patterns.items():
        for p in pats:
            if p in lower and p not in out[k]:
                out[k].append(p.title() if k=="Symptoms" else p)

    # detect numeric physiotherapy sessions
    m = re.findall(r"(\d+)\s+(?:physiotherapy|sessions)", text, flags=re.I)
    for val in m:
        out["Treatment"].append(f"{val} physiotherapy sessions")

    # regex fallback
    pain = re.findall(r"(neck|back|head)[^.,;\\n]{0,30}?pain|pain in (?:the )?(neck|back|head)", text, flags=re.I)
    for p in pain:
        if isinstance(p, tuple):
            part = next((x for x in p if x), None)
        else:
            part = p
        if part:
            out["Symptoms"].append(f"{part.capitalize()} pain")
    for k in out:
        out[k] = list(dict.fromkeys(out[k]))
    return out
NERTool = Tool(
    name="medical_ner",
    func=lambda text: extract_medical_entities(text),
    description="Extract Symptoms, Diagnosis, Treatment, Prognosis from transcript"
)

In [None]:
def summarize_text(text):
    words = text.split()
    if len(words) > 400:
        parts = [" ".join(words[i:i+400]) for i in range(0, len(words), 400)]
        sums = [summarizer(p, max_length=130, min_length=30)[0]["summary_text"] for p in parts]
        return " ".join(sums)
    return summarizer(text, max_length=150, min_length=30)[0]["summary_text"]

SummarizerTool = Tool(
    name="summarizer",
    func=lambda text: summarize_text(text),
    description="Return a concise medical summary of the transcript"
)

In [6]:
KeywordTool = Tool(
    name="keywords",
    func=lambda text: [k for k,score in kw_extractor.extract_keywords(text)],
    description="Extract key medical phrases"
)

In [None]:
def sentiment_intent(text):
    s = sentiment_pipeline(text)[0]
    label = s["label"]
    if label.lower() in ("positive", "neutral"):
        sentiment = "Reassured"
    else:
        sentiment = "Anxious"
    t = text.lower()
    if any(w in t for w in ["worry", "worried", "concerned", "nervous"]):
        intent = "Seeking reassurance"
    elif any(w in t for w in ["pain", "hurt", "ache", "stiff"]):
        intent = "Reporting symptoms"
    else:
        intent = "General"
    return {"Sentiment": sentiment, "Intent": intent}

SentimentTool = Tool(
    name="sentiment_intent",
    func=lambda text: sentiment_intent(text),
    description="Return sentiment and intent for patient utterance"
)


In [8]:
soap_template = """
You are a clinical assistant that converts provided notes into a SOAP note JSON.
Input:
{summary}

Entities: {entities}

IMPORTANT:
- Return ONLY valid JSON. No explanation, no headings, no markdown, no leading text.
- Top-level keys must be: "Subjective", "Objective", "Assessment", "Plan".
- If a field is empty, use an empty string or empty list.
"""
llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0,
)

In [None]:
def normalize_entities(ner):
    out = {}
    for k,v in ner.items():
        seen = set()
        lst = []
        for it in v:
            t = " ".join(it.strip().split())
            tnorm = t.lower()
            if tnorm not in seen:
                lst.append(t.title())
                seen.add(tnorm)
        out[k] = lst
    return out

def strip_speakers(text):
    lines = []
    for line in text.splitlines():
        if ":" in line:
            lines.append(line.split(":",1)[1].strip())
        else:
            lines.append(line)
    return "\n".join(lines)

clean_transcript = strip_speakers(transcript)
ner_raw = NERTool.func(clean_transcript)
ner = normalize_entities(ner_raw)

In [10]:
summary_text = summarize_text(clean_transcript)
entities_dict = ner
print("SUMMARY:\n", summary_text)
print("ENTITIES:\n", json.dumps(entities_dict, indent=2, ensure_ascii=False))

SUMMARY:
 Ms. Jones suffered a whiplash injury in a car accident last September. She had to go through ten sessions of physiotherapy to help with the stiffness and discomfort. The first four weeks were rough. There are no signs of long-term damage or degeneration. If anything changes, you can always come back for a follow-up. At this point, you’re on track for a full recovery.
ENTITIES:
 {
  "Symptoms": [
    "Back Pain",
    "Stiffness"
  ],
  "Diagnosis": [
    "Whiplash"
  ],
  "Treatment": [
    "Physiotherapy",
    "Painkillers"
  ],
  "Prognosis": [
    "Full Recovery"
  ]
}


In [11]:
soap_prompt = PromptTemplate(input_variables=["summary", "entities"], template=soap_template)
soap_chain = soap_prompt | llm | JsonOutputParser()
soap_inputs = {
    "summary": summary_text,
    "entities": json.dumps(entities_dict, ensure_ascii=False)
}

In [None]:
def strip_speakers(text):
    lines = []
    for line in text.splitlines():
        lines.append(line.split(":",1)[1].strip() if ":" in line else line)
    return "\n".join(lines)

In [13]:
clean_transcript = strip_speakers(transcript)
summary_text = SummarizerTool.func(clean_transcript)

In [None]:
def _extract_first_json(s):
    m = re.search(r"\{[\s\S]*\}", s)
    return m.group(0) if m else None

In [None]:
from transformers import pipeline
sentiment_fallback = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)


def extract_clean_keywords(clean_transcript, KeywordTool, top_n = 10):
    raw_kws = KeywordTool.func(clean_transcript)
    stopset = {"physician", "patient", "good", "morning", "today", "nt", "n"}
    def clean_kw(k):
        if not k:
            return ""
        # normalize curly quotes, remove punctuation, collapse spaces
        k = k.replace("’", "'").replace("‘", "'")
        k = re.sub(r"[^A-Za-z0-9\s']", " ", k)
        k = " ".join(k.split()).strip()
        k = k.replace("'", "")
        if len(k) <= 2:
            return ""
        return k
    keywords = []
    for k in raw_kws:
        kc = clean_kw(k).lower()
        if kc and kc not in stopset and kc not in (x.lower() for x in keywords):
            keywords.append(kc.title())
        if len(keywords) >= top_n:
            break
    return keywords

sentiment_prompt = PromptTemplate(
    input_variables=["utterance"],
    template="""
    You are a clinical NLP assistant. Analyze the patient's statement below:
    "{utterance}"

    Return a JSON:
    {"Sentiment": one of ["Anxious","Neutral","Reassured"],
     "Intent": one of ["Reporting symptoms","Seeking reassurance","Expressing gratitude","General"]}
    """
)

sentiment_chain = sentiment_prompt | llm | JsonOutputParser()
# sentiment extractor
def extract_patient_sentiments_llm(transcript):
    patient_utts = [
        line.split(":", 1)[1].strip()
        for line in transcript.splitlines()
        if line.strip().lower().startswith("patient:")
    ]
    sentiments = []
    for utt in patient_utts:
        try:
            s = sentiment_chain.invoke({"utterance": utt})
            if not isinstance(s, dict):
                raise ValueError("unexpected sentiment_chain output")
        except Exception:
            try:
                lbl = sentiment_fallback(utt)[0]["label"]
                s = {"Sentiment": "Reassured" if lbl.lower() in ("positive","neutral") else "Anxious",
                     "Intent": "Reporting symptoms"}
            except Exception:
                s = {"Sentiment": "Neutral", "Intent": "General"}
        if not sentiments or s != sentiments[-1]:
            sentiments.append(s)
    return sentiments




config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [16]:
patient_sentiments = extract_patient_sentiments_llm(transcript)
patient_sentiments

[{'Sentiment': 'Anxious', 'Intent': 'Reporting symptoms'},
 {'Sentiment': 'Reassured', 'Intent': 'Reporting symptoms'},
 {'Sentiment': 'Anxious', 'Intent': 'Reporting symptoms'},
 {'Sentiment': 'Reassured', 'Intent': 'Reporting symptoms'}]

In [None]:
def run_pipeline(transcript, patient_name="Unknown"):
    patient_sentiments = extract_patient_sentiments_llm(transcript)

    # clean transcript for summarizer / keywords / NER
    clean_transcript = strip_speakers(transcript)

    # NER on cleaned transcript + normalize
    ner_raw = NERTool.func(clean_transcript)
    ner = normalize_entities(ner_raw)

    # Summarize cleaned transcript
    summary_text = SummarizerTool.func(clean_transcript)

    # Keywords from cleaned transcript
    keywords = extract_clean_keywords(clean_transcript, KeywordTool)

    # Build SOAP input
    entities_json = json.dumps(ner, ensure_ascii=False)
    soap_input = {"summary": summary_text, "entities": entities_json}

    # Invoke soap_chain
    raw = None
    try:
        raw = soap_chain.invoke(soap_input)
    except Exception:
        try:
            if hasattr(soap_chain, "run"):
                raw = soap_chain.run(**soap_input)
        except Exception:
            prompt_text = soap_prompt.format(summary=summary_text, entities=entities_json)
            raw = llm(prompt_text)

    print("DEBUG SOAP RAW:", repr(raw))

    # normalize into dict
    if isinstance(raw, dict):
        soap = raw
    else:
        raw_text = "" if raw is None else (raw if isinstance(raw, str) else str(raw))
        if raw_text.strip().lower() in ("", "none"):
            soap = {"raw": raw_text, "_validation_missing_keys": ["Subjective","Objective","Assessment","Plan"]}
        else:
            m = re.search(r"\{[\s\S]*\}", raw_text)
            if m:
                try:
                    soap = json.loads(m.group(0))
                except Exception:
                    soap = {"raw": raw_text}
            else:
                soap = {"raw": raw_text}

    # Light non-fatal validation
    if isinstance(soap, dict):
        missing = [k for k in ("Subjective","Objective","Assessment","Plan") if k not in soap]
        if missing:
            soap["_validation_missing_keys"] = missing

    return {
        "Patient_Name": patient_name,
        "Entities": ner,
        "Summary": summary_text,
        "Keywords": keywords,
        "Patient_Sentiments": patient_sentiments,
        "SOAP": soap
    }

In [19]:
out = run_pipeline(transcript, patient_name="Janet Jones")
print(json.dumps(out, indent=2))

DEBUG SOAP RAW: {'Subjective': {'Chief Complaint': 'Ms. Jones suffered a whiplash injury in a car accident last September.', 'History of Present Illness': 'She had to go through ten sessions of physiotherapy to help with the stiffness and discomfort. The first four weeks were rough.'}, 'Objective': {'Symptoms': ['Back Pain', 'Stiffness'], 'Diagnosis': ['Whiplash'], 'Treatment': ['Physiotherapy', 'Painkillers'], 'Prognosis': ['Full Recovery']}, 'Assessment': 'No signs of long-term damage or degeneration. On track for a full recovery.', 'Plan': 'If anything changes, you can always come back for a follow-up.'}
{
  "Patient_Name": "Janet Jones",
  "Entities": {
    "Symptoms": [
      "Back Pain",
      "Stiffness"
    ],
    "Diagnosis": [
      "Whiplash"
    ],
    "Treatment": [
      "Physiotherapy",
      "Painkillers"
    ],
    "Prognosis": [
      "Full Recovery"
    ]
  },
  "Summary": "Ms. Jones suffered a whiplash injury in a car accident last September. She had to go through t