In [None]:
from datasets import load_dataset

# Load the "adesouza1/soap_notes" dataset; `ds` is indexed by split name below (e.g. ds['train']).
ds = load_dataset("adesouza1/soap_notes")

In [3]:
# Display the splits with their column names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['age', 'patient_name', 'doctor_data', 'gender', 'dob', 'phone', 'person_data', 'health_problem', 'patient_convo', 'soap_notes', 'doctor_name', 'address', 'full_patient_data'],
        num_rows: 558
    })
    test: Dataset({
        features: ['age', 'patient_name', 'doctor_data', 'gender', 'dob', 'phone', 'person_data', 'health_problem', 'patient_convo', 'soap_notes', 'doctor_name', 'address', 'full_patient_data'],
        num_rows: 62
    })
})

In [15]:
# Field names available on the first training record.
ds['train'][0].keys()

dict_keys(['age', 'patient_name', 'doctor_data', 'gender', 'dob', 'phone', 'person_data', 'health_problem', 'patient_convo', 'soap_notes', 'doctor_name', 'address', 'full_patient_data'])

In [48]:
# 'health_problem' value of the first training record.
ds['train'][0]['health_problem']

'Joint Pain'

In [18]:
# 'full_patient_data' text of the first training record.
ds['train'][0]['full_patient_data']

'\n\n\nDEMOGRAPHICS:\nName: Zack Fields\nAge: 28\nAddress: 6104 Parker Rd, Minneapolis, Georgia 60074\nPhone: (839) 255-2098\n\nPROBLEM LIST:\n1. Chronic: Joint Pain (Right Knee and Left Ankle)\n2. Other: Seasonal Allergies\n\nHEALTH MAINTENANCE:\nVaccines and Screenings:\n- Tdap: 08/14/2022 (overdue)\n- Influenza: 10/02/2022 (due soon)\n- Colonoscopy: 09/15/2023 (not due yet)\n\nREMINDERS AND RESULTS:\n- Reminder: Tdap booster due\n- Reminder: Schedule annual physical\n- Result: Lipid Panel (09/15/2022): Normal\n\nCARE TEAM AND COMMUNICATION:\nCare Team:\n- Dr. Kristen Kelly, Primary Care Physician\n- Dr. Rebecca Nguyen, Orthopedic Surgeon\n- Sarah Johnson, Nurse Practitioner\n- John Smith, Physical Therapist\n\nCommunication:\n- Preferred Method: Email\n- Emergency Contact: Jane Fields, Mother, (555) 123-4567\n\nALLERGIES:\n- Penicillin\n\nMEDICATIONS:\n- Ibuprofen 600mg, PRN, every 4-6 hours as needed for pain\n- Loratadine 10mg, daily for allergies\n\nIMMUNIZATIONS:\n- Tdap: 08/14/

In [16]:
# 'patient_convo' text (physician/patient dialogue) of the first training record.
ds['train'][0]['patient_convo']

"\nPhysician: Good morning, Zack. How are you feeling today?\n\nPatient: Hi, Doc. I've been feeling pretty lousy lately. I've got this constant pain in my right knee and left ankle that just won't go away. It's like a dull ache that's always there, no matter what I'm doing.\n\nPhysician: I see. Can you tell me more about the pain? Is it sharp, dull, throbbing? Does it feel like it's affecting your ability to move around?\n\nPatient: It's definitely dull and achy. It's not excruciating, but it's definitely noticeable. It's like there's pressure on my joints all the time. Yeah, it's definitely affecting my mobility. I've been avoiding stairs and stuff because it just hurts too much.\n\nPhysician: Okay, that helps. Have you noticed any swelling or redness in the affected areas?\n\nPatient: Not really. I mean, sometimes my knee will get a little puffy if I've been walking or standing for a while, but it's not like it's massively swollen or anything.\n\nPhysician: Got it. And how long have 

In [17]:
# 'soap_notes' text of the first training record.
ds['train'][0]['soap_notes']

"Subjective:\nZack Fields reports experiencing constant pain in his right knee and left ankle for the past six months. He describes the pain as a dull ache that is always present and affects his mobility, particularly when walking or standing for extended periods. He notes that the pain improves when he is sitting or lying down. Zack has been taking ibuprofen for pain relief, but it provides only minimal relief and occasionally causes stomach discomfort.\n\nObjective:\nDuring the examination, Zack appeared uncomfortable when moving his right knee and left ankle. There were no signs of swelling or redness in the affected joints. Zack's range of motion in both joints was limited due to pain. He is currently taking ibuprofen 600mg as needed for pain relief. Zack's mother expressed concerns about his mental health due to the impact of the pain on his daily activities.\n\nAssessment:\nBased on Zack's symptoms and history, the likely diagnosis is osteoarthritis in the right knee and left ank

In [1]:
# src/data_loader.py

from datasets import load_dataset
from typing import Dict, List

class SOAPDataLoader:
    """Load a SOAP-notes dataset and hand out examples as plain dicts.

    The dataset is loaded once in the constructor and stored on
    ``self.dataset``; ``get_examples`` reads a split from it and keeps only
    the fields listed in ``_FIELDS``.
    """

    # Columns copied from every dataset row into the returned dicts.
    _FIELDS = ('patient_convo', 'soap_notes', 'full_patient_data', 'health_problem')

    def __init__(self, dataset_name: str = "adesouza1/soap_notes"):
        """Load ``dataset_name`` with ``load_dataset`` and keep it on the instance."""
        self.dataset = load_dataset(dataset_name)

    def get_examples(self, split: str = "train", n: int = None) -> List[Dict]:
        """Get n examples from dataset

        Args:
            split: Key of the split to read from ``self.dataset``.
            n: Maximum number of examples, taken from the start of the split.
                ``None`` (the default) returns the whole split; ``0`` or a
                negative value returns an empty list.

        Returns:
            A list with one dict per example, each holding the keys
            'patient_convo', 'soap_notes', 'full_patient_data' and
            'health_problem'.
        """
        data = self.dataset[split]
        # Test against None explicitly: a plain truthiness check treats n=0
        # as "no limit" and would return the entire split.
        if n is not None:
            if n <= 0:
                return []
            data = data.select(range(min(n, len(data))))

        return [{field: ex[field] for field in self._FIELDS} for ex in data]

In [3]:
# src/evals/structure_eval.py

import re

class StructureEvaluator:
    """Check if SOAP note has all required sections

    A section counts as present when its name starts a line (optionally
    indented), is optionally followed by a colon, and the rest of that line
    is blank. Matching is case-insensitive. A section's text runs from its
    header to the next line that starts with one of the required section
    names, or to the end of the note.
    """

    REQUIRED_SECTIONS = ['Subjective', 'Objective', 'Assessment', 'Plan']

    def evaluate(self, generated_note: str) -> dict:
        """Report which required sections a note contains and how long they are.

        Args:
            generated_note: Text of the note. ``None`` or an empty string is
                treated as a note with no sections.

        Returns:
            A dict with:
            - 'has_all_sections': True when every required section was found.
            - 'missing_sections': names of the required sections not found,
              in ``REQUIRED_SECTIONS`` order.
            - 'section_lengths': whitespace-separated word count for each
              section that was found.
        """
        note = generated_note or ""
        results = {
            'has_all_sections': True,
            'missing_sections': [],
            'section_lengths': {}
        }

        for section in self.REQUIRED_SECTIONS:
            if self._header_regex(section).search(note) is None:
                results['has_all_sections'] = False
                results['missing_sections'].append(section)
            else:
                # Measure section length
                section_text = self._extract_section(note, section)
                results['section_lengths'][section] = len(section_text.split())

        return results

    def _extract_section(self, note: str, section: str) -> str:
        """Extract text from a specific section

        Returns the stripped text between the section's header and the next
        required-section header (or the end of the note), or "" when the
        header is not present.
        """
        header = self._header_regex(section).search(note)
        if header is None:
            return ""
        # Only another SOAP section header ends the section. Stopping at any
        # "word:" line would cut the text at in-section labels such as
        # "BP:" or "Medications:".
        boundary = self._boundary_regex().search(note, header.end())
        stop = boundary.start() if boundary else len(note)
        return note[header.end():stop].strip()

    @staticmethod
    def _header_regex(section: str):
        """Compile the pattern for ``section``'s header line.

        Anchored to the start of a line so that prose ending in the section
        name (e.g. a bullet ending in "plan") is not taken for a header.
        """
        return re.compile(
            rf'^[ \t]*{re.escape(section)}:?\s*\n',
            re.IGNORECASE | re.MULTILINE,
        )

    def _boundary_regex(self):
        """Compile the pattern for a line that begins any required section."""
        names = '|'.join(re.escape(name) for name in self.REQUIRED_SECTIONS)
        # A name followed by a colon, or a name alone on its line.
        return re.compile(
            rf'^[ \t]*(?:{names})(?::|\s*\n)',
            re.IGNORECASE | re.MULTILINE,
        )

In [4]:
# Take a small sample from the default (train) split
loader = SOAPDataLoader()
examples = loader.get_examples(n=10)  # Start with 10

# Structure check over the reference notes
evaluator = StructureEvaluator()

for i, example in enumerate(examples):
    result = evaluator.evaluate(example['soap_notes'])
    # Build the per-example report first, then emit it in one write.
    report = [
        f"\n--- Example {i} ---",
        f"All sections present: {result['has_all_sections']}",
    ]
    if result['missing_sections']:
        report.append(f"Missing: {result['missing_sections']}")
    report.append(f"Section lengths: {result['section_lengths']}")
    print("\n".join(report))


--- Example 0 ---
All sections present: True
Section lengths: {'Subjective': 74, 'Objective': 68, 'Assessment': 57, 'Plan': 71}

--- Example 1 ---
All sections present: True
Section lengths: {'Subjective': 44, 'Objective': 42, 'Assessment': 34, 'Plan': 32}

--- Example 2 ---
All sections present: True
Section lengths: {'Subjective': 52, 'Objective': 44, 'Assessment': 43, 'Plan': 74}

--- Example 3 ---
All sections present: True
Section lengths: {'Subjective': 50, 'Objective': 56, 'Assessment': 57, 'Plan': 42}

--- Example 4 ---
All sections present: True
Section lengths: {'Subjective': 79, 'Objective': 47, 'Assessment': 59, 'Plan': 97}

--- Example 5 ---
All sections present: True
Section lengths: {'Subjective': 69, 'Objective': 47, 'Assessment': 48, 'Plan': 98}

--- Example 6 ---
All sections present: True
Section lengths: {'Subjective': 38, 'Objective': 55, 'Assessment': 51, 'Plan': 57}

--- Example 7 ---
All sections present: True
Section lengths: {'Subjective': 57, 'Objective': 56

In [7]:
import medspacy 

In [8]:
# Build the medspaCy pipeline once; later cells call `nlp(...)` on transcript and note text.
nlp = medspacy.load() 

In [13]:
# NOTE(review): `example` is only assigned to examples[1] in a later cell. On a
# top-to-bottom run this cell sees the last item left over from the evaluation
# loop above instead, so the record shown here may differ - confirm the order.
example['patient_convo']

"\n\nPhysician: Good morning, Mr. Diaz. How are you feeling today?\n\nPatient: Hi, Dr. Burns. I've been feeling pretty lousy lately. I've had this constant heartburn that doesn't seem to go away, even when I take my medication. And sometimes, I get this sharp pain in my stomach that makes me feel like I'm going to throw up.\n\nPhysician: I see. Have you noticed any other symptoms? Maybe some bloating or abdominal discomfort?\n\nPatient: Yeah, I do get bloated a lot, and sometimes I have this feeling of fullness that lasts for hours. And the abdominal pain can be pretty intense. It's like someone is stabbing me with a knife.\n\nPhysician: I understand. Have you had any changes in your bowel movements? Maybe some diarrhea or constipation?\n\nPatient: Well, I have had some diarrhea lately, but it's not always there. And sometimes, I feel like I can't go to the bathroom even when I need to. It's like my body is just not responding.\n\nPhysician: Okay, that's helpful. Have you noticed any o

In [9]:
example = examples[1]
# logging.getLogger("PyRuSH").setLevel(logging.WARNING)
# The example dicts have no 'soap_note' key (the note lives under 'soap_notes'),
# so that lookup raised KeyError. The transcript to parse is the
# physician/patient dialogue stored under 'patient_convo'.
transcript = nlp(example['patient_convo'])

[32m2025-11-04 18:03:29.148[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 1 'Physician' marked as sentence start (span begin)[0m
[32m2025-11-04 18:03:29.148[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 9 'How' marked as sentence start (span end next token)[0m
[32m2025-11-04 18:03:29.149[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 9 'How' marked as sentence start (span begin)[0m
[32m2025-11-04 18:03:29.149[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=0] [doc 0] Token 15 '

' marked as sentence start (span end whitespace)[0m
[32m2025-11-04 18:03:29.150[0m | [34m[1mDEBUG   [0m

In [46]:
# Run the same `nlp` pipeline over the SOAP note of the current example.
note_doc = nlp(example['soap_notes'])

[32m2025-11-04 16:58:04.037[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=7] [doc 0] Token 0 'Subjective' marked as sentence start (span begin)[0m
[32m2025-11-04 16:58:04.038[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=7] [doc 0] Token 2 '
' marked as sentence start (span end whitespace)[0m
[32m2025-11-04 16:58:04.039[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=7] [doc 0] GAP DETECTED: tokens 2-2 (idx 11-11) between spans 11-12[0m
[32m2025-11-04 16:58:04.040[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=7] [doc 0] Token 2 '
' marked as sentence start (whitespace in gap between spans)[0m
[32m2025-11-04 16:58:04.040[0m | [34m[

In [10]:
# Collect the text of each entity found on the parsed transcript.
transcript_entities = []
for ent in transcript.ents:
    transcript_entities.append(ent.text)
transcript_entities

[]

In [None]:
# Uninstall both
!pip uninstall numpy spacy -y

# Reinstall with compatible versions
!pip install numpy==1.24.3
!pip install spacy==3.7.2

# Now download the model
! python -m spacy download en_core_web_sm

In [47]:
# Display the example dict currently bound to `example`.
example

{'patient_convo': "\n\nPhysician: Good morning, Mr. Diaz. How are you feeling today?\n\nPatient: Hi, Dr. Burns. I've been feeling pretty lousy lately. I've had this constant heartburn that doesn't seem to go away, even when I take my medication. And sometimes, I get this sharp pain in my stomach that makes me feel like I'm going to throw up.\n\nPhysician: I see. Have you noticed any other symptoms? Maybe some bloating or abdominal discomfort?\n\nPatient: Yeah, I do get bloated a lot, and sometimes I have this feeling of fullness that lasts for hours. And the abdominal pain can be pretty intense. It's like someone is stabbing me with a knife.\n\nPhysician: I understand. Have you had any changes in your bowel movements? Maybe some diarrhea or constipation?\n\nPatient: Well, I have had some diarrhea lately, but it's not always there. And sometimes, I feel like I can't go to the bathroom even when I need to. It's like my body is just not responding.\n\nPhysician: Okay, that's helpful. Have