In [1]:
from datasets import load_dataset

# Load the dataset by its identifier
DATASET_ID = "argilla/medical-domain"
dataset = load_dataset(DATASET_ID)

# Print the loaded object to see its splits, feature names and row counts
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'inputs', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'multi_label', 'explanation', 'id', 'metadata', 'status', 'event_timestamp', 'metrics'],
        num_rows: 4966
    })
})


In [2]:
import pandas as pd
import numpy as np

# Pull the two columns of interest out of the train split
train_split = dataset['train']
texts = train_split['text']
labels = train_split['prediction']

# Put them side by side in a DataFrame for easier handling
df = pd.DataFrame({"text": texts, "label": labels})


In [3]:
# Show the first ten raw label entries as a plain Python list
first_raw_labels = df['label'].head(10)
first_raw_labels.tolist()

[[{'label': ' Gastroenterology', 'score': 1.0}],
 [{'label': ' Surgery', 'score': 1.0}],
 [{'label': ' Radiology', 'score': 1.0}],
 [{'label': ' Surgery', 'score': 1.0}],
 [{'label': ' SOAP / Chart / Progress Notes', 'score': 1.0}],
 [{'label': ' Letters', 'score': 1.0}],
 [{'label': ' Surgery', 'score': 1.0}],
 [{'label': ' Radiology', 'score': 1.0}],
 [{'label': ' Lab Medicine - Pathology', 'score': 1.0}],
 [{'label': ' Surgery', 'score': 1.0}]]

In [4]:
def extract_label(prediction_list):
    """Return the cleaned label of the first prediction, or ``None``.

    A prediction entry is expected to look like
    ``[{'label': ' Surgery', 'score': 1.0}]``; only the first element of the
    list is consulted.

    Args:
        prediction_list: A list of prediction dicts, each carrying a
            ``'label'`` key.

    Returns:
        The first entry's label with surrounding whitespace stripped, or
        ``None`` when the input is not a non-empty list, when the first entry
        is not a dict with a ``'label'`` key, or when the label is not a
        string.
    """
    if not isinstance(prediction_list, list) or not prediction_list:
        return None

    first = prediction_list[0]
    if not isinstance(first, dict):
        return None

    raw_label = first.get('label')
    # A None / non-string label would raise AttributeError on .strip();
    # treat it as "no label" so one malformed record cannot abort the
    # whole list comprehension that builds the label column.
    if not isinstance(raw_label, str):
        return None

    return raw_label.strip()  # remove surrounding whitespace (e.g. ' Surgery')

# Rebuild the frame so each row carries one plain string label
clean_labels = list(map(extract_label, labels))
df = pd.DataFrame({"text": texts, "label": clean_labels})

# Show the first ten cleaned labels as a plain Python list
df['label'].head(10).tolist()

['Gastroenterology',
 'Surgery',
 'Radiology',
 'Surgery',
 'SOAP / Chart / Progress Notes',
 'Letters',
 'Surgery',
 'Radiology',
 'Lab Medicine - Pathology',
 'Surgery']

In [6]:

# --- 1. Basic info ---
n_samples = len(df)
n_labels = df['label'].nunique()
print(f"Total samples: {n_samples}")
print(f"Unique labels: {n_labels}")

# Number of rows for each class, most frequent first
label_counts = df['label'].value_counts()
print("\nSamples per class:")
print(label_counts)

# Share of each class as a percentage of all rows, rounded to two decimals
label_share_pct = label_counts / n_samples * 100
print("\nDataset balance:")
print(label_share_pct.round(2))




Total samples: 4966
Unique labels: 40

Samples per class:
label
Surgery                          1088
Consult - History and Phy.        516
Cardiovascular / Pulmonary        371
Orthopedic                        355
Radiology                         273
General Medicine                  259
Gastroenterology                  224
Neurology                         223
SOAP / Chart / Progress Notes     166
Urology                           156
Obstetrics / Gynecology           155
Discharge Summary                 108
ENT - Otolaryngology               96
Neurosurgery                       94
Hematology - Oncology              90
Ophthalmology                      83
Nephrology                         81
Emergency Room Reports             75
Pediatrics - Neonatal              70
Pain Management                    61
Psychiatry / Psychology            53
Office Notes                       50
Podiatry                           47
Dermatology                        29
Cosmetic / Plastic Surge

In [None]:
# --- 2. Text length statistics ---
def count_words(value):
    """Return the number of whitespace-separated tokens in ``str(value)``."""
    return len(str(value).split())


# Word count per document, stored as a new 'length' column on df
df['length'] = df['text'].apply(count_words)

word_counts = df['length']
print("\nText length statistics:")
print(f"Min: {word_counts.min()}")
print(f"Avg: {word_counts.mean():.2f}")
print(f"Max: {word_counts.max()}")



Text length statistics:
Min: 1
Avg: 465.45
Max: 3029

Sample examples:

- PREOPERATIVE DIAGNOSIS:,  Iron deficiency anemia.,POSTOPERATIVE DIAGNOSIS:,  Diverticulosis.,PROCEDURE:,  Colonoscopy.,MEDICATIONS: , MAC.,PROCEDURE: , The Olympus pediatric variable colonoscope was i 

- CLINICAL INDICATION:  ,Normal stress test.,PROCEDURES PERFORMED:,1.  Left heart cath.,2.  Selective coronary angiography.,3.  LV gram.,4.  Right femoral arteriogram.,5.  Mynx closure device.,PROCEDURE 

- FINDINGS:,Axial scans were performed from L1 to S2 and reformatted images were obtained in the sagittal and coronal planes.,Preliminary scout film demonstrates anterior end plate spondylosis at T11-12 

- PREOPERATIVE DIAGNOSIS: , Blood loss anemia.,POSTOPERATIVE DIAGNOSES:,1.  Diverticulosis coli.,2.  Internal hemorrhoids.,3.  Poor prep.,PROCEDURE PERFORMED:,  Colonoscopy with photos.,ANESTHESIA: , Co 

- REASON FOR VISIT:  ,Elevated PSA with nocturia and occasional daytime frequency.,HISTORY: , A 68-year-old

In [10]:

# --- 3. Example inspection ---

# how many texts to preview for each class (adjust if needed)
examples_per_class = 3

# visit every class in sorted order, ignoring rows that have no label
class_names = sorted(df['label'].dropna().unique())

for label_name in class_names:
    print(f"\n{'='*20} {label_name} {'='*20}")
    is_this_class = df['label'] == label_name
    subset = df[is_this_class]

    # never request more rows than the class has; the fixed seed keeps the picks repeatable
    n_to_show = min(len(subset), examples_per_class)
    sample_texts = subset['text'].sample(n_to_show, random_state=42)

    for i, text in enumerate(sample_texts, start=1):
        # flatten newlines, then keep only the first 400 characters for readability
        single_line = str(text).replace('\n', ' ')
        preview = single_line[:400]
        print(f"\nExample {i}:\n{preview} ...")





Example 1:
HISTORY:,  I had the pleasure of meeting and evaluating the patient referred today for evaluation and treatment of chronic sinusitis.  As you are well aware, she is a pleasant 50-year-old female who states she started having severe sinusitis about two to three months ago with facial discomfort, nasal congestion, eye pain, and postnasal drip symptoms.  She states she really has sinus problems, but  ...

Example 2:
SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not ...

Example 3:
HISTORY: , A 34-year-old male presents today self-referred at the recommendation of Emergency Room physicians and his nephrologist to pursue further alle