# Task 1: Data Exploration and Processing

Explore your specific dataset by:

- calculating basic statistics:
  - number of samples and number of samples per class (is your dataset balanced?)
  - min / avg / max length of text
- reading through 100+ samples and noting noteworthy style, vocabulary, and idioms.

In [15]:
from datasets import load_dataset
import pandas as pd

def load_medical_domain_dataset():
    """
    Fetch the "argilla/medical-domain" dataset via ``load_dataset``.

    On success a confirmation and the list of available splits are printed.

    Returns:
        The object returned by ``load_dataset``, or ``None`` when any
        exception is raised while loading (the error message is printed).
    """
    try:
        medical_data = load_dataset("argilla/medical-domain")
        print("Dataset loaded successfully!")
        split_names = list(medical_data.keys())
        print(f"Available splits: {split_names}")
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading dataset: {err}")
        return None

    return medical_data
    
# Load the dataset once; `dataset` is None when loading failed.
dataset = load_medical_domain_dataset()

Dataset loaded successfully!
Available splits: ['train']


In [None]:
def analyze_dataset_statistics(dataset):
    """
    Print size, text-length and class-distribution statistics for each split.

    For every split the function prints the number of samples, the feature
    names, the min / avg / max length of the ``text`` field, and how often
    each ``prediction`` label occurs (count and percentage relative to the
    number of samples in the split).

    Args:
        dataset: Mapping of split name to split data. Each split must support
            ``len()``, iteration over dict-like samples, and expose
            ``features.keys()``.

    Returns:
        None. All results are written to stdout.
    """
    # Function-scope import so the block does not rely on other imports.
    from collections import Counter

    for split_data in dataset.values():
        total_samples = len(split_data)

        # Display number of samples
        print(f"\nNumber of samples: {total_samples}")
        print(f"  Features: {list(split_data.features.keys())}")

        # Gather text lengths and label counts in one pass over the split
        # instead of iterating it twice.
        text_lengths = []
        class_counts = Counter()
        for sample in split_data:
            text = sample['text'] if 'text' in sample else None
            # A missing (None) text has no length; skip it rather than crash.
            if text is not None:
                text_lengths.append(len(text))

            predictions = sample['prediction'] if 'prediction' in sample else None
            if isinstance(predictions, list):
                for entry in predictions:
                    if 'label' in entry:
                        class_counts[entry['label']] += 1

        # Min, Avg, Max length of text (only when at least one text exists)
        if text_lengths:
            min_length = min(text_lengths)
            avg_length = sum(text_lengths) / len(text_lengths)
            max_length = max(text_lengths)
            print(f" Text length:\n     Min: {min_length}, Avg: {avg_length:.2f}, Max: {max_length}")

        print("\nAnalyzing class distribution:")

        # Display results; classes appear in order of first occurrence.
        if class_counts:
            print("\nClass distribution:")
            for class_name, count in class_counts.items():
                percentage = (count / total_samples) * 100
                print(f"  {class_name}: {count} samples ({percentage:.2f}%)")
        else:
            print("No prediction labels found in this split")

# Run the analysis only when `dataset` is truthy, i.e. not None and not empty.
if dataset:
    analyze_dataset_statistics(dataset)


Number of samples: 4966
  Features: ['text', 'inputs', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'multi_label', 'explanation', 'id', 'metadata', 'status', 'event_timestamp', 'metrics']
 Text length:
     Min: 11, Avg: 3052.314337494966, Max: 18425

Analyzing class distribution:

Class distribution:
   Gastroenterology: 224 samples (4.51%)
   Surgery: 1088 samples (21.91%)
   Radiology: 273 samples (5.50%)
   SOAP / Chart / Progress Notes: 166 samples (3.34%)
   Letters: 23 samples (0.46%)
   Lab Medicine - Pathology: 8 samples (0.16%)
   Consult - History and Phy.: 516 samples (10.39%)
   Podiatry: 47 samples (0.95%)
   General Medicine: 259 samples (5.22%)
   Psychiatry / Psychology: 53 samples (1.07%)
   Cardiovascular / Pulmonary: 371 samples (7.47%)
   Urology: 156 samples (3.14%)
   Ophthalmology: 83 samples (1.67%)
   Physical Medicine - Rehab: 21 samples (0.42%)
   Neurology: 223 samples (4.49%)
   Autopsy: 8 samples (0.16%)
   Orthopedic: 355 samples 

Establish a structured and flexible (configurable) processing pipeline with steps for:

- reading documents from file
- tokenizing
- normalizing (lowercase, lemmatize/stem, …)
- filtering (stop words, …)

Use the script clustering.py as a template for clustering your text.