In [3]:
def extract_tokens_from_radgraph(text_list, radgraph_model=None):
    """Run RadGraph over ``text_list`` and return all entity tokens as one flat list.

    Args:
        text_list: List of report strings, passed to the model in a single call.
        radgraph_model: Optional callable that takes the list of texts and
            returns the annotation dict. When omitted, a default ``RadGraph()``
            is constructed (the previous behaviour). Supplying an already
            constructed model avoids building a new one on every call.

    Returns:
        A list of words obtained by whitespace-splitting the ``'tokens'`` field
        of every entity of every annotated document, in iteration order.
    """
    if radgraph_model is None:
        # Imported lazily so the dependency is only required when no model is supplied.
        from radgraph import RadGraph

        radgraph_model = RadGraph()

    # Annotate all input texts in one call
    annotations = radgraph_model(text_list)

    all_tokens = []
    for doc_data in annotations.values():
        for entity_data in doc_data['entities'].values():
            # An entity's text may span several words; emit one token per word.
            all_tokens.extend(entity_data['tokens'].split())

    return all_tokens

# Demo: extract the entity tokens of a single short report
text = [
    "no evidence of acute cardiopulmonary process moderate hiatal hernia",
]
token_list = extract_tokens_from_radgraph(text)
print(token_list)

model_type not provided, defaulting to radgraph-xl
['acute', 'cardiopulmonary', 'process', 'moderate', 'hiatal', 'hernia']


In [2]:
import json
from datasets import Dataset, DatasetDict

# Location of the annotation file
file_path = "/home/ghan/R2Gen/data/mimic_cxr/annotation_label.json"

# Read the whole JSON document into memory
with open(file_path) as f:
    raw_data = json.load(f)

# Inspect the top-level keys
print("顶级键：", raw_data.keys())

# Report how many samples the train split holds
print("Train 样本数量：", len(raw_data['train']))

# Build one Dataset per top-level key and bundle them into a DatasetDict
dataset_dict = {
    split: Dataset.from_list(records) for split, records in raw_data.items()
}
dataset = DatasetDict(dataset_dict)

# Print the overview
print("数据集概览：", dataset)

# Preview the first training sample (prints nothing when the split is empty)
for i in range(min(1, len(dataset['train']))):
    print(f"样本 {i+1}: {dataset['train'][i]}")

顶级键： dict_keys(['train', 'val', 'test'])
Train 样本数量： 270790
数据集概览： DatasetDict({
    train: Dataset({
        features: ['id', 'study_id', 'subject_id', 'report', 'image_path', 'split', 'label'],
        num_rows: 270790
    })
    val: Dataset({
        features: ['id', 'study_id', 'subject_id', 'report', 'image_path', 'split', 'label'],
        num_rows: 2130
    })
    test: Dataset({
        features: ['id', 'study_id', 'subject_id', 'report', 'image_path', 'split', 'label'],
        num_rows: 3858
    })
})
样本 1: {'id': '02aa804e-bde0afdd-112c0b34-7bc16630-4e384014', 'study_id': 50414267, 'subject_id': 10000032, 'report': 'There is no focal consolidation, pleural effusion or pneumothorax.  Bilateral\n nodular opacities that most likely represent nipple shadows. The\n cardiomediastinal silhouette is normal.  Clips project over the left lung,\n potentially within the breast. The imaged upper abdomen is unremarkable.\n Chronic deformity of the posterior left sixth and seventh ribs ar

In [6]:
import json
import os
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from radgraph import RadGraph
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_report(report_text, radgraph_model):
    """Annotate one report and return its entity tokens, one word per list item.

    The report is wrapped in a single-element list before being handed to
    ``radgraph_model``. If the result is falsy or has no ``'0'`` key, an empty
    list is returned.
    """
    result = radgraph_model([report_text])

    # Nothing usable came back for the only document in the batch
    if not result or '0' not in result:
        return []

    # Entity text can contain several words; flatten to individual words
    return [
        word
        for entity in result['0']['entities'].values()
        for word in entity['tokens'].split()
    ]

def process_dataset_with_radgraph(dataset_dict, num_workers=8):
    """Add RadGraph entity tokens to every sample of every split.

    Args:
        dataset_dict: Mapping of split name to an iterable of sample dicts.
            Each sample must provide a ``'report'`` entry.
        num_workers: Maximum number of threads in the worker pool.

    Returns:
        A dict mapping each split name to a list of sample copies with an
        added ``'tokens'`` entry. The list keeps the order of the input split.
        A sample whose processing raised gets an empty token list and the
        exception is printed.
    """
    # One model instance is shared by all worker threads
    radgraph = RadGraph()

    processed_data = {}

    for split_name, split_dataset in dataset_dict.items():
        print(f"Processing {split_name} split...")
        all_samples = list(split_dataset)

        # Pre-sized so each result lands at its sample's original position.
        # Appending in completion order would scramble the split on every run.
        processed_split = [None] * len(all_samples)

        # The context managers close the bar and shut the pool down even if an
        # unexpected error escapes the loop.
        with tqdm(total=len(all_samples), desc=f"Processing {split_name}") as pbar:
            with ThreadPoolExecutor(max_workers=num_workers) as executor:
                # Remember which input position every future belongs to
                future_to_index = {
                    executor.submit(process_report, sample['report'], radgraph): index
                    for index, sample in enumerate(all_samples)
                }

                # Consume in completion order so the progress bar advances
                # as soon as any report is done
                for future in as_completed(future_to_index):
                    index = future_to_index[future]
                    new_sample = all_samples[index].copy()
                    try:
                        new_sample['tokens'] = future.result()
                    except Exception as exc:
                        print(f"Sample processing generated an exception: {exc}")
                        # Keep the sample, but with empty tokens
                        new_sample['tokens'] = []
                    processed_split[index] = new_sample
                    pbar.update(1)

        processed_data[split_name] = processed_split

    return processed_data

# Script entry point
if __name__ == "__main__":
    # Input annotations and destination for the token-augmented copy
    file_path = "/home/ghan/R2Gen/data/mimic_cxr/annotation_label.json"
    output_path = "/home/ghan/R2Gen/data/mimic_cxr/annotation_label_with_tokens.json"

    # Size of the thread pool; tune to the machine
    num_workers = 8

    # Read the source JSON
    print("Loading dataset...")
    with open(file_path) as f:
        raw_data = json.load(f)

    # One Dataset per top-level key, bundled into a DatasetDict
    dataset_dict = {
        split: Dataset.from_list(records) for split, records in raw_data.items()
    }
    dataset = DatasetDict(dataset_dict)

    # Annotate every report using the thread pool
    print(f"Processing reports with RadGraph using {num_workers} threads...")
    processed_data = process_dataset_with_radgraph(dataset, num_workers)

    # Write the augmented samples to the output file
    print(f"Saving processed data to {output_path}...")
    with open(output_path, 'w') as f:
        json.dump(processed_data, f)

    print("Done!")

Loading dataset...
Processing reports with RadGraph using 8 threads...
model_type not provided, defaulting to radgraph-xl
Processing train split...


Processing train: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 270790/270790 [5:58:58<00:00, 12.57it/s]


Processing val split...


Processing val: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2130/2130 [01:25<00:00, 25.05it/s]


Processing test split...


Processing test: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3858/3858 [02:51<00:00, 22.51it/s]


Saving processed data to /home/ghan/R2Gen/data/mimic_cxr/annotation_label_with_tokens.json...
Done!


In [7]:
import json
import os
from tqdm import tqdm
from datasets import Dataset, DatasetDict
from radgraph import RadGraph
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_report(report_text, radgraph_model):
    """Annotate one report and return its entity tokens, one word per list item.

    The report is wrapped in a single-element list before being handed to
    ``radgraph_model``. If the result is falsy or has no ``'0'`` key, an empty
    list is returned.
    """
    # NOTE(review): this redefines the function of the same name from the
    # previous cell; consider defining it once.
    result = radgraph_model([report_text])

    # Nothing usable came back for the only document in the batch
    if not result or '0' not in result:
        return []

    # Entity text can contain several words; flatten to individual words
    return [
        word
        for entity in result['0']['entities'].values()
        for word in entity['tokens'].split()
    ]

def process_dataset_with_radgraph(dataset_dict, num_workers=8):
    """Add RadGraph entity tokens to every sample of every split.

    Args:
        dataset_dict: Mapping of split name to an iterable of sample dicts.
            Each sample must provide a ``'report'`` entry.
        num_workers: Maximum number of threads in the worker pool.

    Returns:
        A dict mapping each split name to a list of sample copies with an
        added ``'tokens'`` entry. The list keeps the order of the input split.
        A sample whose processing raised gets an empty token list and the
        exception is printed.
    """
    # NOTE(review): this redefines the function of the same name from the
    # previous cell; consider defining it once.

    # One model instance is shared by all worker threads
    radgraph = RadGraph()

    processed_data = {}

    for split_name, split_dataset in dataset_dict.items():
        print(f"Processing {split_name} split...")
        all_samples = list(split_dataset)

        # Pre-sized so each result lands at its sample's original position.
        # Appending in completion order would scramble the split on every run.
        processed_split = [None] * len(all_samples)

        # The context managers close the bar and shut the pool down even if an
        # unexpected error escapes the loop.
        with tqdm(total=len(all_samples), desc=f"Processing {split_name}") as pbar:
            with ThreadPoolExecutor(max_workers=num_workers) as executor:
                # Remember which input position every future belongs to
                future_to_index = {
                    executor.submit(process_report, sample['report'], radgraph): index
                    for index, sample in enumerate(all_samples)
                }

                # Consume in completion order so the progress bar advances
                # as soon as any report is done
                for future in as_completed(future_to_index):
                    index = future_to_index[future]
                    new_sample = all_samples[index].copy()
                    try:
                        new_sample['tokens'] = future.result()
                    except Exception as exc:
                        print(f"Sample processing generated an exception: {exc}")
                        # Keep the sample, but with empty tokens
                        new_sample['tokens'] = []
                    processed_split[index] = new_sample
                    pbar.update(1)

        processed_data[split_name] = processed_split

    return processed_data

# Script entry point
if __name__ == "__main__":
    # Input annotations and destination for the token-augmented copy
    file_path = "/home/ghan/R2Gen/data/iu_xray/annotation_label.json"
    output_path = "/home/ghan/R2Gen/data/iu_xray/annotation_label_with_tokens.json"

    # Size of the thread pool; tune to the machine
    num_workers = 8

    # Read the source JSON
    print("Loading dataset...")
    with open(file_path) as f:
        raw_data = json.load(f)

    # One Dataset per top-level key, bundled into a DatasetDict
    dataset_dict = {
        split: Dataset.from_list(records) for split, records in raw_data.items()
    }
    dataset = DatasetDict(dataset_dict)

    # Annotate every report using the thread pool
    print(f"Processing reports with RadGraph using {num_workers} threads...")
    processed_data = process_dataset_with_radgraph(dataset, num_workers)

    # Write the augmented samples to the output file
    print(f"Saving processed data to {output_path}...")
    with open(output_path, 'w') as f:
        json.dump(processed_data, f)

    print("Done!")

Loading dataset...
Processing reports with RadGraph using 8 threads...
model_type not provided, defaulting to radgraph-xl
Processing train split...


Processing train: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2069/2069 [01:08<00:00, 30.17it/s]


Processing val split...


Processing val: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 296/296 [00:09<00:00, 30.18it/s]


Processing test split...


Processing test: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 590/590 [00:18<00:00, 31.07it/s]


Saving processed data to /home/ghan/R2Gen/data/iu_xray/annotation_label_with_tokens.json...
Done!
