# Bio-Domain Datasets

In [1]:
import json
import os
import xml.etree.ElementTree as ET
from pprint import pprint
from typing import Dict, Iterable, List, Optional, Tuple

from datasets import load_dataset, dataset_dict
from tqdm.auto import tqdm
from transformers import BertTokenizerFast

In [2]:
# Shared tokenizer loaded from the 'bert-base-uncased' checkpoint; the dataset
# processing functions in this notebook read it as a module-level global.
tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained('bert-base-uncased')

def char_to_token_indices(offset_mapping, entities: List[Dict]) -> List[Dict]:
    """
    Converts character start/end indices to BERT token-level start/end indices for a list of entities.

    Args:
        offset_mapping: Sequence of (char_start, char_end) pairs, one per token, in token
            order. Pairs equal to (0, 0) are treated as special tokens and skipped.
        entities: List of entity dicts. Each must have a 'char_offsets' list of
            (char_start, char_end) spans with an exclusive end.

    Returns:
        The same `entities` list. Every dict is mutated in place and gains a
        'token_offsets' list, parallel to 'char_offsets', of inclusive
        (token_start, token_end) index pairs into `offset_mapping`. An index stays -1
        when no matching token is found (e.g. the span lies beyond the tokenized text).
    """
    for entity in entities:
        token_offsets = []
        for char_start, char_end in entity['char_offsets']:
            token_start, token_end = -1, -1

            for idx, (tok_start, tok_end) in enumerate(offset_mapping):
                # Skip special tokens ((0, 0) offset)
                if tok_start == tok_end == 0:
                    continue

                # First token that reaches past the span start. Comparing the token's END
                # (not its start) keeps the token that contains `char_start` when the span
                # begins in the middle of a token, instead of jumping to the next token.
                if token_start == -1 and tok_end > char_start:
                    token_start = idx

                # First token (at or after the start token) that covers the span end.
                if token_start != -1 and tok_end >= char_end:
                    token_end = idx
                    break

            token_offsets.append((token_start, token_end))

        entity['token_offsets'] = token_offsets

    return entities

#### Testing offset mapping to character indices relationship

In [None]:
# Tokenize a sample abstract and print the per-token (char_start, char_end) offsets.
text = "Manipulation of kinetic profiles in 2-aryl propionic acid cyclooxygenase inhibitors.\nThe nonsteroidal anti-inflammatory drugs flurbiprofen and ibuprofen were modified in an attempt to alter the kinetics of inhibitor binding by COX-1. Contrary to prior predictions, a halogen substituent is not sufficient to confer slow tight-binding behavior. Conversion of the carboxylate moiety of flurbiprofen to an ester or amide abolishes slow tight-binding behavior, regardless of halogenation state."
# Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`,
# which returns the same encoding for a single string.
tokens = tokenizer(text, add_special_tokens=True, return_offsets_mapping=True)
offset_mapping = tokens['offset_mapping']
print(offset_mapping)

[(0, 0), (0, 12), (13, 15), (16, 23), (24, 32), (33, 35), (36, 37), (37, 38), (38, 40), (40, 42), (43, 47), (47, 50), (50, 52), (53, 57), (58, 60), (60, 62), (62, 64), (64, 66), (66, 72), (73, 83), (83, 84), (85, 88), (89, 92), (92, 96), (96, 99), (99, 101), (102, 106), (106, 107), (107, 119), (120, 125), (126, 129), (129, 131), (131, 133), (133, 135), (135, 138), (139, 142), (143, 145), (145, 147), (147, 149), (149, 152), (153, 157), (158, 166), (167, 169), (170, 172), (173, 180), (181, 183), (184, 189), (190, 193), (194, 201), (201, 202), (203, 205), (206, 215), (216, 223), (224, 226), (227, 230), (230, 231), (231, 232), (232, 233), (234, 242), (243, 245), (246, 251), (252, 263), (263, 264), (265, 266), (267, 271), (271, 274), (275, 278), (278, 281), (281, 283), (283, 286), (287, 289), (290, 293), (294, 304), (305, 307), (308, 311), (311, 314), (315, 319), (320, 325), (325, 326), (326, 333), (334, 342), (342, 343), (344, 354), (355, 357), (358, 361), (362, 365), (365, 368), (368, 371

In [None]:
# Decode the ids back to text for comparison with the input. The printed result is
# lowercased and has spaces around punctuation, so it does not line up character-for-
# character with `text`; character positions must come from `offset_mapping` instead.
detokenized = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)
print(detokenized)

manipulation of kinetic profiles in 2 - aryl propionic acid cyclooxygenase inhibitors. the nonsteroidal anti - inflammatory drugs flurbiprofen and ibuprofen were modified in an attempt to alter the kinetics of inhibitor binding by cox - 1. contrary to prior predictions, a halogen substituent is not sufficient to confer slow tight - binding behavior. conversion of the carboxylate moiety of flurbiprofen to an ester or amide abolishes slow tight - binding behavior, regardless of halogenation state.


In [20]:
def find_max_num_offsets(dataset: Iterable[Dict]) -> Tuple[int, Optional[List]]:
    """
    Finds the entity with the most offset spans in a dataset split.

    Args:
        dataset: Iterable of examples; each example has an 'entities' list whose items
            carry an 'offsets' list of spans.

    Returns:
        A `(max_offsets, max_example)` tuple: the largest number of spans found on a
        single entity and that entity's 'offsets' value. The first entity reaching the
        maximum wins ties. Returns `(0, None)` when no entity has any span.
    """
    max_offsets = 0
    max_example = None
    for example in dataset:
        for entity in example['entities']:
            num_offsets = len(entity['offsets'])
            # Strict comparison: keep the earliest entity among equals.
            if num_offsets > max_offsets:
                max_offsets, max_example = num_offsets, entity['offsets']

    return max_offsets, max_example

### BigBio KB Dataset Processing Function

In [3]:
def process_bigbio(dataset: Iterable[Dict], output_dir: str, split: str, max_tokens: int = 512) -> None:
    """
    Processes a bigbio dataset from HuggingFace into a standard format and saves it to a JSON file.

    Each kept document becomes a dict with 'id', 'text', 'entities' (text, type,
    char_offsets, token_offsets) and 'relations' (type, head, tail), where head/tail are
    the entity dicts looked up by id (None when the id is unknown).

    Args:
        dataset: Iterable of examples exposing 'id', 'passages', 'entities' and 'relations'.
        output_dir: Directory the JSON file is written to (created if missing).
        split: Name used for the progress bar and the output file (`<split>.json`).
        max_tokens: Documents whose tokenized length (special tokens included) exceeds
            this are skipped. Defaults to 512.
    """
    formatted_data = []
    for example in tqdm(dataset, desc=f"Processing {split}"):
        # NOTE(review): passages are joined with a single space; entity offsets are used
        # as-is, so this assumes they index into the joined text - confirm for
        # multi-passage corpora.
        text = ' '.join(passage['text'][0] for passage in example['passages'])

        # Tokenize and get token to character position mapping
        tokenization = tokenizer(text, return_offsets_mapping=True, add_special_tokens=True)

        # Skip documents exceeding the token limit. The limit is on TOKENS, so it must be
        # checked on the encoding; comparing len(text) counted characters instead.
        if len(tokenization['input_ids']) > max_tokens:
            continue
        offset_mapping = tokenization['offset_mapping']

        doc_id = example['id']

        # Process entities
        entities_map = {}
        entities = []
        for entity in example['entities']:
            entity_data = {
                # Entities with several spans carry one text fragment per span;
                # keep all of them rather than only the first.
                'text': ' '.join(entity['text']),
                'type': entity['type'],
                'char_offsets': [(offset[0], offset[1]) for offset in entity['offsets']],
            }
            entities_map[entity['id']] = entity_data
            entities.append(entity_data)

        # Adds 'token_offsets' to each entity dict in place, so the dicts referenced from
        # entities_map (and therefore from the relations below) are updated too.
        entities = char_to_token_indices(offset_mapping, entities)

        # Process relations
        relations = []
        for relation in example['relations']:
            head = entities_map.get(relation['arg1_id'])
            tail = entities_map.get(relation['arg2_id'])
            relations.append({'type': relation['type'], 'head': head, 'tail': tail})

        formatted_data.append({'id': doc_id, 'text': text, 'entities': entities, 'relations': relations})

    # Save data
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, f"{split}.json"), 'w', encoding='utf-8') as f:
        json.dump(formatted_data, f, indent=2)

## ChemProt

In [4]:
# Load the ChemProt corpus from the Hugging Face Hub using its 'chemprot_bigbio_kb'
# configuration, then print the available splits and their features.
chemprot_dataset = load_dataset("bigbio/chemprot", "chemprot_bigbio_kb")

print(chemprot_dataset)

DatasetDict({
    sample: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 50
    })
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 1020
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 612
    })
})


In [6]:
# Inspect the first record of the 'sample' split to see the raw fields
# (entities with 'offsets'/'text'/'type', etc.) that process_bigbio consumes.
chemprot_sample = chemprot_dataset['sample'][0]

pprint(chemprot_sample)

{'coreferences': [],
 'document_id': '10471277',
 'entities': [{'id': '2',
               'normalized': [],
               'offsets': [[135, 145]],
               'text': ['Salmeterol'],
               'type': 'CHEMICAL'},
              {'id': '3',
               'normalized': [],
               'offsets': [[1248, 1259]],
               'text': ['[(125)I]IAS'],
               'type': 'CHEMICAL'},
              {'id': '4',
               'normalized': [],
               'offsets': [[1285, 1299]],
               'text': ['(-)-alprenolol'],
               'type': 'CHEMICAL'},
              {'id': '5',
               'normalized': [],
               'offsets': [[1329, 1332]],
               'text': ['GTP'],
               'type': 'CHEMICAL'},
              {'id': '6',
               'normalized': [],
               'offsets': [[1346, 1355]],
               'text': ['[125I]IAS'],
               'type': 'CHEMICAL'},
              {'id': '7',
               'normalized': [],
               'o

In [None]:
# For the train and test splits, report the largest number of offset spans attached
# to a single entity, together with those spans.
print(find_max_num_offsets(chemprot_dataset['train']))
print(find_max_num_offsets(chemprot_dataset['test']))

In [5]:
# Export the ChemProt splits; the Hub's 'validation' split is written out as 'val'.
for source_split, output_name in (('train', 'train'), ('validation', 'val'), ('test', 'test')):
    process_bigbio(chemprot_dataset[source_split], '../data/chemprot', output_name)

Processing train:   0%|          | 0/1020 [00:00<?, ?it/s]

Processing val:   0%|          | 0/612 [00:00<?, ?it/s]

Processing test:   0%|          | 0/800 [00:00<?, ?it/s]

## DDI dataset

In [6]:
# Load the DDI corpus using its 'ddi_corpus_bigbio_kb' configuration, then print the
# available splits and their features.
ddi_dataset = load_dataset("bigbio/ddi_corpus", "ddi_corpus_bigbio_kb")

print(ddi_dataset)

Using the latest cached version of the dataset since bigbio/ddi_corpus couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'ddi_corpus_bigbio_kb' at /home/bt19d200/.cache/huggingface/datasets/bigbio___ddi_corpus/ddi_corpus_bigbio_kb/1.0.0/da8e94986a0c689095b22bed134248b11f9311c7 (last modified on Tue Jul 22 00:18:23 2025).


DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 714
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 303
    })
})


In [18]:
# Inspect the first training record to see the raw entity fields.
ddi_sample = ddi_dataset['train'][0]

pprint(ddi_sample)

{'coreferences': [],
 'document_id': '15825391',
 'entities': [{'id': '15825391_T1',
               'normalized': [],
               'offsets': [[46, 60]],
               'text': ['amphotericin B'],
               'type': 'DRUG'},
              {'id': '15825391_T2',
               'normalized': [],
               'offsets': [[75, 82]],
               'text': ['filipin'],
               'type': 'DRUG_N'},
              {'id': '15825391_T3',
               'normalized': [],
               'offsets': [[111, 130]],
               'text': ['polyene antibiotics'],
               'type': 'GROUP'},
              {'id': '15825391_T4',
               'normalized': [],
               'offsets': [[143, 150]],
               'text': ['filipin'],
               'type': 'DRUG_N'},
              {'id': '15825391_T5',
               'normalized': [],
               'offsets': [[202, 216]],
               'text': ['amphotericin B'],
               'type': 'DRUG'},
              {'id': '15825391_T6',
   

In [21]:
# Largest number of offset spans on any single entity in the DDI train split, plus
# that entity's spans (a count above 1 means the entity is split across several spans).
find_max_num_offsets(ddi_dataset['train'])

(2, [[635, 638], [661, 678]])

In [7]:
# Export both DDI splits under their own names.
for split_name in ('train', 'test'):
    process_bigbio(ddi_dataset[split_name], '../data/ddi', split_name)

Processing train:   0%|          | 0/714 [00:00<?, ?it/s]

Processing test:   0%|          | 0/303 [00:00<?, ?it/s]

## BC5CDR

In [None]:
def process_bc5cdr(file_path: str, output_dir: str, split: str, max_tokens: int = 512) -> None:
    """
    Processes a BC5CDR BioC XML file into the standard format and saves it.

    Passages of a document are joined with a single space into one text. Annotations
    are grouped by their MESH identifier: each distinct identifier becomes one entity
    whose 'char_offsets' lists every mention, and whose 'text'/'type' come from the
    first mention seen. Relations are emitted with type "CID" and reference the entity
    dicts by MESH id (None when the id has no annotation).

    Args:
        file_path: Path to the BioC XML file.
        output_dir: Directory the JSON file is written to (created if missing).
        split: Output file stem (`<split>.json`).
        max_tokens: Documents whose tokenized length (special tokens included) exceeds
            this are skipped. Defaults to 512.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Process documents
    formatted_data = []
    for doc in tqdm(root.findall('document'), desc="Processing Documents"):
        doc_id = doc.find('id').text

        # Position in the joined document text where the current passage starts.
        curr_char_offset = 0
        # Process entities and texts
        passages_text = []
        entities_map = {}
        for passage in doc.findall('passage'):
            # An empty <text/> element yields None; treat it as an empty passage.
            passage_text = passage.findtext('text') or ''
            passages_text.append(passage_text)

            # BioC location offsets are relative to the DOCUMENT, and the passage's own
            # <offset> says where the passage starts in it. Adding curr_char_offset
            # directly would count the passage start twice for every passage after the
            # first, so re-base each location onto the joined text instead.
            declared_start = passage.findtext('offset')
            if declared_start is not None and declared_start.strip():
                passage_start = int(declared_start)
            else:
                passage_start = curr_char_offset
            shift = curr_char_offset - passage_start

            for ann in passage.findall('annotation'):
                mesh_id = ann.find('infon[@key="MESH"]').text
                # An annotation may carry several <location> elements; keep them all.
                spans = []
                for location in ann.findall('location'):
                    char_start = int(location.get('offset')) + shift
                    spans.append((char_start, char_start + int(location.get('length'))))

                if mesh_id not in entities_map:
                    entities_map[mesh_id] = {
                        'text': ann.find('text').text,
                        'type': ann.find('infon[@key="type"]').text,
                        'char_offsets': []
                    }
                entities_map[mesh_id]['char_offsets'].extend(spans)

            # +1 accounts for the single space inserted between passages by the join below.
            curr_char_offset += len(passage_text) + 1

        doc_text = " ".join(passages_text)

        encoding = tokenizer(doc_text, add_special_tokens=True, return_offsets_mapping=True)
        # Skip documents exceeding the token limit. The limit is on TOKENS, so it must be
        # checked on the encoding; comparing len(doc_text) counted characters instead.
        if len(encoding['input_ids']) > max_tokens:
            continue
        # Adds 'token_offsets' to the entity dicts in place (shared with entities_map).
        doc_entities = char_to_token_indices(encoding['offset_mapping'], list(entities_map.values()))

        # Process relations
        doc_relations = []
        for rel in doc.findall('relation'):
            chemical_mesh = rel.find('infon[@key="Chemical"]').text
            disease_mesh = rel.find('infon[@key="Disease"]').text
            doc_relations.append({
                "type": "CID", # Chemical-Induced Disease
                "head": entities_map.get(chemical_mesh),
                "tail": entities_map.get(disease_mesh)
            })

        formatted_data.append({
            "id": doc_id,
            "text": doc_text,
            "entities": doc_entities,
            "relations": doc_relations
        })

    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, f"{split}.json"), 'w', encoding='utf-8') as f:
        json.dump(formatted_data, f, indent=2)

In [9]:
# Location of the BC5CDR corpus. Set the BC5CDR_DIR environment variable to point at
# your own copy; the default keeps the original machine-specific path.
BC5CDR_DIR = os.environ.get(
    "BC5CDR_DIR",
    "/home/bt19d200/Ayaan/Datasets/BC5CDR/CDR_Data/CDR.Corpus.v010516",
)
bc5cdr_train_path = f"{BC5CDR_DIR}/CDR_TrainingSet.BioC.xml"
bc5cdr_val_path = f"{BC5CDR_DIR}/CDR_DevelopmentSet.BioC.xml"
bc5cdr_test_path = f"{BC5CDR_DIR}/CDR_TestSet.BioC.xml"

In [12]:
# Convert the three BC5CDR files, in train / val / test order.
for source_path, split_name in (
    (bc5cdr_train_path, "train"),
    (bc5cdr_val_path, "val"),
    (bc5cdr_test_path, "test"),
):
    process_bc5cdr(source_path, "../data/bc5cdr", split_name)

Processing Documents:   0%|          | 0/500 [00:00<?, ?it/s]

Processing Documents:   0%|          | 0/500 [00:00<?, ?it/s]

Processing Documents:   0%|          | 0/500 [00:00<?, ?it/s]

## BioRED

In [26]:
def process_biored(file_path: str, output_dir: str, split: str, max_tokens: int = 512) -> None:
    """
    Processes a BioRED BioC JSON file into the standard format and saves it.

    Passages of a document are joined with a single space into one text. Annotations
    are grouped by their 'identifier': each distinct identifier becomes one entity whose
    'char_offsets' lists every mention, and whose 'text'/'type' come from the first
    mention seen. Relations reference the entity dicts by identifier (an empty dict
    when the identifier has no annotation).

    Args:
        file_path: Path to the BioC JSON file.
        output_dir: Directory the JSON file is written to (created if missing).
        split: Output file stem (`<split>.json`).
        max_tokens: Documents whose tokenized length (special tokens included) exceeds
            this are skipped. Defaults to 512.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Process documents
    formatted_data = []
    for doc in tqdm(data['documents'], desc="Processing Documents"):
        doc_id = doc['id']

        # Position in the joined document text where the current passage starts.
        curr_char_offset = 0
        # Process entities and texts
        passages_text = []
        entities_map = {}
        for passage in doc['passages']:
            passage_text = passage['text']
            passages_text.append(passage_text)

            # BioC location offsets are relative to the DOCUMENT, and the passage's own
            # 'offset' says where the passage starts in it. Adding curr_char_offset
            # directly would count the passage start twice for every passage after the
            # first, so re-base each location onto the joined text instead.
            passage_start = int(passage.get('offset', curr_char_offset))
            shift = curr_char_offset - passage_start

            for ann in passage['annotations']:
                entity_id = ann['infons']['identifier']
                char_offsets = []
                for location in ann['locations']:
                    char_start = int(location['offset']) + shift
                    char_offsets.append((char_start, char_start + int(location['length'])))

                if entity_id not in entities_map:
                    entities_map[entity_id] = {
                        'text': ann['text'],
                        'type': ann['infons']['type'],
                        'char_offsets': []
                    }

                entities_map[entity_id]['char_offsets'].extend(char_offsets)

            # +1 accounts for the single space inserted between passages by the join below.
            curr_char_offset += len(passage_text) + 1

        doc_text = " ".join(passages_text)

        encoding = tokenizer(doc_text, add_special_tokens=True, return_offsets_mapping=True)
        # Skip documents exceeding the token limit. The limit is on TOKENS, so it must be
        # checked on the encoding; comparing len(doc_text) counted characters instead.
        if len(encoding['input_ids']) > max_tokens:
            continue

        # Adds 'token_offsets' to the entity dicts in place (shared with entities_map).
        doc_entities = char_to_token_indices(encoding['offset_mapping'], list(entities_map.values()))

        # Process relations
        doc_relations = []
        for rel in doc['relations']:
            rel_infons = rel['infons']
            doc_relations.append({
                "type": rel_infons['type'],
                "head": entities_map.get(rel_infons['entity1'], {}),
                "tail": entities_map.get(rel_infons['entity2'], {})
            })

        formatted_data.append({
            "id": doc_id,
            "text": doc_text,
            "entities": doc_entities,
            "relations": doc_relations
        })

    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, f"{split}.json"), 'w', encoding='utf-8') as f:
        json.dump(formatted_data, f, indent=2)

In [14]:
# Location of the BioRED corpus. Set the BIORED_DIR environment variable to point at
# your own copy; the default keeps the original machine-specific path.
BIORED_DIR = os.environ.get("BIORED_DIR", "/home/bt19d200/Ayaan/Datasets/BIORED")
biored_train_path = f"{BIORED_DIR}/Train.BioC.JSON"
biored_val_path = f"{BIORED_DIR}/Dev.BioC.JSON"
biored_test_path = f"{BIORED_DIR}/Test.BioC.JSON"

In [27]:
# Convert the three BioRED files, in train / val / test order.
for source_path, split_name in (
    (biored_train_path, "train"),
    (biored_val_path, "val"),
    (biored_test_path, "test"),
):
    process_biored(source_path, "../data/biored", split_name)

Processing Documents:   0%|          | 0/400 [00:00<?, ?it/s]

Processing Documents:   0%|          | 0/100 [00:00<?, ?it/s]

Processing Documents:   0%|          | 0/100 [00:00<?, ?it/s]