# Bio-Domain Datasets

In [9]:
import json
import os
import re
import unicodedata
import xml.etree.ElementTree as ET
from pprint import pprint

from datasets import Dataset, dataset_dict, load_dataset
from tqdm.auto import tqdm
from transformers import BertTokenizerFast

## Data Preprocessing

### Data Processing Functions

In [8]:
def clean_text(text_str: str) -> str:
    """
    Returns `text_str` in a canonical single-line form.

    The string is NFKC-normalized (compatibility characters are folded into
    their canonical equivalents), every run of whitespace characters is
    collapsed into one space, and surrounding whitespace is removed.
    """
    # Normalize first so that compatibility spaces become plain spaces before collapsing
    normalized = unicodedata.normalize('NFKC', text_str)
    return re.sub(r'\s+', ' ', normalized).strip()

In [10]:
def process_bigbio(dataset: Dataset, output_dir: str, split: str) -> None:
    """
    Processes one split of a bigbio dataset from HuggingFace into the standard format
    and saves it to `{output_dir}/{split}.jsonl` (one JSON document per line).

    Each output record has the keys:
        - 'id': the example's 'id'
        - 'text': all passage texts joined by single spaces and passed through `clean_text`
        - 'entities': list of {'text', 'type', 'offsets'} dicts
        - 'relations': list of {'type', 'head', 'tail'} dicts, where head/tail are the
          entity dicts referenced by 'arg1_id'/'arg2_id', or the string 'UNK' when the
          referenced id is not among the example's entities

    Args:
        dataset: A single dataset split (not the whole DatasetDict). It is iterated for
            examples exposing 'id', 'passages', 'entities' and 'relations'.
        output_dir: Directory that receives the output file (created if missing).
        split: Split name; used as the output file stem and in the progress-bar label.
    """
    formatted_data = []
    for example in tqdm(dataset, desc=f"Processing {split}"):
        doc_id = example['id']
        # Each passage holds a list of text segments: join the segments, then the passages
        text = clean_text(' '.join(' '.join(passage['text']) for passage in example['passages']))

        # Process entities
        # NOTE(review): offsets are copied unchanged from the dataset, while `text` has been
        # re-joined and cleaned; if `clean_text` alters the length (collapsed whitespace,
        # NFKC) the spans may no longer line up with the saved text -- confirm downstream.
        entities_map = {}
        entities = []
        for entity in example['entities']:
            entity_data = {
                'text': ' '.join(entity['text']),
                'type': entity['type'],
                'offsets': [(offset[0], offset[1]) for offset in entity['offsets']],
            }
            entities_map[entity['id']] = entity_data
            entities.append(entity_data)

        # Process relations
        relations = []
        for relation in example['relations']:
            # Arguments that do not resolve to a known entity are kept as the 'UNK' placeholder
            head = entities_map.get(relation['arg1_id'], 'UNK')
            tail = entities_map.get(relation['arg2_id'], 'UNK')
            relations.append({'type': relation['type'], 'head': head, 'tail': tail})

        formatted_data.append({'id': doc_id, 'text': text, 'entities': entities, 'relations': relations})

    # Save data
    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/{split}.jsonl", 'w', encoding='utf-8') as f:
        for item in formatted_data:
            # Dump each dictionary as its own line in the file
            f.write(json.dumps(item) + '\n')

In [21]:
def process_bc5cdr(file_path, output_dir, split):
    """
    Processes a BC5CDR BioC XML file into the standard format and saves it as a JSONL file.

    The document text is the passage texts joined by single spaces and passed through
    `clean_text`. Annotations that share a MESH identifier are merged into one entity:
    its text and type come from the first mention, and every mention contributes an
    offset span. Each <relation> element becomes a "CID" relation between the entities
    keyed by its Chemical and Disease MESH identifiers ('UNK' when an identifier has no
    annotated mention in the document).

    Args:
        file_path: Path to the BioC XML file.
        output_dir: Directory that receives the output file (created if missing).
        split: Split name; used as the output file stem (`{output_dir}/{split}.jsonl`)
            and in the progress-bar label.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Process documents
    formatted_data = []
    for doc in tqdm(root.findall('document'), desc=f"Processing {split}"):
        doc_id = doc.find('id').text

        # Process entities and texts
        doc_text = []
        entities_map = {}
        curr_offset = 0  # start of the current passage inside the space-joined text
        for passage in doc.findall('passage'):
            # A missing or empty <text> element is treated as an empty passage
            text = passage.findtext('text') or ''
            doc_text.append(text)

            # BioC annotation offsets are document-level and the passage's own <offset>
            # says where the passage starts, so (annotation offset - passage offset) is
            # the position inside this passage. Adding the raw offset to `curr_offset`
            # would count the passage start twice for every passage after the first.
            # Without an <offset> element the annotation offset is used as passage-relative.
            passage_start = int(passage.findtext('offset') or 0)
            shift = curr_offset - passage_start

            for ann in passage.findall('annotation'):
                mesh_id = ann.find('infon[@key="MESH"]').text
                if mesh_id not in entities_map:
                    entities_map[mesh_id] = {
                        'text': ann.find('text').text,
                        'type': ann.find('infon[@key="type"]').text,
                        'offsets': []
                    }

                # An annotation may carry several <location> elements; record each span
                for location in ann.findall('location'):
                    start = int(location.get('offset')) + shift
                    end = start + int(location.get('length'))
                    entities_map[mesh_id]['offsets'].append((start, end))

            # +1 accounts for the single space inserted between passages below
            curr_offset += len(text) + 1

        # NOTE(review): spans index the space-joined text *before* `clean_text`; if cleaning
        # changes its length (collapsed whitespace, NFKC) they may drift -- confirm downstream.
        doc_text = clean_text(" ".join(doc_text))
        doc_entities = list(entities_map.values())

        # Process relations
        doc_relations = []
        for rel in doc.findall('relation'):
            chemical_mesh = rel.find('infon[@key="Chemical"]').text
            disease_mesh = rel.find('infon[@key="Disease"]').text
            head_entity = entities_map.get(chemical_mesh, 'UNK')
            tail_entity = entities_map.get(disease_mesh, 'UNK')
            doc_relations.append({
                "type": "CID", # Chemical-Induced Disease
                "head": head_entity,
                "tail": tail_entity
            })

        formatted_data.append({
            "id": doc_id,
            "text": doc_text,
            "entities": doc_entities,
            "relations": doc_relations
        })

    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/{split}.jsonl", 'w', encoding='utf-8') as f:
        for item in formatted_data:
            f.write(json.dumps(item) + '\n')

In [22]:
def process_biored(file_path, output_dir, split):
    """
    Processes a BioRED BioC JSON file into the standard format and saves it as a JSONL file.

    The document text is the passage texts joined by single spaces and passed through
    `clean_text`. Annotations that share an `identifier` are merged into one entity:
    its text and type come from the first mention, and every location of every mention
    contributes an offset span. Each relation links the entities keyed by its `entity1`
    and `entity2` identifiers ('UNK' when an identifier has no annotated mention).

    Args:
        file_path: Path to the BioC JSON file.
        output_dir: Directory that receives the output file (created if missing).
        split: Split name; used as the output file stem (`{output_dir}/{split}.jsonl`)
            and in the progress-bar label.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Process documents
    formatted_data = []
    for doc in tqdm(data['documents'], desc=f"Processing {split}"):
        doc_id = doc['id']

        # Process entities and texts
        doc_text = []
        entities_map = {}
        curr_offset = 0  # start of the current passage inside the space-joined text
        for passage in doc['passages']:
            text = passage['text']
            doc_text.append(text)

            # BioC location offsets are document-level and the passage's own 'offset'
            # says where the passage starts, so (location offset - passage offset) is
            # the position inside this passage. Adding the raw offset to `curr_offset`
            # would count the passage start twice for every passage after the first.
            # Without an 'offset' key the location offset is used as passage-relative.
            passage_start = int(passage.get('offset') or 0)
            shift = curr_offset - passage_start

            for ann in passage['annotations']:
                entity_id = ann['infons']['identifier']
                offsets = []
                for loc in ann['locations']:
                    start = int(loc['offset']) + shift
                    offsets.append((start, start + int(loc['length'])))

                if entity_id not in entities_map:
                    entities_map[entity_id] = {
                        'text': ann['text'],
                        'type': ann['infons']['type'],
                        'offsets': []
                    }

                entities_map[entity_id]['offsets'].extend(offsets)

            # +1 accounts for the single space inserted between passages below
            curr_offset += len(text) + 1

        # NOTE(review): spans index the space-joined text *before* `clean_text`; if cleaning
        # changes its length (collapsed whitespace, NFKC) they may drift -- confirm downstream.
        doc_text = clean_text(" ".join(doc_text))
        doc_entities = list(entities_map.values())

        # Process relations
        doc_relations = []
        for rel in doc['relations']:
            head, tail = rel['infons']['entity1'], rel['infons']['entity2']
            head_entity, tail_entity = entities_map.get(head, 'UNK'), entities_map.get(tail, 'UNK')
            doc_relations.append({
                "type": rel['infons']['type'],
                "head": head_entity,
                "tail": tail_entity
            })

        formatted_data.append({
            "id": doc_id,
            "text": doc_text,
            "entities": doc_entities,
            "relations": doc_relations
        })

    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/{split}.jsonl", 'w', encoding='utf-8') as f:
        for item in formatted_data:
            f.write(json.dumps(item) + '\n')

### ChemProt

In [4]:
# Load the `chemprot_bigbio_kb` configuration of the `bigbio/chemprot` dataset
chemprot_dataset = load_dataset("bigbio/chemprot", "chemprot_bigbio_kb")

# Show the available splits with their features and row counts
print(chemprot_dataset)

DatasetDict({
    sample: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 50
    })
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 1020
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 612
    })
})


In [5]:
# Take the first record of the 'sample' split to inspect the raw schema
chemprot_sample = chemprot_dataset['sample'][0]

pprint(chemprot_sample)

{'coreferences': [],
 'document_id': '10471277',
 'entities': [{'id': '2',
               'normalized': [],
               'offsets': [[135, 145]],
               'text': ['Salmeterol'],
               'type': 'CHEMICAL'},
              {'id': '3',
               'normalized': [],
               'offsets': [[1248, 1259]],
               'text': ['[(125)I]IAS'],
               'type': 'CHEMICAL'},
              {'id': '4',
               'normalized': [],
               'offsets': [[1285, 1299]],
               'text': ['(-)-alprenolol'],
               'type': 'CHEMICAL'},
              {'id': '5',
               'normalized': [],
               'offsets': [[1329, 1332]],
               'text': ['GTP'],
               'type': 'CHEMICAL'},
              {'id': '6',
               'normalized': [],
               'offsets': [[1346, 1355]],
               'text': ['[125I]IAS'],
               'type': 'CHEMICAL'},
              {'id': '7',
               'normalized': [],
               'o

In [14]:
# Export each split to ../data/chemprot/<split>.jsonl; the 'validation' split is saved as 'val'.
# The 'sample' split is not exported.
process_bigbio(chemprot_dataset['train'], '../data/chemprot', 'train')
process_bigbio(chemprot_dataset['validation'], '../data/chemprot', 'val')
process_bigbio(chemprot_dataset['test'], '../data/chemprot', 'test')

Processing train:   0%|          | 0/1020 [00:00<?, ?it/s]

Processing val:   0%|          | 0/612 [00:00<?, ?it/s]

Processing test:   0%|          | 0/800 [00:00<?, ?it/s]

### DDI dataset

In [15]:
# Load the `ddi_corpus_bigbio_kb` configuration of the `bigbio/ddi_corpus` dataset
ddi_dataset = load_dataset("bigbio/ddi_corpus", "ddi_corpus_bigbio_kb")

# Show the available splits with their features and row counts
print(ddi_dataset)

Using the latest cached version of the dataset since bigbio/ddi_corpus couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'ddi_corpus_bigbio_kb' at /home/bt19d200/.cache/huggingface/datasets/bigbio___ddi_corpus/ddi_corpus_bigbio_kb/1.0.0/da8e94986a0c689095b22bed134248b11f9311c7 (last modified on Tue Jul 22 00:18:23 2025).


DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 714
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 303
    })
})


In [16]:
# Take the first training record to inspect the raw schema
ddi_sample = ddi_dataset['train'][0]

pprint(ddi_sample)

{'coreferences': [],
 'document_id': '15825391',
 'entities': [{'id': '15825391_T1',
               'normalized': [],
               'offsets': [[46, 60]],
               'text': ['amphotericin B'],
               'type': 'DRUG'},
              {'id': '15825391_T2',
               'normalized': [],
               'offsets': [[75, 82]],
               'text': ['filipin'],
               'type': 'DRUG_N'},
              {'id': '15825391_T3',
               'normalized': [],
               'offsets': [[111, 130]],
               'text': ['polyene antibiotics'],
               'type': 'GROUP'},
              {'id': '15825391_T4',
               'normalized': [],
               'offsets': [[143, 150]],
               'text': ['filipin'],
               'type': 'DRUG_N'},
              {'id': '15825391_T5',
               'normalized': [],
               'offsets': [[202, 216]],
               'text': ['amphotericin B'],
               'type': 'DRUG'},
              {'id': '15825391_T6',
   

In [17]:
# Export the splits to ../data/ddi/<split>.jsonl; only 'train' and 'test' are exported (no 'val' file)
process_bigbio(ddi_dataset['train'], '../data/ddi', 'train')
process_bigbio(ddi_dataset['test'], '../data/ddi', 'test')

Processing train:   0%|          | 0/714 [00:00<?, ?it/s]

Processing test:   0%|          | 0/303 [00:00<?, ?it/s]

### BC5CDR

In [18]:
# Locations of the BC5CDR BioC XML files (training / development / test sets).
# NOTE: these are machine-specific absolute paths; point them at your local copy of the corpus.
bc5cdr_train_path = "/home/bt19d200/Ayaan/Datasets/BC5CDR/CDR_Data/CDR.Corpus.v010516/CDR_TrainingSet.BioC.xml"
bc5cdr_val_path = "/home/bt19d200/Ayaan/Datasets/BC5CDR/CDR_Data/CDR.Corpus.v010516/CDR_DevelopmentSet.BioC.xml"
bc5cdr_test_path = "/home/bt19d200/Ayaan/Datasets/BC5CDR/CDR_Data/CDR.Corpus.v010516/CDR_TestSet.BioC.xml"

In [23]:
# Convert each BioC XML file to ../data/bc5cdr/<split>.jsonl; the development set is saved as 'val'
process_bc5cdr(bc5cdr_train_path, "../data/bc5cdr", "train")
process_bc5cdr(bc5cdr_val_path, "../data/bc5cdr", "val")
process_bc5cdr(bc5cdr_test_path, "../data/bc5cdr", "test")

Processing train:   0%|          | 0/500 [00:00<?, ?it/s]

Processing val:   0%|          | 0/500 [00:00<?, ?it/s]

Processing test:   0%|          | 0/500 [00:00<?, ?it/s]

### BioRED

In [24]:
# Locations of the BioRED BioC JSON files (train / dev / test).
# NOTE: these are machine-specific absolute paths; point them at your local copy of the corpus.
biored_train_path = "/home/bt19d200/Ayaan/Datasets/BIORED/Train.BioC.JSON"
biored_val_path = "/home/bt19d200/Ayaan/Datasets/BIORED/Dev.BioC.JSON"
biored_test_path = "/home/bt19d200/Ayaan/Datasets/BIORED/Test.BioC.JSON"

In [25]:
# Convert each BioC JSON file to ../data/biored/<split>.jsonl; the dev set is saved as 'val'
process_biored(biored_train_path, "../data/biored", "train")
process_biored(biored_val_path, "../data/biored", "val")
process_biored(biored_test_path, "../data/biored", "test")

Processing train:   0%|          | 0/400 [00:00<?, ?it/s]

Processing val:   0%|          | 0/100 [00:00<?, ?it/s]

Processing test:   0%|          | 0/100 [00:00<?, ?it/s]

## Entity and Relation Types

In [36]:
def get_entity_and_relation_types(data_dir, split):
    """
    Collects the entity and relation type labels present in `{data_dir}/{split}.jsonl`.

    Returns a pair of sorted lists: (entity types, relation types).
    """
    jsonl_path = data_dir + f"/{split}.jsonl"
    with open(jsonl_path, 'r', encoding='utf-8') as handle:
        # One JSON document per line
        records = [json.loads(raw_line) for raw_line in handle]

    seen_entity_types, seen_relation_types = set(), set()
    for record in tqdm(records, desc=f"Loading {split} entity and relation types"):
        seen_entity_types.update(ent['type'] for ent in record['entities'])
        seen_relation_types.update(rel['type'] for rel in record['relations'])

    return sorted(seen_entity_types), sorted(seen_relation_types)

### ChemProt

In [37]:
# Collect the sorted entity and relation type labels of each exported ChemProt split
chemprot_entities_train, chemprot_relations_train = get_entity_and_relation_types("../data/chemprot", 'train')
chemprot_entities_val, chemprot_relations_val = get_entity_and_relation_types("../data/chemprot", 'val')
chemprot_entities_test, chemprot_relations_test = get_entity_and_relation_types("../data/chemprot", 'test')

Loading train entity and relation types:   0%|          | 0/1020 [00:00<?, ?it/s]

Loading val entity and relation types:   0%|          | 0/612 [00:00<?, ?it/s]

Loading test entity and relation types:   0%|          | 0/800 [00:00<?, ?it/s]

In [None]:
# Print the entity types per split, separated by blank lines, to compare label sets across splits
print("Chemprot entities in the train set:\n", chemprot_entities_train)
print()
print("Chemprot entities in the val set:\n", chemprot_entities_val)
print()
print("Chemprot entities in the test set:\n", chemprot_entities_test)

Chemprot entities in the train set:
 ['CHEMICAL', 'GENE-N', 'GENE-Y']
Chemprot entities in the val set:
 ['CHEMICAL', 'GENE-N', 'GENE-Y']
Chemprot entities in the test set:
 ['CHEMICAL', 'GENE-N', 'GENE-Y']


In [39]:
# Print the relation types per split, separated by blank lines, to compare label sets across splits
print("Chemprot relations in the train set:\n", chemprot_relations_train)
print()
print("Chemprot relations in the val set:\n", chemprot_relations_val)
print()
print("Chemprot relations in the test set:\n", chemprot_relations_test)

Chemprot relations in the train set:
 ['Agonist', 'Antagonist', 'Cofactor', 'Downregulator', 'Modulator', 'Not', 'Part_of', 'Regulator', 'Substrate', 'Undefined', 'Upregulator']

Chemprot relations in the val set:
 ['Agonist', 'Antagonist', 'Cofactor', 'Downregulator', 'Modulator', 'Not', 'Part_of', 'Regulator', 'Substrate', 'Undefined', 'Upregulator']

Chemprot relations in the test set:
 ['Agonist', 'Antagonist', 'Cofactor', 'Downregulator', 'Modulator', 'Not', 'Part_of', 'Regulator', 'Substrate', 'Upregulator']


### DDI

In [40]:
# Collect the sorted entity and relation type labels of each exported DDI split (train and test only)
ddi_entities_train, ddi_relations_train = get_entity_and_relation_types("../data/ddi", 'train')
ddi_entities_test, ddi_relations_test = get_entity_and_relation_types("../data/ddi", 'test')

Loading train entity and relation types:   0%|          | 0/714 [00:00<?, ?it/s]

Loading test entity and relation types:   0%|          | 0/303 [00:00<?, ?it/s]

In [41]:
# Print the entity types per split, separated by a blank line, to compare label sets across splits
print("DDI entities in the train set:\n", ddi_entities_train)
print()
print("DDI entities in the test set:\n", ddi_entities_test)

DDI entities in the train set:
 ['BRAND', 'DRUG', 'DRUG_N', 'GROUP']

DDI entities in the test set:
 ['BRAND', 'DRUG', 'DRUG_N', 'GROUP']


In [42]:
# Print the relation types per split, separated by a blank line, to compare label sets across splits
print("DDI relations in the train set:\n", ddi_relations_train)
print()
print("DDI relations in the test set:\n", ddi_relations_test)

DDI relations in the train set:
 ['ADVISE', 'EFFECT', 'INT', 'MECHANISM']

DDI relations in the test set:
 ['ADVISE', 'EFFECT', 'INT', 'MECHANISM']


### BC5CDR

In [43]:
# Collect the sorted entity and relation type labels of each exported BC5CDR split
bc5cdr_entities_train, bc5cdr_relations_train = get_entity_and_relation_types("../data/bc5cdr", 'train')
bc5cdr_entities_val, bc5cdr_relations_val = get_entity_and_relation_types("../data/bc5cdr", 'val')
bc5cdr_entities_test, bc5cdr_relations_test = get_entity_and_relation_types("../data/bc5cdr", 'test')

Loading train entity and relation types:   0%|          | 0/500 [00:00<?, ?it/s]

Loading val entity and relation types:   0%|          | 0/500 [00:00<?, ?it/s]

Loading test entity and relation types:   0%|          | 0/500 [00:00<?, ?it/s]

In [44]:
# Print the entity types per split, separated by blank lines, to compare label sets across splits
print("BC5CDR entities in the train set:\n", bc5cdr_entities_train)
print()
print("BC5CDR entities in the val set:\n", bc5cdr_entities_val)
print()
print("BC5CDR entities in the test set:\n", bc5cdr_entities_test)

BC5CDR entities in the train set:
 ['Chemical', 'Disease']

BC5CDR entities in the val set:
 ['Chemical', 'Disease']

BC5CDR entities in the test set:
 ['Chemical', 'Disease']


In [45]:
# Print the relation types per split, separated by blank lines, to compare label sets across splits
print("BC5CDR relations in the train set:\n", bc5cdr_relations_train)
print()
print("BC5CDR relations in the val set:\n", bc5cdr_relations_val)
print()
print("BC5CDR relations in the test set:\n", bc5cdr_relations_test)

BC5CDR relations in the train set:
 ['CID']

BC5CDR relations in the val set:
 ['CID']

BC5CDR relations in the test set:
 ['CID']


### BioRED

In [46]:
# Collect the sorted entity and relation type labels of each exported BioRED split
biored_entities_train, biored_relations_train = get_entity_and_relation_types("../data/biored", 'train')
biored_entities_val, biored_relations_val = get_entity_and_relation_types("../data/biored", 'val')
biored_entities_test, biored_relations_test = get_entity_and_relation_types("../data/biored", 'test')

Loading train entity and relation types:   0%|          | 0/400 [00:00<?, ?it/s]

Loading val entity and relation types:   0%|          | 0/100 [00:00<?, ?it/s]

Loading test entity and relation types:   0%|          | 0/100 [00:00<?, ?it/s]

In [47]:
# Print the entity types per split, separated by blank lines, to compare label sets across splits
print("BIORED entities in the train set:\n", biored_entities_train)
print()
print("BIORED entities in the val set:\n", biored_entities_val)
print()
print("BIORED entities in the test set:\n", biored_entities_test)

BIORED entities in the train set:
 ['CellLine', 'ChemicalEntity', 'DiseaseOrPhenotypicFeature', 'GeneOrGeneProduct', 'OrganismTaxon', 'SequenceVariant']

BIORED entities in the val set:
 ['CellLine', 'ChemicalEntity', 'DiseaseOrPhenotypicFeature', 'GeneOrGeneProduct', 'OrganismTaxon', 'SequenceVariant']

BIORED entities in the test set:
 ['CellLine', 'ChemicalEntity', 'DiseaseOrPhenotypicFeature', 'GeneOrGeneProduct', 'OrganismTaxon', 'SequenceVariant']


In [48]:
# Print the relation types per split, separated by blank lines, to compare label sets across splits
print("BIORED relations in the train set:\n", biored_relations_train)
print()
print("BIORED relations in the val set:\n", biored_relations_val)
print()
print("BIORED relations in the test set:\n", biored_relations_test)

BIORED relations in the train set:
 ['Association', 'Bind', 'Comparison', 'Conversion', 'Cotreatment', 'Drug_Interaction', 'Negative_Correlation', 'Positive_Correlation']

BIORED relations in the val set:
 ['Association', 'Bind', 'Comparison', 'Cotreatment', 'Negative_Correlation', 'Positive_Correlation']

BIORED relations in the test set:
 ['Association', 'Bind', 'Comparison', 'Conversion', 'Cotreatment', 'Drug_Interaction', 'Negative_Correlation', 'Positive_Correlation']
