# Datasets

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import sys

from tqdm.auto import tqdm
from datasets import load_dataset
from pprint import pprint

In [None]:
sys.path.append("../")

In [40]:
from src.process_data import process_bc5cdr, process_biored, process_bigbio, get_types

In [4]:
splits = ['train', 'val', 'test']

## ChemProt

In [5]:
chemprot_dataset = load_dataset("bigbio/chemprot", "chemprot_bigbio_kb")

pprint(chemprot_dataset)

DatasetDict({
    sample: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 50
    })
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 1020
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 612
    })
})


In [6]:
chemprot_sample = chemprot_dataset['sample'][0]

pprint(chemprot_sample)

{'coreferences': [],
 'document_id': '10471277',
 'entities': [{'id': '2',
               'normalized': [],
               'offsets': [[135, 145]],
               'text': ['Salmeterol'],
               'type': 'CHEMICAL'},
              {'id': '3',
               'normalized': [],
               'offsets': [[1248, 1259]],
               'text': ['[(125)I]IAS'],
               'type': 'CHEMICAL'},
              {'id': '4',
               'normalized': [],
               'offsets': [[1285, 1299]],
               'text': ['(-)-alprenolol'],
               'type': 'CHEMICAL'},
              {'id': '5',
               'normalized': [],
               'offsets': [[1329, 1332]],
               'text': ['GTP'],
               'type': 'CHEMICAL'},
              {'id': '6',
               'normalized': [],
               'offsets': [[1346, 1355]],
               'text': ['[125I]IAS'],
               'type': 'CHEMICAL'},
              {'id': '7',
               'normalized': [],
               'o

In [36]:
chemprot_dicts = [chemprot_dataset[split] for split in ['train', 'validation', 'test']]

process_bigbio(chemprot_dicts, "ChemProt", splits)

Processing ChemProt dataset ...



Processing train:   0%|          | 0/1020 [00:00<?, ?it/s]

Saved processed train split to ../data/chemprot/train.jsonl.



Processing val:   0%|          | 0/612 [00:00<?, ?it/s]

Saved processed val split to ../data/chemprot/val.jsonl.



Processing test:   0%|          | 0/800 [00:00<?, ?it/s]

Saved processed test split to ../data/chemprot/test.jsonl.

Finished processing ChemProt dataset.


In [41]:
chemprot_types = get_types("ChemProt", splits)

Getting entity and relation types for ChemProt dataset ...



Getting train entity and relation types:   0%|          | 0/1020 [00:00<?, ?it/s]

Getting val entity and relation types:   0%|          | 0/612 [00:00<?, ?it/s]

Getting test entity and relation types:   0%|          | 0/800 [00:00<?, ?it/s]

Saved entity and relation types to ../data/chemprot/types.json.



In [42]:
pprint(chemprot_types)

{'entity_types': ['GENE-N', 'GENE-Y', 'CHEMICAL'],
 'relation_types': ['Not',
                    'Downregulator',
                    'Upregulator',
                    'Part_of',
                    'Agonist',
                    'Modulator',
                    'Cofactor',
                    'Antagonist',
                    'Substrate',
                    'Regulator',
                    'Undefined']}


## DDI dataset

In [10]:
ddi_dataset = load_dataset("bigbio/ddi_corpus", "ddi_corpus_bigbio_kb")

pprint(ddi_dataset)

Using the latest cached version of the dataset since bigbio/ddi_corpus couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'ddi_corpus_bigbio_kb' at /home/bt19d200/.cache/huggingface/datasets/bigbio___ddi_corpus/ddi_corpus_bigbio_kb/1.0.0/da8e94986a0c689095b22bed134248b11f9311c7 (last modified on Tue Jul 22 00:18:23 2025).


DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 714
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 303
    })
})


In [11]:
ddi_sample = ddi_dataset['train'][0]

pprint(ddi_sample)

{'coreferences': [],
 'document_id': '15825391',
 'entities': [{'id': '15825391_T1',
               'normalized': [],
               'offsets': [[46, 60]],
               'text': ['amphotericin B'],
               'type': 'DRUG'},
              {'id': '15825391_T2',
               'normalized': [],
               'offsets': [[75, 82]],
               'text': ['filipin'],
               'type': 'DRUG_N'},
              {'id': '15825391_T3',
               'normalized': [],
               'offsets': [[111, 130]],
               'text': ['polyene antibiotics'],
               'type': 'GROUP'},
              {'id': '15825391_T4',
               'normalized': [],
               'offsets': [[143, 150]],
               'text': ['filipin'],
               'type': 'DRUG_N'},
              {'id': '15825391_T5',
               'normalized': [],
               'offsets': [[202, 216]],
               'text': ['amphotericin B'],
               'type': 'DRUG'},
              {'id': '15825391_T6',
   

In [50]:
ddi_dicts = [ddi_dataset[split] for split in ['train', 'test']]

process_bigbio(ddi_dicts, "ChemProt", [splits[i] for i in [0, 2]]) # only train and test splits

Processing ChemProt dataset ...



Processing train:   0%|          | 0/714 [00:00<?, ?it/s]

Saved processed train split to ../data/chemprot/train.jsonl.



Processing test:   0%|          | 0/303 [00:00<?, ?it/s]

Saved processed test split to ../data/chemprot/test.jsonl.

Finished processing ChemProt dataset.


In [51]:
ddi_types = get_types("DDI", [splits[i] for i in [0, 2]])

Getting entity and relation types for DDI dataset ...



Getting train entity and relation types:   0%|          | 0/714 [00:00<?, ?it/s]

Getting test entity and relation types:   0%|          | 0/303 [00:00<?, ?it/s]

Saved entity and relation types to ../data/ddi/types.json.



In [52]:
pprint(ddi_types)

{'entity_types': ['DRUG_N', 'DRUG', 'GROUP', 'BRAND'],
 'relation_types': ['MECHANISM', 'EFFECT', 'INT', 'ADVISE']}


## BC5CDR

In [48]:
bc5cdr_train_path = "/home/bt19d200/Ayaan/Datasets/BC5CDR/CDR_Data/CDR.Corpus.v010516/CDR_TrainingSet.BioC.xml"
bc5cdr_val_path = "/home/bt19d200/Ayaan/Datasets/BC5CDR/CDR_Data/CDR.Corpus.v010516/CDR_DevelopmentSet.BioC.xml"
bc5cdr_test_path = "/home/bt19d200/Ayaan/Datasets/BC5CDR/CDR_Data/CDR.Corpus.v010516/CDR_TestSet.BioC.xml"

bc5cdr_paths = [bc5cdr_train_path, bc5cdr_val_path, bc5cdr_test_path]

In [49]:
process_bc5cdr(bc5cdr_paths, splits=splits)

Processing BC5CDR dataset ...



Processing train:   0%|          | 0/500 [00:00<?, ?it/s]

Saved processed train split to ../data/bc5cdr/train.jsonl.



Processing val:   0%|          | 0/500 [00:00<?, ?it/s]

Saved processed val split to ../data/bc5cdr/val.jsonl.



Processing test:   0%|          | 0/500 [00:00<?, ?it/s]

Saved processed test split to ../data/bc5cdr/test.jsonl.

Finished processing BC5CDR dataset.


In [53]:
bc5cdr_types = get_types("BC5CDR", splits)

Getting entity and relation types for BC5CDR dataset ...



Getting train entity and relation types:   0%|          | 0/500 [00:00<?, ?it/s]

Getting val entity and relation types:   0%|          | 0/500 [00:00<?, ?it/s]

Getting test entity and relation types:   0%|          | 0/500 [00:00<?, ?it/s]

Saved entity and relation types to ../data/bc5cdr/types.json.



In [54]:
pprint(bc5cdr_types)

{'entity_types': ['Disease', 'Chemical'], 'relation_types': ['CID']}


## BioRED

In [25]:
biored_train_path = "/home/bt19d200/Ayaan/Datasets/BIORED/Train.BioC.JSON"
biored_val_path = "/home/bt19d200/Ayaan/Datasets/BIORED/Dev.BioC.JSON"
biored_test_path = "/home/bt19d200/Ayaan/Datasets/BIORED/Test.BioC.JSON"

biored_paths = [biored_train_path, biored_val_path, biored_test_path]

In [55]:
process_biored(biored_paths, splits)

Processing BioRED dataset ...



Processing train:   0%|          | 0/400 [00:00<?, ?it/s]

Saved processed train split to ../data/biored/train.jsonl.



Processing val:   0%|          | 0/100 [00:00<?, ?it/s]

Saved processed val split to ../data/biored/val.jsonl.



Processing test:   0%|          | 0/100 [00:00<?, ?it/s]

Saved processed test split to ../data/biored/test.jsonl.

Finished processing BioRED dataset.


In [56]:
biored_types = get_types("BioRED", splits)

Getting entity and relation types for BioRED dataset ...



Getting train entity and relation types:   0%|          | 0/400 [00:00<?, ?it/s]

Getting val entity and relation types:   0%|          | 0/100 [00:00<?, ?it/s]

Getting test entity and relation types:   0%|          | 0/100 [00:00<?, ?it/s]

Saved entity and relation types to ../data/biored/types.json.



In [57]:
pprint(biored_types)

{'entity_types': ['ChemicalEntity',
                  'DiseaseOrPhenotypicFeature',
                  'SequenceVariant',
                  'GeneOrGeneProduct',
                  'OrganismTaxon',
                  'CellLine'],
 'relation_types': ['Comparison',
                    'Negative_Correlation',
                    'Bind',
                    'Cotreatment',
                    'Conversion',
                    'Drug_Interaction',
                    'Positive_Correlation',
                    'Association']}
