# BioCreative VI - ChemProt Dataset

This notebook explores the Chemical Protein Relation Extraction dataset from https://doi.org/10.1093/database/bav123.

**Note**: This dataset is sourced from HuggingFace (https://huggingface.co/bigbio) as the official source is no longer accessible.

In [2]:
from collections import Counter
from pprint import pprint

from datasets import load_dataset

Load each of the three dataset configurations and inspect its structure.

#### chemprot_full_source:

In [8]:
# Load the "chemprot_full_source" configuration of bigbio/chemprot and
# pretty-print the returned object to see its splits and features.
ds_fs = load_dataset("bigbio/chemprot", "chemprot_full_source")

pprint(ds_fs)

DatasetDict({
    sample: Dataset({
        features: ['pmid', 'text', 'entities', 'relations'],
        num_rows: 50
    })
    train: Dataset({
        features: ['pmid', 'text', 'entities', 'relations'],
        num_rows: 1020
    })
    test: Dataset({
        features: ['pmid', 'text', 'entities', 'relations'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['pmid', 'text', 'entities', 'relations'],
        num_rows: 612
    })
})


In [12]:
# Inspect the first record of the 'sample' split to see the record layout.
pprint(ds_fs['sample'][0])

{'entities': {'id': ['T1',
                     'T2',
                     'T3',
                     'T4',
                     'T5',
                     'T6',
                     'T7',
                     'T8',
                     'T9',
                     'T10',
                     'T11',
                     'T12',
                     'T13',
                     'T14',
                     'T15',
                     'T16',
                     'T17',
                     'T18',
                     'T19',
                     'T20',
                     'T21',
                     'T22',
                     'T23',
                     'T24',
                     'T25',
                     'T26',
                     'T27',
                     'T28',
                     'T29',
                     'T30',
                     'T31',
                     'T33',
                     'T34',
                     'T35',
                     'T36',
                     'T37',
 

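Verify that the entity offsets are accurate, i.e. that slicing the document text with each entity's offsets reproduces that entity's text.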

In [15]:
def verify_span(example):
    """Return True when every entity's offsets reproduce its text.

    Each entity text is paired positionally with its offsets; the pair
    passes when ``example['text'][start:end]`` equals the entity text.

    Args:
        example: Mapping with a 'text' string and an 'entities' mapping
            holding parallel 'text' and 'offsets' sequences, where each
            offsets item is indexable as (start, end).

    Returns:
        False if any entity text differs from its slice, True otherwise
        (including when there are no entities).
    """
    ents = example['entities']
    pairs = zip(ents['text'], ents['offsets'])
    return not any(
        example['text'][span[0]: span[1]] != surface
        for surface, span in pairs
    )

In [16]:
# Spot check: only the first record of the 'sample' split is verified here.
print(verify_span(ds_fs['sample'][0]))

True


Count how often each entity type and each relation type occurs in a split.

In [25]:
def type_statistics(ds):
    """Count entity and relation type labels over a split (source schema).

    Args:
        ds: Iterable of examples. Each example is indexed as
            ``example['entities']['type']`` and
            ``example['relations']['type']``, both iterables of hashable
            labels.

    Returns:
        dict with two keys, 'ent_stats' and 'rel_stats', each a plain dict
        mapping a label to the number of times it occurred.
    """
    ent_types = Counter()
    rel_types = Counter()
    for example in ds:
        # Counter.update with an iterable adds one per element.
        ent_types.update(example['entities']['type'])
        rel_types.update(example['relations']['type'])

    # Convert to plain dicts so pprint output has no Counter(...) wrapper.
    return {'ent_stats': dict(ent_types), 'rel_stats': dict(rel_types)}

In [28]:
# Entity/relation type counts for the 'train' split of ds_fs.
label_stats_train = type_statistics(ds_fs['train'])
pprint(label_stats_train)

{'ent_stats': {'CHEMICAL': 13017, 'GENE-N': 4387, 'GENE-Y': 8348},
 'rel_stats': {'CPR:0': 1,
               'CPR:1': 308,
               'CPR:10': 241,
               'CPR:2': 1652,
               'CPR:3': 777,
               'CPR:4': 2260,
               'CPR:5': 173,
               'CPR:6': 235,
               'CPR:7': 29,
               'CPR:8': 34,
               'CPR:9': 727}}


In [30]:
# Entity/relation type counts for the 'validation' split of ds_fs.
label_stats_val = type_statistics(ds_fs['validation'])
pprint(label_stats_val)

{'ent_stats': {'CHEMICAL': 8004, 'GENE-N': 2412, 'GENE-Y': 5151},
 'rel_stats': {'CPR:0': 2,
               'CPR:1': 153,
               'CPR:10': 175,
               'CPR:2': 780,
               'CPR:3': 552,
               'CPR:4': 1103,
               'CPR:5': 116,
               'CPR:6': 199,
               'CPR:7': 19,
               'CPR:8': 2,
               'CPR:9': 457}}


In [31]:
# Entity/relation type counts for the 'test' split of ds_fs.
label_stats_test = type_statistics(ds_fs['test'])
pprint(label_stats_test)

{'ent_stats': {'CHEMICAL': 10810, 'GENE-N': 3360, 'GENE-Y': 6658},
 'rel_stats': {'CPR:1': 215,
               'CPR:10': 267,
               'CPR:2': 1743,
               'CPR:3': 667,
               'CPR:4': 1667,
               'CPR:5': 198,
               'CPR:6': 293,
               'CPR:7': 25,
               'CPR:8': 25,
               'CPR:9': 644}}


#### chemprot_bigbio_kb:

In [18]:
# Load the "chemprot_bigbio_kb" configuration of bigbio/chemprot and
# pretty-print the returned object to see its splits and features.
ds_kb = load_dataset("bigbio/chemprot", "chemprot_bigbio_kb")

pprint(ds_kb)

DatasetDict({
    sample: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 50
    })
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 1020
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 612
    })
})


In [19]:
# Inspect the first record of the 'sample' split to see the record layout.
pprint(ds_kb['sample'][0])

{'coreferences': [],
 'document_id': '10471277',
 'entities': [{'id': '2',
               'normalized': [],
               'offsets': [[135, 145]],
               'text': ['Salmeterol'],
               'type': 'CHEMICAL'},
              {'id': '3',
               'normalized': [],
               'offsets': [[1248, 1259]],
               'text': ['[(125)I]IAS'],
               'type': 'CHEMICAL'},
              {'id': '4',
               'normalized': [],
               'offsets': [[1285, 1299]],
               'text': ['(-)-alprenolol'],
               'type': 'CHEMICAL'},
              {'id': '5',
               'normalized': [],
               'offsets': [[1329, 1332]],
               'text': ['GTP'],
               'type': 'CHEMICAL'},
              {'id': '6',
               'normalized': [],
               'offsets': [[1346, 1355]],
               'text': ['[125I]IAS'],
               'type': 'CHEMICAL'},
              {'id': '7',
               'normalized': [],
               'o

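Check the entity structure: for each split, find the maximum number of offset spans per entity, the maximum number of text strings per entity, and the maximum number of times the same entity id occurs within one document.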

In [40]:
def clarify_ent_info(split):
    """Summarise entity structure over a split (list-of-dicts entity schema).

    Args:
        split: Iterable of examples. ``example['entities']`` is an iterable
            of mappings, each with an 'id', an 'offsets' sequence and a
            'text' sequence.

    Returns:
        dict with three keys:
            'max_offsets': largest ``len(ent['offsets'])`` seen,
            'max_texts': largest ``len(ent['text'])`` seen,
            'max_count': largest number of times one entity id occurs
                within a single example.
        All three start at 1, so no reported value is ever below 1.
    """
    max_count, max_offsets, max_texts = 1, 1, 1
    for example in split:
        id_counts = Counter()
        for ent in example['entities']:
            id_counts[ent['id']] += 1
            max_offsets = max(max_offsets, len(ent['offsets']))
            max_texts = max(max_texts, len(ent['text']))

        # default=0: an example without entities must not raise ValueError
        # from max() on an empty sequence.
        max_count = max(max_count, max(id_counts.values(), default=0))

    return {'max_offsets': max_offsets, 'max_texts': max_texts, 'max_count': max_count}

In [41]:
# Print the entity-structure summary of each split, separated by blank lines.
for position, (heading, split_name) in enumerate((
    ("Train:", 'train'),
    ("Validation:", 'validation'),
    ("Test:", 'test'),
)):
    if position > 0:
        print()
    print(heading)
    pprint(clarify_ent_info(ds_kb[split_name]))

Train:
{'max_count': 1, 'max_offsets': 1, 'max_texts': 1}

Validation:
{'max_count': 1, 'max_offsets': 1, 'max_texts': 1}

Test:
{'max_count': 1, 'max_offsets': 1, 'max_texts': 1}


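In every split, each entity id occurs only once per document and each entity has exactly one offset span and one text string. There are therefore no repeated mentions for any entity, so mention-level end-to-end RE can be applied.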

In [42]:
def kb_type_statistics(ds):
    """Count entity and relation type labels over a split (list-of-dicts schema).

    Args:
        ds: Iterable of examples. ``example['entities']`` and
            ``example['relations']`` are iterables of mappings that each
            carry a hashable 'type' value.

    Returns:
        dict with two keys, 'ent_stats' and 'rel_stats', each a plain dict
        mapping a type label to the number of times it occurred.
    """
    ent_types = Counter()
    rel_types = Counter()
    for example in ds:
        ent_types.update(ent['type'] for ent in example['entities'])
        rel_types.update(rel['type'] for rel in example['relations'])

    # Convert to plain dicts so pprint output has no Counter(...) wrapper.
    return {'ent_stats': dict(ent_types), 'rel_stats': dict(rel_types)}

In [43]:
# Print entity/relation type counts for each split, each followed by a blank line.
for heading, split_name in (
    ("Train stats:", 'train'),
    ("Validation stats:", 'validation'),
    ("Test stats:", 'test'),
):
    print(heading)
    pprint(kb_type_statistics(ds_kb[split_name]))
    print()

Train stats:
{'ent_stats': {'CHEMICAL': 13017, 'GENE-N': 4387, 'GENE-Y': 8348},
 'rel_stats': {'Agonist': 173,
               'Antagonist': 235,
               'Cofactor': 34,
               'Downregulator': 2260,
               'Modulator': 29,
               'Not': 241,
               'Part_of': 308,
               'Regulator': 1652,
               'Substrate': 727,
               'Undefined': 1,
               'Upregulator': 777}}

Validation stats:
{'ent_stats': {'CHEMICAL': 8004, 'GENE-N': 2412, 'GENE-Y': 5151},
 'rel_stats': {'Agonist': 116,
               'Antagonist': 199,
               'Cofactor': 2,
               'Downregulator': 1103,
               'Modulator': 19,
               'Not': 175,
               'Part_of': 153,
               'Regulator': 780,
               'Substrate': 457,
               'Undefined': 2,
               'Upregulator': 552}}

Test stats:
{'ent_stats': {'CHEMICAL': 10810, 'GENE-N': 3360, 'GENE-Y': 6658},
 'rel_stats': {'Agonist': 198,
          

#### chemprot_shared_task_eval_source:

In [9]:
# Load the "chemprot_shared_task_eval_source" configuration of bigbio/chemprot
# and pretty-print the returned object to see its splits and features.
ds_ste = load_dataset("bigbio/chemprot", "chemprot_shared_task_eval_source")

pprint(ds_ste)

DatasetDict({
    sample: Dataset({
        features: ['pmid', 'text', 'entities', 'relations'],
        num_rows: 50
    })
    train: Dataset({
        features: ['pmid', 'text', 'entities', 'relations'],
        num_rows: 1020
    })
    test: Dataset({
        features: ['pmid', 'text', 'entities', 'relations'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['pmid', 'text', 'entities', 'relations'],
        num_rows: 612
    })
})


In [45]:
# Inspect the first record of the 'sample' split to see the record layout.
pprint(ds_ste['sample'][0])

{'entities': {'id': ['T1',
                     'T2',
                     'T3',
                     'T4',
                     'T5',
                     'T6',
                     'T7',
                     'T8',
                     'T9',
                     'T10',
                     'T11',
                     'T12',
                     'T13',
                     'T14',
                     'T15',
                     'T16',
                     'T17',
                     'T18',
                     'T19',
                     'T20',
                     'T21',
                     'T22',
                     'T23',
                     'T24',
                     'T25',
                     'T26',
                     'T27',
                     'T28',
                     'T29',
                     'T30',
                     'T31',
                     'T33',
                     'T34',
                     'T35',
                     'T36',
                     'T37',
 

In [46]:
# Print entity/relation type counts for each split, each followed by a blank line.
for heading, split_name in (
    ("Train stats:", 'train'),
    ("Validation stats:", 'validation'),
    ("Test stats:", 'test'),
):
    print(heading)
    pprint(type_statistics(ds_ste[split_name]))
    print()

Train stats:
{'ent_stats': {'CHEMICAL': 13017, 'GENE-N': 4387, 'GENE-Y': 8348},
 'rel_stats': {'CPR:3': 768,
               'CPR:4': 2254,
               'CPR:5': 173,
               'CPR:6': 235,
               'CPR:9': 727}}

Validation stats:
{'ent_stats': {'CHEMICAL': 8004, 'GENE-N': 2412, 'GENE-Y': 5151},
 'rel_stats': {'CPR:3': 550,
               'CPR:4': 1094,
               'CPR:5': 116,
               'CPR:6': 199,
               'CPR:9': 457}}

Test stats:
{'ent_stats': {'CHEMICAL': 10810, 'GENE-N': 3360, 'GENE-Y': 6658},
 'rel_stats': {'CPR:3': 665,
               'CPR:4': 1661,
               'CPR:5': 195,
               'CPR:6': 293,
               'CPR:9': 644}}



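This is a restricted version of the dataset that keeps only the relation groups CPR:3, CPR:4, CPR:5, CPR:6 and CPR:9. The omitted groups are not all rare: CPR:2, for example, has 1652 training relations in the full source. The entity counts are unchanged, while the CPR:3 and CPR:4 counts are slightly lower than in the full source (e.g. 768 vs. 777 and 2254 vs. 2260 in train).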