# DrugProt Dataset

This notebook explores the Drug-Protein Relation Extraction dataset from https://doi.org/10.1093/database/baad080.

**Note**: This dataset is sourced from https://doi.org/10.5281/zenodo.4955410, the official source linked in the paper.

In [10]:
import os
import sys
import csv
import glob

from pprint import pprint

Load and check out the dataset structure.

In [21]:
# Lift the csv module's per-field size cap as far as the platform allows so
# that very long text fields are not rejected by the parser.
# sys.maxsize does not fit the C long the csv module uses on some platforms
# (e.g. 64-bit Windows) and raises OverflowError there, so halve the value
# until it is accepted.
_csv_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_csv_field_limit)
        break
    except OverflowError:
        _csv_field_limit //= 2

def _first_match(directory, *patterns):
    """Return the first file in `directory` matching any glob pattern (tried in order), or None."""
    for pattern in patterns:
        # Sort so the choice is deterministic if several files match.
        matches = sorted(glob.glob(os.path.join(directory, pattern)))
        if matches:
            return matches[0]
    return None


def _iter_tsv_rows(path):
    """Yield the non-empty rows of a tab-separated file, keeping quote characters literal."""
    # newline='' is what the csv module asks for when reading files.
    # QUOTE_NONE stops a field that happens to start with '"' from being parsed
    # as a quoted field, which would drop the quote characters and leave the
    # text out of step with the entity offsets.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE):
            if row:  # skip blank lines instead of failing on row[0]
                yield row


def load_dataset(split_directory):
    """
    Parses DrugProt data from a split directory (train/dev/test).
    Automatically locates abstracts, entities, and relations files based on suffixes.

    Each document's text is the title and the abstract joined by a single space.

    Args:
        split_directory (str): Path to the folder containing the TSV files.

    Returns:
        list: A list of document dictionaries with the keys 'doc_id', 'text',
            'entities' (dicts with 'id', 'type', 'offset', 'text') and
            'relations' (dicts with 'type', 'arg1', 'arg2'), in the order the
            abstracts appear in the file.

    Raises:
        FileNotFoundError: If no abstracts or no entities file is found.
        KeyError: If an entity or relation row names a doc_id that is not in
            the abstracts file.
    """

    # Auto-locate files using glob.
    # Some abstracts files carry the misspelt suffix "_abstracs", so that
    # spelling is tried first and the correct one is the fallback.
    abstracts_path = _first_match(split_directory, "*_abstracs.tsv", "*_abstracts.tsv")
    entities_path = _first_match(split_directory, "*_entities.tsv")
    # Relations are optional (they do not exist for the test set).
    relations_path = _first_match(split_directory, "*_relations.tsv")

    # There should be exactly one set of files per directory; fail with a
    # clear message when a mandatory file is missing.
    if abstracts_path is None:
        raise FileNotFoundError(
            f"No '*_abstracs.tsv' or '*_abstracts.tsv' file found in {split_directory!r}"
        )
    if entities_path is None:
        raise FileNotFoundError(f"No '*_entities.tsv' file found in {split_directory!r}")

    print(f"Found files:\n  - Abstracts: {os.path.basename(abstracts_path)}\n  - Entities:  {os.path.basename(entities_path)}")
    if relations_path:
        print(f"  - Relations: {os.path.basename(relations_path)}")
    else:
        print("  - Relations: None found (skipping relations)")

    docs = {}
    # Load Abstracts
    for row in _iter_tsv_rows(abstracts_path):
        doc_id, title, abstract = row[0], row[1], row[2]
        docs[doc_id] = {
            "doc_id": doc_id,
            "text": f"{title} {abstract}",
            "entities": [],
            "relations": []
        }
    print(f"Extracted {len(docs)} abstracts")

    # Load Entities
    for row in _iter_tsv_rows(entities_path):
        doc_id, ent_id, ent_type, start, end, text = row
        docs[doc_id]["entities"].append({
            "id": ent_id,
            "type": ent_type,
            "offset": [int(start), int(end)],
            "text": text
        })

    # Load Relations
    if relations_path:
        for row in _iter_tsv_rows(relations_path):
            doc_id, rel_type, arg1_full, arg2_full = row
            # Arguments look like "<role>:<entity id>"; keep only the entity id.
            arg1_id = arg1_full.split(':')[1]
            arg2_id = arg2_full.split(':')[1]
            docs[doc_id]["relations"].append({
                "type": rel_type,
                "arg1": arg1_id,
                "arg2": arg2_id
            })

    return list(docs.values())

In [22]:
# Root folder of the extracted DrugProt release. Set the DRUGPROT_DIR
# environment variable to point somewhere else without editing the notebook;
# the default keeps the original location.
DRUGPROT_DIR = os.environ.get("DRUGPROT_DIR", "/home/bt19d200/Ayaan/raw-datasets/DrugProt")

# One sub-folder per split; the test-background split ships without relations.
ds_train = load_dataset(os.path.join(DRUGPROT_DIR, "training"))
ds_dev = load_dataset(os.path.join(DRUGPROT_DIR, "development"))
ds_test = load_dataset(os.path.join(DRUGPROT_DIR, "test-background"))

Found files:
  - Abstracts: drugprot_training_abstracs.tsv
  - Entities:  drugprot_training_entities.tsv
  - Relations: drugprot_training_relations.tsv
Extracted 3500 abstracts
Found files:
  - Abstracts: drugprot_development_abstracs.tsv
  - Entities:  drugprot_development_entities.tsv
  - Relations: drugprot_development_relations.tsv
Extracted 750 abstracts
Found files:
  - Abstracts: test_background_abstracts.tsv
  - Entities:  test_background_entities.tsv
  - Relations: None found (skipping relations)
Extracted 10750 abstracts


Check out the first example from the train set.

In [20]:
# Take the first training document as a running example and pretty-print it
# to see the per-document record layout.
example = ds_train[0]
pprint(example)

{'doc_id': '17512723',
 'entities': [{'id': 'T1',
               'offset': [466, 480],
               'text': 'androstanediol',
               'type': 'CHEMICAL'},
              {'id': 'T2',
               'offset': [115, 122],
               'text': 'retinol',
               'type': 'CHEMICAL'},
              {'id': 'T3',
               'offset': [9, 16],
               'text': 'retinol',
               'type': 'CHEMICAL'},
              {'id': 'T4',
               'offset': [219, 230],
               'text': 'human RDH13',
               'type': 'GENE-Y'},
              {'id': 'T5',
               'offset': [232, 237],
               'text': 'RDH12',
               'type': 'GENE-Y'},
              {'id': 'T6',
               'offset': [326, 338],
               'text': 'murine Rdh12',
               'type': 'GENE-Y'},
              {'id': 'T7',
               'offset': [343, 354],
               'text': 'human RDH13',
               'type': 'GENE-Y'},
              {'id': 'T8',
     

Verify whether the entity offsets account for the single space inserted between the two passages (title and abstract) when they are joined.

In [23]:
def verify_offset(example):
    """
    Checks that the first entity's offsets index its own surface text.

    Args:
        example (dict): A document with a 'text' string and an 'entities'
            list whose items carry 'offset' ([start, end]) and 'text'.

    Returns:
        bool: True if slicing the document text with the first entity's
            offsets gives exactly that entity's text, False otherwise.
    """
    first_entity = example['entities'][0]
    span = first_entity['offset']
    covered = example['text'][span[0]:span[1]]
    return covered == first_entity['text']

In [25]:
# Run the offset check on the example document and print the boolean result.
print(verify_offset(example))

True


Get entity repeats (mentions).

In [28]:
def max_ent_counts(dataset):
    """
    Finds how often the most repeated entity ID occurs within a single document.

    Args:
        dataset (list): Documents, each with an 'entities' list of dicts
            that carry an 'id'.

    Returns:
        dict: {'max_repeats': n}, where n is the highest per-document count
            of any one entity ID (0 if there are no entities at all).
    """
    highest = 0
    for doc in dataset:
        # Tally how many times each entity ID shows up in this document.
        tally = {}
        for mention in doc['entities']:
            key = mention['id']
            tally[key] = tally.get(key, 0) + 1

        highest = max(highest, max(tally.values(), default=0))

    return {'max_repeats': highest}

In [29]:
# Report the highest per-document entity-ID repeat count for each split,
# with a blank line between the splits.
print("Entity counts:")
for position, (heading, split_docs) in enumerate([("Train:", ds_train), ("Test:", ds_test)]):
    if position > 0:
        print()
    print(heading)
    pprint(max_ent_counts(split_docs))

Entity counts:
Train:
{'max_repeats': 1}

Test:
{'max_repeats': 1}


This shows that mention-level end-to-end RE can be applied, as there are no repeated mentions of any entities for any example in the dataset.

**Note:** If you look at the text and type alone, there definitely are repeated mentions. But I'm assuming the entity IDs given in each relation correspond to the specific mentions that take part in that relationship.

In [30]:
def type_statistics(ds):
    """
    Counts how many entities and relations of each type a dataset contains.

    Args:
        ds (list): Documents, each with an 'entities' list and a 'relations'
            list whose items carry a 'type'.

    Returns:
        dict: {'ent_stats': {entity type: count}, 'rel_stats': {relation type: count}}.
    """
    entity_tally, relation_tally = {}, {}
    for doc in ds:
        for mention in doc['entities']:
            label = mention['type']
            entity_tally[label] = entity_tally.get(label, 0) + 1

        for link in doc['relations']:
            label = link['type']
            relation_tally[label] = relation_tally.get(label, 0) + 1

    return {'ent_stats': entity_tally, 'rel_stats': relation_tally}

In [32]:
# Print the entity/relation type counts for each split, with a blank line
# between the splits.
split_reports = [
    ("Train stats:", ds_train),
    ("Development stats:", ds_dev),
    ("Test stats:", ds_test),
]
for position, (heading, split_docs) in enumerate(split_reports):
    if position > 0:
        print()
    print(heading)
    pprint(type_statistics(split_docs))

Train stats:
{'ent_stats': {'CHEMICAL': 46274, 'GENE-N': 14834, 'GENE-Y': 28421},
 'rel_stats': {'ACTIVATOR': 1428,
               'AGONIST': 658,
               'AGONIST-ACTIVATOR': 29,
               'AGONIST-INHIBITOR': 13,
               'ANTAGONIST': 972,
               'DIRECT-REGULATOR': 2247,
               'INDIRECT-DOWNREGULATOR': 1329,
               'INDIRECT-UPREGULATOR': 1378,
               'INHIBITOR': 5388,
               'PART-OF': 885,
               'PRODUCT-OF': 920,
               'SUBSTRATE': 2003,
               'SUBSTRATE_PRODUCT-OF': 24}}

Development stats:
{'ent_stats': {'CHEMICAL': 9853, 'GENE': 9005},
 'rel_stats': {'ACTIVATOR': 246,
               'AGONIST': 131,
               'AGONIST-ACTIVATOR': 10,
               'AGONIST-INHIBITOR': 2,
               'ANTAGONIST': 218,
               'DIRECT-REGULATOR': 458,
               'INDIRECT-DOWNREGULATOR': 332,
               'INDIRECT-UPREGULATOR': 302,
               'INHIBITOR': 1150,
               'PART