# DDI Corpus

This notebook explores the Drug-Drug Interaction Relation Extraction dataset from https://doi.org/10.1016/j.jbi.2013.07.011.

**Note**: This dataset is sourced from the authors' github (https://github.com/isegura/DDICorpus) as the official source is no longer accessible.

In [8]:
import os
import xml.etree.ElementTree as ET

from pprint import pprint

Load and check out the dataset structure.

In [9]:
def parse_ddi(directory_path):
    """
    Parses DDI Corpus XML files into a structured format for NER and RE.
    """
    data_samples = []

    # Walk through all XML files in the directory
    for root_dir, _, files in os.walk(directory_path):
        for file in files:
            if file.endswith(".xml"):
                file_path = os.path.join(root_dir, file)
                tree = ET.parse(file_path)
                root = tree.getroot()
                
                doc_id = root.get('id')

                # Iterate through sentences
                for sent in root.findall('sentence'):
                    sent_id = sent.get('id')
                    text = sent.get('text')
                    
                    # Extract Entities (NER)
                    entities = []
                    for ent in sent.findall('entity'):
                        e_id = ent.get('id')
                        e_type = ent.get('type')
                        e_text = ent.get('text')
                        offsets = ent.get('charOffset').split(';')
                        for i, offset in enumerate(offsets):
                            start, end = tuple(offset.split('-'))
                            offset = [int(start), int(end)]
                            offsets[i] = offset
                            
                        entities.append({'id': e_id, 'text': e_text, 'type': e_type, 'offsets': offsets})

                    # Extract Pairs (RE)
                    relations = []
                    for pair in sent.findall('pair'):
                        e1_id = pair.get('e1')
                        e2_id = pair.get('e2')
                        ddi = pair.get('ddi')
                        relation = {'e1_id': e1_id, 'e2_id': e2_id, 'ddi': ddi}
                        if ddi == 'true':
                            rel_type = pair.get('type')
                            relation['type'] = rel_type
                        
                        relations.append(relation)

                    # Store parsed sample
                    data_samples.append({
                        'doc_id': doc_id,
                        'sent_id': sent_id,
                        'text': text,
                        'entities': entities,
                        'relations': relations
                    })
    
    return data_samples

In [12]:
ds_train = parse_ddi("/home/bt19d200/Ayaan/raw-datasets/DDICorpus/Train")
ds_test = parse_ddi("/home/bt19d200/Ayaan/raw-datasets/DDICorpus/Test/Test for DDI Extraction task")

In [13]:
print("Number of train samples:", len(ds_train))
print("Number of test samples:", len(ds_test))

Number of train samples: 6976
Number of test samples: 1299


In [14]:
pprint(ds_train[10])

{'doc_id': 'DDI-MedLine.d62',
 'entities': [{'id': 'DDI-MedLine.d62.s4.e0',
               'offsets': [[46, 49]],
               'text': 'METH',
               'type': 'drug'},
              {'id': 'DDI-MedLine.d62.s4.e1',
               'offsets': [[234, 242]],
               'text': 'SCH-23390',
               'type': 'drug_n'},
              {'id': 'DDI-MedLine.d62.s4.e2',
               'offsets': [[258, 277]],
               'text': 'atypical neuroleptic',
               'type': 'group'},
              {'id': 'DDI-MedLine.d62.s4.e3',
               'offsets': [[279, 287]],
               'text': 'clozapine',
               'type': 'drug'}],
 'relations': [{'ddi': 'true',
                'e1_id': 'DDI-MedLine.d62.s4.e0',
                'e2_id': 'DDI-MedLine.d62.s4.e1',
                'type': 'effect'},
               {'ddi': 'false',
                'e1_id': 'DDI-MedLine.d62.s4.e0',
                'e2_id': 'DDI-MedLine.d62.s4.e2'},
               {'ddi': 'false',
               

Verify if entity offsets are exclusive or inclusive.

In [15]:
def verify_offset_format(example):
    ent = example['entities'][0]
    offset = ent['offsets'][0]
    if example['text'][offset[0]: offset[1] + 1] == ent['text']:
        return 'Inclusive end'
    
    return 'Exclusive end'

In [16]:
print(verify_offset_format(ds_train[10]))

Inclusive end


Get number of entity repeats (mentions, offsets and textual variations).

In [None]:
def ent_repeats(dataset):
    max_count, max_offsets, max_texts = 0, 0, 1
    for example in dataset:
        ent_count = {}
        for ent in example['entities']:
            ent_count[ent['id']] = ent_count[ent['id']] + 1 if ent['id'] in ent_count else 1
            if len(ent['offsets']) > max_offsets:
                max_offsets = len(ent['offsets'])
            
            max_texts_ex = ent['text'].count(';') # offsets were separated by semi-colon, so texts might be as well
            if max_texts_ex > max_texts:
                max_texts = max_texts_ex
            
        max_count_ex = max(ent_count.values()) if ent_count else 0
        if max_count_ex > max_count:
            max_count = max_count_ex
    
    return {'max_offsets': max_offsets, 'max_texts': max_texts, 'max_count': max_count}

In [None]:
print("Entity repeats:")
print("Train:")
pprint(ent_repeats(ds_train))
print()
print("Test:")
pprint(ent_repeats(ds_test))

Entity quirks:
Train:
{'max_count': 1, 'max_offsets': 2, 'max_texts': 1}

Test:
{'max_count': 1, 'max_offsets': 3, 'max_texts': 1}


This shows that mention-level end-to-end RE cannot be applied, as there are repeated mentions for some entities in each text.

In [19]:
def type_statistics(ds):
    ent_types = {}
    rel_types = {}
    for example in ds:
        for ent in example['entities']:
            ent_types[ent['type']] = ent_types[ent['type']] + 1 if ent['type'] in ent_types else 1
        
        for rel in example['relations']:
            if rel['ddi'] == 'true':
                rel_types[rel['type']] = rel_types[rel['type']] + 1 if rel['type'] in rel_types else 1
    
    return {'ent_stats': ent_types, 'rel_stats': rel_types}

In [20]:
print("Train stats:")
pprint(type_statistics(ds_train))
print()

print("Test stats:")
pprint(type_statistics(ds_test))

Train stats:
{'ent_stats': {'brand': 1437, 'drug': 9425, 'drug_n': 504, 'group': 3399},
 'rel_stats': {None: 1,
               'advise': 826,
               'effect': 1687,
               'int': 188,
               'mechanism': 1319}}

Test stats:
{'ent_stats': {'brand': 369, 'drug': 1864, 'drug_n': 140, 'group': 667},
 'rel_stats': {'advise': 221, 'effect': 360, 'int': 96, 'mechanism': 302}}
