# GLiREL RE Results on Bio-Domain Datasets

In [1]:
from datasets import load_dataset
from glirel import GLiREL
from pprint import pprint
from data_processing.common import run_inference
import spacy
import torch

Load the spaCy pipeline (used as the tokenizer) and the pretrained GLiREL model:

In [2]:
# Small English spaCy pipeline; it is handed to the ChemProt pre-processing
# helper further down when the GLiREL inputs are built.
nlp = spacy.load(
    "en_core_web_sm",
)

# Pretrained GLiREL checkpoint. `use_fast=False` is forwarded to
# `from_pretrained` (presumably selects the slow tokenizer — TODO confirm).
model = GLiREL.from_pretrained(
    "jackboyla/glirel-large-v0",
    use_fast=False,
)



Check which device the model is on, and move it to the GPU if one is available:

In [3]:
# Report the device the model reports before any attempt to move it.
current_device = model.device
print(f"Model device: {current_device}")

Model device: cpu


In [4]:
# Only attempt the move when PyTorch can see a CUDA device; otherwise the
# model is left where it is and nothing is printed.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    model.to('cuda')
    # Re-read the device so the message confirms the move took effect.
    print(f"Model moved to GPU. New device: {model.device}")

Model moved to GPU. New device: cuda


## ChemProt

In [5]:
from data_processing.chemprot import get_chemprot_labels, create_chemprot_input, evaluate_chemprot, evaluate_chemprot_by_entity_text

### Data Pre-processing

Load the dataset:

In [6]:
# Fetch the test split of the ChemProt corpus in the BigBio knowledge-base
# configuration.
chemprot_data = load_dataset(
    "bigbio/chemprot",
    "chemprot_bigbio_kb",
    split="test",
)

# Printing the dataset object summarises its features and row count.
print(chemprot_data)

Dataset({
    features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
    num_rows: 800
})


Explore dataset format:

In [7]:
# Pretty-print the main fields of the first record, one heading per field.
print("First data point:")
field_headings = (
    ("Passages:", "passages"),
    ("\nEntities:", "entities"),
    ("\nRelations:", "relations"),
    ("\nCoreferences:", "coreferences"),
)
for heading, field in field_headings:
    print(heading)
    pprint(chemprot_data[0][field])

First data point:
Passages:
[{'id': '1',
  'offsets': [[0, 2675]],
  'text': ['Androgen antagonistic effect of estramustine phosphate (EMP) '
           'metabolites on wild-type and mutated androgen receptor.\n'
           'Estramustine phosphate is used frequently, alone or in combination '
           'with other antitumor agents, for the treatment of '
           'hormone-refractory prostate cancer. Estramustine phosphate is '
           'metabolically activated in vivo, and its metabolites, '
           'estramustine, estromustine, estrone, and beta-estradiol inhibit '
           'the assembly of microtubules [for review see: Kreis W, In: '
           'Concepts, Mechanisms, and New Targets for Chemotherapy (Ed. Muggia '
           'FM), pp. 163-184. Kluwer Academic Publishers, Boston, 1995]. We '
           'investigated, by displacement of [3H]methyltrienolone in the '
           'presence of 2.5 mM of triamcinolone acetonide, the binding of '
           'estramustine phosphate an

Obtain all relation labels used in the dataset (to be used as input to the model):

In [8]:
# Take the labels returned by `get_chemprot_labels`, drop duplicates and sort
# them so the label list passed to the model has a deterministic order.
chemprot_labels = sorted(set(get_chemprot_labels(chemprot_data)))

print("No. of relation labels", len(chemprot_labels))
print("Relation types:", chemprot_labels)

No. of relaton labels 10
Relation types: ['Agonist', 'Antagonist', 'Cofactor', 'Downregulator', 'Modulator', 'Not', 'Part_of', 'Regulator', 'Substrate', 'Upregulator']


Convert input data into the format used by GLiREL:

In [9]:
# Convert each dataset record with `create_chemprot_input`, passing the spaCy
# pipeline along; results keep the dataset's order.
chemprot_input = []
for example in chemprot_data:
    chemprot_input.append(create_chemprot_input(example, nlp))

print(f"Number of examples: {len(chemprot_input)}")

Number of examples: 800


### Example Testing

In [10]:
# Pull out the first converted example and the two fields that are fed to
# the model in the next cell.
chemprot_example = chemprot_input[0]
chemprot_tokens, chemprot_ner = chemprot_example["tokens"], chemprot_example["ner"]

# Show the model inputs alongside the gold relations for comparison.
for caption, value in (
    ("Tokens:", chemprot_tokens),
    ("NER:", chemprot_ner),
    ("Gold labels:", chemprot_example["gold_relations"]),
):
    print(caption, value)

Tokens: ['Androgen', 'antagonistic', 'effect', 'of', 'estramustine', 'phosphate', '(', 'EMP', ')', 'metabolites', 'on', 'wild', '-', 'type', 'and', 'mutated', 'androgen', 'receptor', '.', '\n', 'Estramustine', 'phosphate', 'is', 'used', 'frequently', ',', 'alone', 'or', 'in', 'combination', 'with', 'other', 'antitumor', 'agents', ',', 'for', 'the', 'treatment', 'of', 'hormone', '-', 'refractory', 'prostate', 'cancer', '.', 'Estramustine', 'phosphate', 'is', 'metabolically', 'activated', 'in', 'vivo', ',', 'and', 'its', 'metabolites', ',', 'estramustine', ',', 'estromustine', ',', 'estrone', ',', 'and', 'beta', '-', 'estradiol', 'inhibit', 'the', 'assembly', 'of', 'microtubules', '[', 'for', 'review', 'see', ':', 'Kreis', 'W', ',', 'In', ':', 'Concepts', ',', 'Mechanisms', ',', 'and', 'New', 'Targets', 'for', 'Chemotherapy', '(', 'Ed', '.', 'Muggia', 'FM', ')', ',', 'pp', '.', '163', '-', '184', '.', 'Kluwer', 'Academic', 'Publishers', ',', 'Boston', ',', '1995', ']', '.', 'We', 'invest

In [12]:
# Predict relations for the single example, with a 0.0 threshold and top_k set
# to the number of candidate labels.
chemprot_relations = model.predict_relations(
    chemprot_tokens,
    chemprot_labels,
    threshold=0.0,
    ner=chemprot_ner,
    top_k=len(chemprot_labels),
)

# Rank a copy of the predictions from highest to lowest score; the original
# prediction list is left untouched.
chemprot_sorted_rels = list(chemprot_relations)
chemprot_sorted_rels.sort(key=lambda rel: rel['score'], reverse=True)

print("\nDescending Order by Score:")
for item in chemprot_sorted_rels:
    print(f"{item['head_text']} --> {item['label']} --> {item['tail_text']} | score: {item['score']}")


Descending Order by Score:
['estramustine'] --> Antagonist --> ['Androgen'] | score: 0.3457149267196655
['estromustine'] --> Antagonist --> ['EMP'] | score: 0.3340665102005005
['estramustine'] --> Antagonist --> ['EMP'] | score: 0.3289058804512024
['estromustine'] --> Antagonist --> ['Androgen'] | score: 0.3233489394187927
['Androgen'] --> Antagonist --> ['estramustine'] | score: 0.3195524513721466
['EMP'] --> Antagonist --> ['estromustine'] | score: 0.31803280115127563
['EMP'] --> Antagonist --> ['estramustine'] | score: 0.3079228699207306
['estramustine'] --> Downregulator --> ['PSA'] | score: 0.3051915764808655
['estromustine'] --> Antagonist --> ['estramustine'] | score: 0.3033345639705658
['estromustine'] --> Antagonist --> ['estramustine'] | score: 0.3014456033706665
['Androgen'] --> Antagonist --> ['estromustine'] | score: 0.29957762360572815
['estramustine', 'phosphate'] --> Antagonist --> ['estromustine'] | score: 0.29866647720336914
['estromustine'] --> Antagonist --> ['estr

### Model Inference

Filter the input data to exclude examples with more than 512 tokens (the DeBERTa backbone is limited to an input sequence length of 512):

In [13]:
# Maximum number of tokens an example may have to be kept (the 512-token
# backbone limit mentioned in the text above).
MAX_INPUT_TOKENS = 512

# The loop variable is named `example` rather than `input` so that the builtin
# `input()` is not shadowed inside the comprehension.
chemprot_input_filtered = [
    example
    for example in chemprot_input
    if len(example["tokens"]) <= MAX_INPUT_TOKENS
]

print(f"Number of examples after filtering: {len(chemprot_input_filtered)}")

Number of examples after filtering: 783


Predict relations (with a maximum of 10 relations allowed between an entity pair):

In [14]:
# Run the model over every length-filtered example, using the same 0.0
# threshold and top_k (= number of labels) as the single-example test.
chemprot_predictions = run_inference(
    model,
    chemprot_input_filtered,
    chemprot_labels,
    threshold=0.0,
    top_k=len(chemprot_labels),
)

Running inference on device: cuda


### Model Performance

#### Evaluation Using Entity Spans

In [21]:
# Span-based evaluation of the predictions against the filtered inputs.
# threshold=0.5 is handed to the evaluator (presumably a minimum prediction
# score — TODO confirm against data_processing.chemprot).
chemprot_results = evaluate_chemprot(
    chemprot_input_filtered,
    chemprot_predictions,
    threshold=0.5,
)

print(chemprot_results)

{'precision': 0.006, 'recall': 0.0012, 'f1': 0.0021, 'TP': 7, 'FP': 1155, 'FN': 5599}


#### Evaluation Using Entity Names

In [20]:
# Entity-text-based evaluation of the same predictions, with the same 0.5
# threshold as the span-based evaluation.
chemprot_results_text = evaluate_chemprot_by_entity_text(
    chemprot_input_filtered,
    chemprot_predictions,
    threshold=0.5,
)

print(chemprot_results_text)

{'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'TP': 0, 'FP': 211, 'FN': 5606}
