# Universal Information Extraction Benchmark on BioRED Dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import torch

from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pprint import pprint

In [3]:
sys.path.append("../../")

from src.common.utils import load_jsonl, save_predictions
from src.common.metrics import get_metrics
from src.common.data import generative_collate_fn
from src.models.uie.data import UIEDataset
from src.models.uie.inference import run_inference

Initialize model and tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained("luyaojie/uie-large-en")
model = AutoModelForSeq2SeqLM.from_pretrained("luyaojie/uie-large-en")

device = torch.device('cuda:5')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32102, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32102, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

### Zero-shot Testing

Create dataset and dataloader

In [5]:
data = load_jsonl("../../data/biored/test.jsonl")
test_dataset = UIEDataset(data=data, dataset_name='biored', tokenizer=tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=generative_collate_fn)

In [6]:
pprint(test_dataset[1]['text'])

('<spot> gene or gene product <spot> disease or phenotypic feature <spot> '
 'chemical or drug <spot> genomic or protein variant <spot> species <spot> '
 'cell line <asoc> is associated with <asoc> positively correlates with <asoc> '
 'negatively correlates with <asoc> binds to <asoc> is cotreated with <asoc> '
 'is compared to <asoc> converts to <asoc> interacts with drug <extra_id_2> '
 'Allelic expression imbalance of human mu opioid receptor (OPRM1) caused by '
 'variant A118G. As a primary target for opioid drugs and peptides, the mu '
 'opioid receptor (OPRM1) plays a key role in pain perception and addiction. '
 'Genetic variants of OPRM1 have been implicated in predisposition to drug '
 'addiction, in particular the single nucleotide polymorphism A118G, leading '
 'to an N40D substitution, with an allele frequency of 10-32%, and uncertain '
 'functions. We have measured allele-specific mRNA expression of OPRM1 in '
 'human autopsy brain tissues, using A118G as a marker. In 8 he

In [7]:
pprint(test_dataset[1]['label'])

('<extra_id_0> <extra_id_0> species <extra_id_5> human <extra_id_1> '
 '<extra_id_0> gene or gene product <extra_id_5> mu opioid receptor '
 '<extra_id_0> is associated with <extra_id_5> pain <extra_id_1> <extra_id_0> '
 'is associated with <extra_id_5> drug addiction <extra_id_1> <extra_id_1> '
 '<extra_id_0> genomic or protein variant <extra_id_5> A118G <extra_id_1> '
 '<extra_id_0> disease or phenotypic feature <extra_id_5> pain <extra_id_1> '
 '<extra_id_0> disease or phenotypic feature <extra_id_5> drug addiction '
 '<extra_id_0> is associated with <extra_id_5> A118G <extra_id_1> <extra_id_1> '
 '<extra_id_0> species <extra_id_5> Chinese hamster <extra_id_1> <extra_id_0> '
 'gene or gene product <extra_id_5> OPRM1 <extra_id_1> <extra_id_0> chemical '
 'or drug <extra_id_5> actinomycin D <extra_id_1> <extra_id_1>')


In [8]:
predictions = run_inference(model=model, dataloader=test_loader, tokenizer=tokenizer)

save_predictions(predictions, "../../predictions/uie/zero-shot", "biored.jsonl")

Running inference:   0%|          | 0/4 [00:00<?, ?it/s]

Predictions saved to ../../predictions/uie/zero-shot/biored.jsonl


In [9]:
metrics = get_metrics(f"../../predictions/uie/zero-shot/biored.jsonl")

print(f"\nEntity F1: {metrics['entity_f1']:.4f}")
print(f"Relation F1: {metrics['relation_f1']:.4f}")


Entity F1: 0.0000
Relation F1: 0.0000
