# Universal Information Extraction Benchmark on BC5CDR Dataset

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project code under `src/`) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import torch

from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pprint import pprint

In [3]:
# Extend the import path with the directory two levels above the working
# directory so the `src` package imported below can be resolved. This must run
# before the `src.*` imports.
# NOTE(review): presumably "../../" is the repository root; the path is relative
# to the kernel's working directory — confirm when launching from elsewhere.
sys.path.append("../../")

# Project-local helpers used in the cells below: JSONL loading / prediction
# saving, metric computation, batch collation, the UIE dataset wrapper and the
# inference loop.
from src.common.utils import load_jsonl, save_predictions
from src.common.metrics import get_metrics
from src.common.data import generative_collate_fn
from src.models.uie.data import UIEDataset
from src.models.uie.inference import run_inference

Initialize model and tokenizer

In [4]:
# Checkpoint shared by the tokenizer and the model; kept in one place so the
# two can never drift apart.
MODEL_NAME = "luyaojie/uie-large-en"
# GPU index the benchmark was originally run on.
PREFERRED_GPU_INDEX = 5

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Use the preferred GPU when it exists; otherwise fall back to the first GPU,
# and finally to the CPU, so the notebook still runs on machines with fewer
# (or no) CUDA devices instead of failing on a hard-coded device index.
if torch.cuda.is_available() and torch.cuda.device_count() > PREFERRED_GPU_INDEX:
    device = torch.device(f'cuda:{PREFERRED_GPU_INDEX}')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Last expression of the cell: moves the model and displays its architecture.
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32102, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32102, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

### Zero-shot Testing

Create dataset and dataloader

In [5]:
# Read the BC5CDR test split from its JSONL file.
data = load_jsonl("../../data/bc5cdr/test.jsonl")

# Wrap the raw records in the UIE dataset, using the tokenizer loaded above.
test_dataset = UIEDataset(
    data=data,
    dataset_name='bc5cdr',
    tokenizer=tokenizer,
)

# Batches of 32 with shuffling disabled, so batches follow dataset order.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=generative_collate_fn,
)

In [6]:
# Show the 'text' field of the second dataset item (index 1) as a sanity check
# of what is fed to the model.
sample_text = test_dataset[1]['text']
pprint(sample_text)

('<spot> chemical <spot> disease <asoc> causes or induces <extra_id_2> '
 'Indomethacin induced hypotension in sodium and volume depleted rats. After a '
 'single oral dose of 4 mg/kg indomethacin (IDM) to sodium and volume depleted '
 'rats plasma renin activity (PRA) and systolic blood pressure fell '
 'significantly within four hours. In sodium repleted animals indomethacin did '
 'not change systolic blood pressure (BP) although plasma renin activity was '
 'decreased. Thus, indomethacin by inhibition of prostaglandin synthesis may '
 'diminish the blood pressure maintaining effect of the stimulated '
 'renin-angiotensin system in sodium and volume depletion.')


In [7]:
# Show the 'label' field of the same dataset item (index 1), i.e. the target
# sequence paired with the input previewed above.
sample_label = test_dataset[1]['label']
pprint(sample_label)

('<extra_id_0> <extra_id_0> disease <extra_id_5> hypotension <extra_id_1> '
 '<extra_id_0> chemical <extra_id_5> sodium <extra_id_1> <extra_id_0> chemical '
 '<extra_id_5> indomethacin <extra_id_0> causes or induces <extra_id_5> '
 'hypotension <extra_id_1> <extra_id_1> <extra_id_0> chemical <extra_id_5> '
 'prostaglandin <extra_id_1> <extra_id_0> chemical <extra_id_5> angiotensin '
 '<extra_id_1> <extra_id_1>')


In [8]:
# Run the model over the whole test loader to collect its predictions.
predictions = run_inference(
    model=model,
    dataloader=test_loader,
    tokenizer=tokenizer,
)

# Persist the predictions (output directory first, then file name) so the
# metrics step can read them back from disk.
save_predictions(
    predictions,
    "../../predictions/uie/zero-shot",
    "bc5cdr.jsonl",
)

Running inference:   0%|          | 0/16 [00:00<?, ?it/s]

Predictions saved to ../../predictions/uie/zero-shot/bc5cdr.jsonl


In [9]:
# Score the saved zero-shot predictions. The path is a plain string: the
# original literal carried an `f` prefix but contained no placeholders.
metrics = get_metrics("../../predictions/uie/zero-shot/bc5cdr.jsonl")

# NOTE(review): the recorded output of this cell shows 0.0000 for both scores.
# An exact zero on both entity and relation F1 may point to a mismatch between
# the prediction format and what the scorer expects rather than to genuine
# model quality — verify before reporting these numbers.
print(f"\nEntity F1: {metrics['entity_f1']:.4f}")
print(f"Relation F1: {metrics['relation_f1']:.4f}")


Entity F1: 0.0000
Relation F1: 0.0000
