# Universal Information Extraction Benchmark on ChemProt Dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
import sys
import torch

from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pprint import pprint

In [3]:
sys.path.append("../../")

from src.common.utils import load_jsonl, save_predictions
from src.common.metrics import get_metrics
from src.common.data import generative_collate_fn
from src.models.uie.data import UIEDataset
from src.models.uie.inference import run_inference

Initialize model and tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained("luyaojie/uie-large-en")
model = AutoModelForSeq2SeqLM.from_pretrained("luyaojie/uie-large-en")

device = torch.device('cuda:5')
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32102, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32102, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

## ChemProt

### Zero-shot Testing

Create dataset and dataloader

In [15]:
data = load_jsonl("../../data/chemprot/test.jsonl")
test_dataset = UIEDataset(data=data, dataset_name='chemprot', tokenizer=tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=generative_collate_fn)

In [None]:
pprint(test_dataset[1])

In [17]:
predictions = run_inference(model=model, dataloader=test_loader, tokenizer=tokenizer)

save_predictions(predictions, "../../predictions/uie/zero-shot", "chemprot.jsonl")

Running inference:   0%|          | 0/10 [00:00<?, ?it/s]

Predictions saved to ../../predictions/uie/zero-shot/chemprot.jsonl


In [20]:
for prediction in predictions:
    print(prediction['pred_entities'])
    print(prediction['gt_entities'])
    print()

[['etizolam', 'chemical substance']]
[['promethazine', 'DRUG'], ['psychotropic drug', 'GROUP'], ['psychotropic drugs', 'GROUP'], ['chlorpromazine', 'DRUG'], ['phenobarbital', 'DRUG'], ['etizolam', 'DRUG']]

[]
[['antiretroviral', 'GROUP']]

[['P-glycoprotein', 'gene product']]
[]

[['pharmaceutical', 'chemical substance']]
[]

[]
[['Oseltamivir', 'DRUG'], ['tacrolimus', 'DRUG'], ['oseltamivir', 'DRUG'], ['immunosuppresive drug', 'GROUP']]

[]
[]

[['tuberculosis', 'anomaly']]
[['antituberculosis drugs', 'GROUP']]

[['ticagrelor', 'chemical substance']]
[['clopidogrel', 'DRUG'], ['Brilinta', 'BRAND'], ['Clopidogrel', 'DRUG'], ['antiplatelet drug', 'GROUP'], ['Effient', 'BRAND'], ['ticagrelor', 'DRUG'], ['aspirin', 'BRAND'], ['prasugrel', 'DRUG'], ['Ticagrelor', 'DRUG'], ['Plavix', 'BRAND']]

[]
[['ketoconazole', 'DRUG'], ['Panobinostat', 'DRUG'], ['histone deacetylase inhibitor', 'GROUP'], ['panobinostat', 'DRUG'], ['LBH589', 'DRUG']]

[]
[['CPP-ACP complex', 'DRUG_N'], ['Casein phospho

In [18]:
metrics = get_metrics(f"../predictions/uie/zero-shot/chemprot.jsonl")

print(f"\nEntity F1: {metrics['entity_f1']:.4f}")
print(f"Relation F1: {metrics['relation_f1']:.4f}")


Entity F1: 0.0000
Relation F1: 0.0000
