# Inference of pre-trained models using the HuggingFace library (on GPU)
The trained TensorFlow models have been ported to HuggingFace, but the ported models do not reproduce the reported results (their scores are lower) because of implementation differences between HuggingFace and TensorFlow.

Use the AACT_MOIE_Tensorflow notebook if you want to use the best models or train the models from scratch - [Colab link](https://colab.research.google.com/drive/1gzVL5ZGHRCKJJcwj8DSghbmYjUIsC2VM?usp=sharing).

## Install Requirements

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install ipdb
!pip install --upgrade gdown
!pip install sklearn
!git clone https://github.com/dair-iitd/moie.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.0-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 69.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.5.1.tar.gz (14 kB)


In [None]:
#@title Fill in parameters
# Language code of the model to run; it selects the checkpoint archive that
# is downloaded and the directory the models are loaded from.
language = 'en' #@param ["en", "es", "pt", "hi", "te", "zh"] {allow-input: true}
import os
# Export the choice so that the shell cells can read it as $LANG.
# NOTE(review): LANG is also the POSIX locale variable, so overwriting it with
# a bare language code may affect locale-dependent tools started from this
# kernel -- consider a dedicated variable name (all cells reading $LANG would
# need to change together).
os.environ['LANG'] = language
# Device string that the models and the token tensors are moved to.
device = 'cuda:0' #@param {type:"string"}
# When True, the CaRB test sentences from the cloned repository are used as
# input and the extractions file is written in the tab-separated format that
# is passed to carb.py.
evaluate_carb = False #@param {type:"boolean"}
if not evaluate_carb:
  # Placeholder: replace with the path of a text file holding one sentence per line.
  input_file = 'file_path_containing_sentences' #@param {type:"string"}
else:
  input_file = f'moie/carb/data/{language}_test.input'

## Download models

In [None]:
%%shell
# Download and unpack the pre-trained checkpoint archive for the language
# exported in $LANG by the parameters cell.

# Abort on the first failing command so a failed download is not followed by
# an unzip of a missing or partial archive.
set -e

# Google Drive file id of the archive for each supported language.
case "$LANG" in
  en) file_id=1MmHBqWApqNhRwuNPw3mTpZnY0wdghqZy ;;
  es) file_id=1m3zzAsktqx_6V4aj4LUq3cpQEH_MH-G1 ;;
  pt) file_id=1__AqeNNAsdIzL_Zpjko1BF3nexwAUCQV ;;
  hi) file_id=130EwTdJa5O278tD4sQsRN0rTyijWoI6z ;;
  te) file_id=1geLobUxGwOTtY54ogblDIyUa2HlAtv-M ;;
  zh) file_id=13VCbjE--4-EiX8adExgPHcl1Tsy_MX-T ;;
  *)
    # Fail loudly here instead of letting the model-loading cell fail later
    # on a directory that was never downloaded.
    echo "Unsupported language '$LANG' (expected one of: en, es, pt, hi, te, zh)" >&2
    exit 1
    ;;
esac

gdown "$file_id"
# -o overwrites existing files without prompting, so the cell can be re-run.
unzip -o "aact_moie_gen2oie_${LANG}.zip"

## Load the models

In [None]:
import os
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# The stage-1 and stage-2 checkpoints sit in sub-directories of the archive
# that was unpacked for the selected language.
s1_path = f'aact_moie_gen2oie_{language}/stage1/'
s2_path = f'aact_moie_gen2oie_{language}/stage2/'

# Load both seq2seq models (stage 1 first, then stage 2) and move each to `device`.
s1_model, s2_model = (
    AutoModelForSeq2SeqLM.from_pretrained(stage_path).to(device)
    for stage_path in (s1_path, s2_path)
)

# A single tokenizer, taken from the public mT5 base checkpoint, serves both stages.
tokenizer = AutoTokenizer.from_pretrained('google/mt5-base')

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

## Load Input Sentences on which OpenIE has to be run

In [None]:
# Read the input file as one sentence per line, with surrounding whitespace
# removed. The context manager closes the file handle. The encoding is set
# explicitly so that decoding does not depend on the process locale
# (assumes the input file is UTF-8 encoded).
with open(input_file, encoding='utf-8') as sentence_file:
    sentences = [line.strip() for line in sentence_file]

## Run the code

In [None]:
import math
from tqdm import tqdm

# Stage 1: run the stage-1 model over the sentences in batches. Each decoded
# output string is later split on '<r>' into the relations of its sentence.
s1_results = []
batch_size = 32
num_batches = math.ceil(len(sentences)/batch_size)
print('Stage-1 prediction:')
for k in tqdm(range(num_batches)):
  sentences_k = sentences[k*batch_size:(k+1)*batch_size]

  s1_batch = tokenizer(sentences_k, return_tensors="pt", padding=True)
  s1_inputs_toks = s1_batch['input_ids'].to(device)
  # The batch is padded to its longest sentence, so pass the tokenizer's
  # attention mask explicitly instead of leaving generate() to infer which
  # positions are padding.
  s1_attention_mask = s1_batch['attention_mask'].to(device)
  preds_k = s1_model.generate(
      s1_inputs_toks,
      attention_mask=s1_attention_mask,
      max_length=50,
      num_beams=1,
      early_stopping=True,
  )
  s1_results_k = tokenizer.batch_decode(preds_k, skip_special_tokens=True)
  s1_results.extend(s1_results_k)

# Build the stage-2 inputs: one '<relation> <r> <sentence>' string for every
# relation that stage 1 produced for a sentence.
all_outputs = []
s2_inputs, s2_input_sentences = [], []
# num_relations[i] is the index in s2_inputs of the first input belonging to
# sentence i; the final entry is the total number of stage-2 inputs.
num_relations = [0]
nrs = 0
for sentence, s1_result in zip(sentences, s1_results):
    relations = s1_result.split('<r>')
    s2_inputs.extend(relation.strip() + ' <r> ' + sentence for relation in relations)
    nrs += len(relations)
    num_relations.append(nrs)

# Stage 2: run the stage-2 model over the '<relation> <r> <sentence>' inputs
# in batches. Each decoded output string is later split on '<e>' into
# individual extractions.
s2_results = []
batch_size = 8
num_batches = math.ceil(len(s2_inputs)/batch_size)
for k in tqdm(range(num_batches)):
    s2_input_k = s2_inputs[k*batch_size:(k+1)*batch_size]
    s2_batch = tokenizer(s2_input_k, return_tensors="pt", padding=True)
    s2_inputs_toks = s2_batch['input_ids'].to(device)
    # The batch is padded to its longest input, so pass the tokenizer's
    # attention mask explicitly instead of leaving generate() to infer which
    # positions are padding.
    s2_attention_mask = s2_batch['attention_mask'].to(device)

    preds_k = s2_model.generate(
        s2_inputs_toks,
        attention_mask=s2_attention_mask,
        max_length=128,
        num_beams=1,
        early_stopping=True,
    )
    s2_results_k = tokenizer.batch_decode(preds_k, skip_special_tokens=True)
    s2_results.extend(s2_results_k)

# Regroup the flat stage-2 outputs into one list per sentence.
# num_relations holds the running start offsets of each sentence's inputs
# (ending with the total), so consecutive offset pairs delimit each group.
# Slicing by those offsets replaces a list-membership test per result, which
# scanned num_relations every time.
s2r = [
    s2_results[start:end]
    for start, end in zip(num_relations, num_relations[1:])
]

# Collect, for every sentence, the non-blank extractions found in its stage-2
# outputs (each output may hold several extractions separated by '<e>').
for sentence, s2_result in zip(sentences, s2r):
    candidates = [
        e2
        for e1 in s2_result
        for e2 in e1.split('<e>')
        if e2.strip()
    ]
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # extraction order is the same on every run (a set's iteration order for
    # strings can change between interpreter sessions).
    extractions = list(dict.fromkeys(candidates))
    all_outputs.append([sentence, extractions])


Stage-1 prediction:


100%|██████████| 21/21 [00:15<00:00,  1.36it/s]
100%|██████████| 293/293 [04:31<00:00,  1.08it/s]


## Print the generated extractions

In [None]:
import re

def _tagged_text(tag, extraction):
  """Return the text of every <tag>...</tag> span in `extraction`.

  Each span is stripped and the spans are joined with single spaces; the
  result is an empty string when the tag does not occur.
  """
  spans = re.findall(f'<{tag}>(.*?)</{tag}>', extraction)
  return ' '.join(span.strip() for span in spans).strip()


# Write the results to the file 'extractions'.
# - Normal mode: print each sentence with its extractions and write the same
#   text to the file, with a blank line after each sentence.
# - CaRB mode: write one 'sentence<TAB>extraction<TAB>1' line per extraction,
#   with the extraction rewritten into <arg1>/<rel>/<arg2> tags.
# The context manager closes the file even if an error interrupts the loop;
# the explicit encoding keeps the output independent of the process locale.
with open('extractions', 'w', encoding='utf-8') as f:
  for sentence, extractions in all_outputs:
    if not evaluate_carb:
      print('Sentence: ')
      f.write(sentence+'\n')
      print(sentence)
      print('Extractions: ')
      print('\n'.join(extractions))
      f.write('\n'.join(extractions)+'\n\n')
      print('\n')
    else:
      for extraction in extractions:
        arg1 = _tagged_text('a1', extraction)
        rel = _tagged_text('r', extraction)
        # The second argument is the <a2> text followed by the location
        # (<l>) and time (<t>) texts, skipping whichever parts are empty.
        arg2_parts = (
            _tagged_text('a2', extraction),
            _tagged_text('l', extraction),
            _tagged_text('t', extraction),
        )
        arg2 = ' '.join(part for part in arg2_parts if part)

        carb_extraction = f'<arg1> {arg1} </arg1> <rel> {rel} </rel>'
        if arg2:
          carb_extraction += f' <arg2> {arg2} </arg2>'
        f.write(f'{sentence}\t{carb_extraction}\t1\n')

In [None]:
## Run only in case of evaluating CaRB
!python moie/carb/carb.py --allennlp extractions --gold moie/carb/data/gold/${LANG}_test.tsv --out /dev/null

INFO:root:Writing PR curve of Allennlp to /dev/null
Num Final Sentences =  634
AUC: 0.3731	 Optimal (precision, recall, F1): (0.5407, 0.4843, 0.511)	Zero Conf (precision, recall, F1): (0.5407, 0.4843, 0.511)
