# Align bigram with existing annotation

In [30]:
import torch
import numpy as np
import pickle

from tqdm import tqdm

from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging

from transformers import BertTokenizer

from tokenizations import get_alignments

# Read Data

In [1]:
def read_data(datapath):
    """Parse a state-sequence dump made of repeating three-line records.

    Within every consecutive group of three lines, the first line holds
    whitespace-separated integer word indices, the second holds
    whitespace-separated integer states, and the third holds the
    whitespace-separated tokens.

    Args:
        datapath: path of the text file to read.

    Returns:
        A tuple ``(words, states, word_idx)`` of lists with one entry per
        record: the token strings, the integer states, and the integer
        word indices.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field on an index or state line is not an integer.
    """
    words, states, word_idx = [], [], []
    # The context manager closes the handle; the original left it open.
    with open(datapath) as f:
        for li, line in enumerate(f):
            fields = line.split()
            position = li % 3  # position of this line inside its record
            if position == 0:
                word_idx.append([int(w) for w in fields])
            elif position == 1:
                states.append([int(z) for z in fields])
            else:
                words.append(fields)
    return words, states, word_idx

In [2]:
# Load the tokens, states and word indices from the dev state-sequence dump.
words, states, word_idx = read_data('bertnet_0.0.6.4_dev_epoch_-1_state_seq.txt')

In [18]:
# Load the pretrained 'bert-base-uncased' tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [53]:
# Sanity check: the first sentence should have one state per token.
len(states[0]), len(words[0])

(23, 23)

In [51]:
# Show the tokens of the first sentence.
print(words[0])

['you', "'", 're', 'right', ',', '"', 'the', 'bottom', 'line', 'is', 'truth', ',', '"', 'ind', '##ep', '##end', '##ant', 'from', 'you', 'or', 'anyone', 'else', '.']


## Convert BERT tokenization to spaCy tokenization

In [10]:
# Load the pre-computed BERT-to-spaCy alignments (one entry per sentence).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the handle; the original left it open.
with open('bert_to_spacy.pkl', 'rb') as f:
    bert_to_spacy_all = pickle.load(f)

In [74]:
def convert_state(bert2spacy, tags):
    """Project a tag sequence over BERT tokens onto the aligned tokens.

    Args:
        bert2spacy: alignment with one entry per BERT token; entry ``i`` is
            an iterable of the target-token indices aligned with BERT
            token ``i`` (it may be empty).
        tags: one tag per BERT token.

    Returns:
        A list with one tag per aligned target token, in order.  When
        several BERT tokens map to the same target token, the tag of the
        first of them is kept.

    Raises:
        AssertionError: if the two inputs differ in length, or if the
            alignment indices skip a target token or go backwards.
    """
    # make sure pre-stored data matches the dev data
    assert(len(bert2spacy) == len(tags))
    converted = []
    last_idx = -1  # index of the most recently emitted target token
    for aligned, tag in zip(bert2spacy, tags):
        for idx in aligned:
            # every spacy token should be aligned
            assert(idx in (last_idx, last_idx + 1))
            if idx != last_idx + 1:
                # Consecutive BERT tokens sharing one spacy token: only the
                # first BERT token contributes its tag.
                continue
            last_idx += 1
            converted.append(tag)
    return converted

In [8]:
def bert_to_spacy_convert(latent_tags, bert_to_spacy_all):
    """Convert per-sentence tag sequences from BERT to spacy tokenization.

    Args:
        latent_tags: one tag sequence per sentence, with one tag per BERT
            token.
        bert_to_spacy_all: one alignment per sentence, in the format that
            ``convert_state`` accepts as its first argument.

    Returns:
        A list with one converted tag sequence per sentence.

    Raises:
        AssertionError: if the number of sentences differs between the two
            inputs, or if ``convert_state`` rejects a sentence.
    """
    assert(len(bert_to_spacy_all) == len(latent_tags))
    # zip() has no length, so pass the total explicitly to let the progress
    # bar show a percentage and an ETA.
    pairs = tqdm(zip(bert_to_spacy_all, latent_tags), total=len(latent_tags))
    # The per-sentence logic lives in convert_state; reuse it here rather
    # than keeping a second copy in sync.
    return [convert_state(bert2spacy, tags) for bert2spacy, tags in pairs]

In [13]:
# Convert the states of every sentence using the stored alignments.
states_spacy = bert_to_spacy_convert(states, bert_to_spacy_all)

26121it [00:00, 126802.09it/s]


In [16]:
# Compare the first sentence's states before and after the conversion.
print(states[0])
print(states_spacy[0])

[772, 1359, 1529, 344, 465, 1532, 681, 681, 1874, 1445, 1602, 465, 1532, 445, 445, 445, 426, 369, 1045, 1138, 128, 128, 64]
[772, 1359, 344, 465, 1532, 681, 681, 1874, 1445, 1602, 465, 1532, 445, 369, 1045, 1138, 128, 128, 64]


# Open IE

In [31]:
# Load the pretrained Open IE model from the AllenNLP public model archive.
openie_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz")

In [37]:
# Join the first sentence's tokens back into a string and run Open IE on it.
out = openie_predictor.predict(tokenizer.convert_tokens_to_string(words[0]))

In [73]:
# List the fields returned by the predictor.
out.keys()

dict_keys(['verbs', 'words'])

In [66]:
# Number of BERT tokens in the first sentence.
len(words[0])

23

In [68]:
# Number of whitespace-separated tokens after joining the BERT tokens into a string.
len(tokenizer.convert_tokens_to_string(words[0]).split())

20

In [65]:
# Number of tokens in the Open IE output.
len(out['words'])

20

In [58]:
# Align the BERT tokens with the predictor's tokens; the second returned
# alignment is discarded.
bert_to_spacy, _ = get_alignments(words[0], out['words'])

In [62]:
# Project the first sentence's states onto the Open IE tokenization.
# `convert_tag` is not defined in this notebook; `convert_state` is the
# helper with this signature.
tag = convert_state(bert_to_spacy, states[0])

In [64]:
# Length of the converted sequence; compare with len(out['words']).
len(tag)

20

In [81]:
# Run Open IE on the first 100 sentences and project each sentence's states
# onto the tokenization returned by the predictor.
with torch.no_grad():  # disable autograd tracking while predicting
    states_openie = []  # converted state sequence, one per sentence
    tags_openie = []  # the predictor's 'verbs' output, one per sentence
    # list(...) gives tqdm a length so the bar can show a total
    for w, s in tqdm(list(zip(words[:100], states[:100]))):
        out = openie_predictor.predict(tokenizer.convert_tokens_to_string(w))
        tags_openie.append(out['verbs'])
        # align the BERT tokens with the tokens returned by the predictor
        bert_to_spacy, _ = get_alignments(w, out['words'])
        s_ = convert_state(bert_to_spacy, s)
        states_openie.append(s_)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:13<00:00,  7.57it/s]


In [83]:
# Save the Open IE outputs. The context manager flushes and closes the file;
# the original never closed the handle.
with open('tags_openie.pkl', 'wb') as f:
    pickle.dump(tags_openie, f)

In [84]:
# Save the converted states. The context manager flushes and closes the file;
# the original never closed the handle.
with open('states_openie.pkl', 'wb') as f:
    pickle.dump(states_openie, f)

In [None]:
## For each bigram, construct a bigram - IE tag dictionary

# SRL

In [5]:
# Load the pretrained BERT-based SRL model from the AllenNLP public model archive.
srl_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz")

In [None]:
## For each bigram, construct a bigram - SRL tag dictionary

# Constituency

For each bigram, construct a bigram - constituency tag dictionary

In [6]:
# Load the pretrained ELMo constituency parser from the AllenNLP public model archive.
parser = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz")



In [40]:
# List the attributes and methods available on the parser predictor.
dir(parser)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_batch_json_to_instances',
 '_build_hierplane_tree',
 '_dataset_reader',
 '_json_to_instance',
 '_model',
 '_register_embedding_gradient_hooks',
 '_registry',
 '_token_offsets',
 '_tokenizer',
 'by_name',
 'capture_model_internals',
 'cuda_device',
 'default_implementation',
 'dump_line',
 'from_archive',
 'from_params',
 'from_path',
 'get_gradients',
 'get_interpretable_layer',
 'get_interpretable_text_field_embedder',
 'json_to_labeled_instances',
 'list_available',
 'load_line',
 'predict',
 'predict_batch_instance',
 'predict_batch_json',
 'predict_instance',
 'predict_json',
 'predictions_to_lab

In [28]:
# Parse a lower-cased example sentence.
out = parser.predict(
    sentence="If you bring $10 with you tomorrow, can you pay for me to eat too?.".lower()
)

In [21]:
# Inspect the root node of the hierplane tree.
out['hierplane_tree']['root']

dict_keys(['word', 'nodeType', 'attributes', 'link', 'children'])

In [41]:
# Text stored on the root node.
out['hierplane_tree']['root']['word']

'if you bring $ 10 with you tomorrow , can you pay for me to eat too ? .'

In [22]:
# Attributes stored on the root node.
out['hierplane_tree']['root']['attributes']

['SQ']

In [37]:
def traversal(root):
    """Collect every two-word node of a tree in breadth-first order.

    Args:
        root: a node dict with a ``'word'`` string; nodes may carry a
            ``'children'`` list of further node dicts.  ``'attributes'``
            is read from every node whose word is a bigram.

    Returns:
        A list of ``(word, attributes)`` tuples, one for each visited node
        whose ``'word'`` splits into exactly two whitespace-separated
        tokens, in the order the nodes are visited.
    """
    # Imported locally so the function does not rely on another cell.
    from collections import deque

    # popleft() is O(1); the original re-sliced the list on every step,
    # which made the walk quadratic in the number of nodes.
    queue = deque([root])
    bigrams = []
    while queue:
        node = queue.popleft()
        if len(node['word'].split()) == 2:
            bigrams.append((node['word'], node['attributes']))
        if 'children' in node:
            queue.extend(node['children'])
    return bigrams

In [38]:
# Collect the two-word nodes of the parsed example sentence.
bigrams = traversal(out['hierplane_tree']['root'])

In [39]:
# Display the collected bigrams with their attributes.
bigrams

[('$ 10', ['NP']), ('with you', ['PP']), ('eat too', ['VP'])]