In [None]:
import json
import torch
import transformers
import numpy as np
from tqdm import tqdm
from transformers import pipeline
from transformers import AutoTokenizer, BertForQuestionAnswering
# Raise the transformers logger threshold to the error level so the cells below
# are not flooded with lower-severity messages.
transformers.logging.set_verbosity_error()

In [None]:
# Checkpoint to evaluate -- point this at your own fine-tuned model directory.
model_checkpoint = "exp/test_0"
# model_checkpoint = "deepset/bert-base-uncased-squad2"

# Question-answering pipeline over that checkpoint; the tokenizer is always the
# stock "bert-base-uncased" one, independent of the checkpoint chosen above.
qa_pipeline = pipeline(
    task="question-answering",
    model=model_checkpoint,
    tokenizer=AutoTokenizer.from_pretrained("bert-base-uncased"),
)

In [None]:
# Load the SQuAD v2 dev set and keep only its top-level "data" list (one entry per article).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding.
with open("./data/squad_v2/raw/dev-v2.0.json", "r", encoding="utf-8") as source_file:
    raw_data = json.load(source_file)["data"]

In [None]:
# Checkpoint shared by the model and its tokenizer (used by get_answer below).
QA_CHECKPOINT = "deepset/bert-base-uncased-squad2"

# Keep the original preference for the second GPU, but fall back to the first GPU
# or the CPU so this cell does not crash on machines with fewer devices.
if torch.cuda.device_count() > 1:
    qa_device = "cuda:1"
elif torch.cuda.is_available():
    qa_device = "cuda:0"
else:
    qa_device = "cpu"

model = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT).to(qa_device)
tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)

def get_answer(question, context):
    """Predict an extractive answer to `question` from `context`.

    Uses the notebook-level `tokenizer` and `model`. The question/context pair is
    encoded as a single sequence of at most 384 tokens; only the context is
    truncated when the pair is too long, so an answer located past the cut-off
    cannot be found.

    Args:
        question: Question text.
        context: Passage text to extract the answer from.

    Returns:
        The decoded text of the tokens between the highest-scoring start and end
        positions (inclusive). Special tokens are stripped, so the result is an
        empty string when the span is empty (end before start) or covers only
        special tokens such as the leading [CLS].
    """
    # Follow the model's own device instead of hard-coding one, so inputs and
    # weights can never end up on different devices.
    inputs = tokenizer(
        question,
        context,
        max_length=384,
        padding="max_length",
        truncation="only_second",
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Start and end are chosen independently; the end index may precede the
    # start index, in which case the slice below is empty.
    answer_start_index = int(outputs.start_logits.argmax())
    answer_end_index = int(outputs.end_logits.argmax())

    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    # Without skip_special_tokens a span at position 0 would be returned as the
    # literal string "[CLS]" (and [SEP]/[PAD] could leak into answers).
    return tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

In [None]:
# Predict an answer for every question in the dev set, keyed by question id.
answer_dict = {}
for article in tqdm(raw_data):
    for paragraph in article["paragraphs"]:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            # Alternative scoring path via the Hugging Face pipeline:
            # answer = qa_pipeline(question=qa["question"], context=context, handle_impossible_answer=True)
            # answer_dict[qa["id"]] = answer["answer"]
            answer_dict[qa["id"]] = get_answer(question=qa["question"], context=context)

In [None]:
# Persist the {question id: answer text} mapping as a single JSON object.
with open("./data/squad_v2/processed/pred.json", "w") as pred_file:
    json.dump(answer_dict, pred_file)

In [None]:
!python squad_eval.py ./data/squad_v2/raw/dev-v2.0.json ./data/squad_v2/processed/pred_squad.json 

In [1]:
from components.hg_parser import ConstituencyParser, ConstituencyNode

In [2]:
# Build the project's constituency parser. Per the log output below, this loads the
# English tokenize, pos and constituency processors and runs on the GPU.
cp = ConstituencyParser()

2022-11-29 16:23:01 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2022-11-29 16:23:01 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2022-11-29 16:23:01 INFO: Use device: gpu
2022-11-29 16:23:01 INFO: Loading: tokenize
2022-11-29 16:23:03 INFO: Loading: pos
2022-11-29 16:23:04 INFO: Loading: constituency
2022-11-29 16:23:04 INFO: Done loading processors!


In [3]:
# Parse a sample question; each returned sentence exposes a `.constituency` tree,
# which is converted further down.
sentences = cp.get_sentences("How are you doing today?")

In [4]:
# Global id counters consumed by iterate_tree: `tid` numbers pre-terminal nodes
# starting at 0, `cid` numbers every other node starting at 100000.
tid, cid = 0, 100000

def iterate_tree(root):
    """Recursively convert a parse (sub)tree into ConstituencyNode objects.

    Nodes for which `root.is_preterminal()` is true take their id from the global
    counter `tid` and cover exactly that one position. Every other node takes its
    id from the global counter `cid`, assigned only after all of its children
    have been converted, and covers the concatenation of its children's `lids`.
    Both counters are advanced in place, so ids keep increasing across calls.

    Args:
        root: Tree node exposing `is_preterminal()`, `label`, `leaf_labels()`
            and `children`.

    Returns:
        The ConstituencyNode built for `root`, with converted children attached.
    """
    global tid, cid

    if not root.is_preterminal():
        # Children are converted left to right, so their ids are allocated
        # before this node's own id.
        converted = [iterate_tree(subtree) for subtree in root.children]
        covered = [lid for sub_node in converted for lid in sub_node.lids]
        inner_node = ConstituencyNode(
            cid=cid,
            label=root.label,
            text=root.leaf_labels(),
            lids=covered,
            children=converted,
        )
        cid += 1
        return inner_node

    token_node = ConstituencyNode(
        cid=tid,
        label=root.label,
        text=root.leaf_labels(),
        lids=[tid],
        children=[],
    )
    tid += 1
    return token_node
            
# Convert the first parsed sentence's constituency tree into ConstituencyNode objects.
root = iterate_tree(sentences[0].constituency)

In [5]:
# Show the string form of the top-level node only (children are listed by id).
print(root)

cid: 100006 | label: ROOT | text: ['How', 'are', 'you', 'doing', 'today', '?'] | lids: [0, 1, 2, 3, 4, 5] | children: [100005] | answer: False


In [6]:
def dfs(root):
    """Print `root` and all of its descendants in depth-first pre-order.

    Each node is printed before its children, and children are visited in the
    order they appear in `node.children`.
    """
    pending = [root]
    while pending:
        node = pending.pop()
        print(node)
        # Push children reversed so the first child is popped (and printed) next.
        pending.extend(list(node.children)[::-1])
    
# Dump the whole converted tree, one node per line, parents before children.
dfs(root)

cid: 100006 | label: ROOT | text: ['How', 'are', 'you', 'doing', 'today', '?'] | lids: [0, 1, 2, 3, 4, 5] | children: [100005] | answer: False
cid: 100005 | label: SBARQ | text: ['How', 'are', 'you', 'doing', 'today', '?'] | lids: [0, 1, 2, 3, 4, 5] | children: [100000, 100004, 5] | answer: False
cid: 100000 | label: WHADVP | text: ['How'] | lids: [0] | children: [0] | answer: False
cid: 0 | label: WRB | text: ['How'] | lids: [0] | children: [] | answer: False
cid: 100004 | label: SQ | text: ['are', 'you', 'doing', 'today'] | lids: [1, 2, 3, 4] | children: [1, 100001, 100003] | answer: False
cid: 1 | label: VBP | text: ['are'] | lids: [1] | children: [] | answer: False
cid: 100001 | label: NP | text: ['you'] | lids: [2] | children: [2] | answer: False
cid: 2 | label: PRP | text: ['you'] | lids: [2] | children: [] | answer: False
cid: 100003 | label: VP | text: ['doing', 'today'] | lids: [3, 4] | children: [3, 100002] | answer: False
cid: 3 | label: VBG | text: ['doing'] | lids: [3] | c

: 