In [3]:
import random
import nltk
import numpy as np
import torch
from pytorch_transformers import BertTokenizer
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

In [4]:
import bertmodel
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)

from utils_squad import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad

In [5]:
def load_and_cache_examples(input_file, tokenizer):
    """Read SQuAD examples, convert them to features and wrap them in a TensorDataset.

    Besides its arguments, this function reads the notebook-level globals
    ``model_name``, ``max_seq_length``, ``doc_stride`` and ``max_query_length``;
    they must be defined before the function is called.

    Args:
        input_file: Path of the SQuAD json file, passed to ``read_squad_examples``.
        tokenizer: Tokenizer passed to ``convert_examples_to_features``.

    Returns:
        A tuple ``(dataset, examples, features)``. ``dataset`` is a
        ``TensorDataset`` whose columns are, in order: input ids, input mask,
        segment ids, feature index, cls index and p mask. The fourth column is
        ``arange(len(features))``, i.e. the position of each row in
        ``features`` (not the index of the originating example).
    """
    cached_features_file = 'Ecached_dev_{}_{}'.format(model_name, str(max_seq_length))

    # NOTE: the features are always rebuilt. The cache file is written below
    # but never read back; an earlier "load from cache if it exists" branch was
    # disabled. The file name does not encode input_file, doc_stride or
    # max_query_length, so re-enabling the load would need a more specific key.
    print("Creating features from dataset file at {}".format(input_file))
    examples = read_squad_examples(input_file=input_file,
                                   is_training=False,
                                   version_2_with_negative=False)
    features = convert_examples_to_features(examples=examples,
                                            tokenizer=tokenizer,
                                            max_seq_length=max_seq_length,
                                            doc_stride=doc_stride,
                                            max_query_length=max_query_length,
                                            is_training=False)
    print("Saving features into cached file {}".format(cached_features_file))
    torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

    # One running index per feature row; callers use it to look the row up in `features`.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_example_index, all_cls_index, all_p_mask)
    return dataset, examples, features

In [11]:
# --- Model / tokenizer selection -------------------------------------------
model_name = "bert-base-cased"
do_lower_case = False
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=do_lower_case)

# --- Input data --------------------------------------------------------------
dev_file = "dev-v1.1.json"

# --- Feature-conversion settings (read as globals by load_and_cache_examples) --
max_seq_length = 384
doc_stride = 128
max_query_length = 64

# --- Prediction settings -----------------------------------------------------
# Not referenced by any cell visible here; presumably intended for
# write_predictions / evaluation -- confirm before removing.
max_answer_length = 30
null_score_diff_threshold = 0
output_null_log_odds_file = None

In [43]:
# Prefer GPU 4 (the device this notebook was originally run on). Fall back to
# the first GPU, and then to the CPU, so that the later `.to(device)` call does
# not fail on machines with fewer GPUs or without CUDA.
_preferred_gpu = 4
if torch.cuda.is_available() and torch.cuda.device_count() > _preferred_gpu:
    device = torch.device("cuda:{}".format(_preferred_gpu))
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [12]:
# Build the evaluation data: the tensor dataset plus the raw examples and the
# features it was built from.
dataset, examples, features = load_and_cache_examples(input_file=dev_file,
                                                      tokenizer=tokenizer)

# Iterate the dataset in its stored order, one feature row per batch.
eval_sampler = SequentialSampler(data_source=dataset)
eval_dataloader = DataLoader(dataset=dataset,
                             sampler=eval_sampler,
                             batch_size=1)

Creating features from dataset file at %s dev-v1.1.json
Saving features into cached file %s Ecached_dev_bert-base-cased_384


In [44]:
# Load the AllenNLP semantic-role-labeling predictor from its model archive URL.
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz")
# from_path is called without a device argument here, so the wrapped model is
# moved to the selected device explicitly (this reaches into the private
# `_model` attribute of the predictor).
predictor._model = predictor._model.to(device)

In [45]:
# Run semantic role labeling on the question of every example.
# `srls` maps example_index -> predictor output for that example's question.
srls = {}
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    # Column 3 of the dataset holds the row's position in `features`.
    example_indices = batch[3]
    for example_idx in example_indices:
        eval_feature = features[example_idx.item()]
        example_index = eval_feature.example_index
        if example_index in srls:
            # More than one feature can belong to the same example
            # (len(features) > len(examples)); the question text is the same,
            # so the stored prediction is reused instead of re-running the model.
            continue
        srls[example_index] = predictor.predict(sentence=examples[example_index].question_text)

Evaluating: 100%|██████████| 10881/10881 [36:03<00:00,  5.09it/s]


In [20]:
# Sanity check: number of examples returned by load_and_cache_examples.
len(examples)

10570

In [26]:
# Sanity check: number of features; the recorded value is larger than
# len(examples), so some examples yield more than one feature.
len(features)

10881

In [31]:
# Inspect the unique_id assigned to the first feature.
features[0].unique_id

1000000000

In [34]:
# Inspect the question text of the first example (the string sent to the SRL predictor).
examples[0].question_text

'Which NFL team represented the AFC at Super Bowl 50?'

In [40]:
# Sanity check: number of distinct example indices with an SRL result;
# the recorded value equals len(examples).
len(srls)

10570