# SynNLI v0

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from allennlp.predictors.predictor import Predictor #
import allennlp_models.structured_prediction
import allennlp_models.coref

In [121]:
import logging
from collections import Counter, defaultdict
from typing import Dict, Iterable

import jsonlines
import torch
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, LabelField, SequenceLabelField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, WhitespaceTokenizer
from allennlp.data.vocabulary import Vocabulary

In [4]:
import config

In [5]:
# Semantic-role-labelling predictor, loaded from AllenNLP's public model archive.
SRL_MODEL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz"
predictor_srl = Predictor.from_path(SRL_MODEL_ARCHIVE)

In [6]:
# Run the SRL predictor on a sample question and dump the raw prediction dict.
srl_sentence = "Did Uriah honestly think he could beat the game in under three hours?"
doc = predictor_srl.predict(sentence=srl_sentence)
print(doc)

{'verbs': [{'verb': 'think', 'description': 'Did [ARG0: Uriah] [ARGM-MNR: honestly] [V: think] [ARG1: he could beat the game in under three hours] ?', 'tags': ['O', 'B-ARG0', 'B-ARGM-MNR', 'B-V', 'B-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'I-ARG1', 'O']}, {'verb': 'could', 'description': 'Did Uriah honestly think he [V: could] beat the game in under three hours ?', 'tags': ['O', 'O', 'O', 'O', 'O', 'B-V', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'verb': 'beat', 'description': 'Did Uriah honestly think [ARG0: he] [ARGM-MOD: could] [V: beat] [ARG1: the game] in [ARGM-TMP: under three hours] ?', 'tags': ['O', 'O', 'O', 'O', 'B-ARG0', 'B-ARGM-MOD', 'B-V', 'B-ARG1', 'I-ARG1', 'O', 'B-ARGM-TMP', 'I-ARGM-TMP', 'I-ARGM-TMP', 'O']}], 'words': ['Did', 'Uriah', 'honestly', 'think', 'he', 'could', 'beat', 'the', 'game', 'in', 'under', 'three', 'hours', '?']}


In [7]:
# Biaffine dependency-parser predictor, loaded from AllenNLP's public model archive.
DEP_MODEL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
predictor_dep = Predictor.from_path(DEP_MODEL_ARCHIVE)

Did not use initialization regex that was passed: .*weight_hh.*
Did not use initialization regex that was passed: .*projection.*weight
Did not use initialization regex that was passed: .*weight_ih.*
Did not use initialization regex that was passed: .*bias_hh.*
Did not use initialization regex that was passed: .*projection.*bias
Did not use initialization regex that was passed: .*bias_ih.*


In [8]:
# Parse a sample sentence, print the full prediction, then print the tokens
# next to their predicted head indices for a quick visual check.
dep_sentence = "If I bring 10 dollars tomorrow, can you buy me lunch?"
doc = predictor_dep.predict(sentence=dep_sentence)
print(doc)
print("\n", doc["words"], doc["predicted_heads"])

Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.


{'arc_loss': 0.37905168533325195, 'tag_loss': 0.5370486974716187, 'loss': 0.9161003828048706, 'words': ['If', 'I', 'bring', '10', 'dollars', 'tomorrow', ',', 'can', 'you', 'buy', 'me', 'lunch', '?'], 'pos': ['SCONJ', 'PRON', 'VERB', 'NUM', 'NOUN', 'NOUN', 'PUNCT', 'VERB', 'PRON', 'VERB', 'PRON', 'NOUN', 'PUNCT'], 'predicted_dependencies': ['mark', 'nsubj', 'advcl', 'dep', 'dobj', 'tmod', 'advmod', 'aux', 'nsubj', 'root', 'dobj', 'dep', 'discourse'], 'predicted_heads': [3, 3, 10, 5, 3, 3, 10, 10, 10, 0, 10, 11, 10], 'hierplane_tree': {'text': 'If I bring 10 dollars tomorrow , can you buy me lunch ?', 'root': {'word': 'buy', 'nodeType': 'root', 'attributes': ['VERB'], 'link': 'root', 'spans': [{'start': 41, 'end': 45}], 'children': [{'word': 'bring', 'nodeType': 'advcl', 'attributes': ['VERB'], 'link': 'advcl', 'spans': [{'start': 5, 'end': 11}], 'children': [{'word': 'If', 'nodeType': 'mark', 'attributes': ['SCONJ'], 'link': 'mark', 'spans': [{'start': 0, 'end': 3}]}, {'word': 'I', 'nod

In [9]:
# SpanBERT-large coreference predictor, loaded from AllenNLP's public model archive.
COREF_MODEL_ARCHIVE = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
predictor_coref = Predictor.from_path(COREF_MODEL_ARCHIVE)

Did not use initialization regex that was passed: _context_layer._module.weight_ih.*
Did not use initialization regex that was passed: _context_layer._module.weight_hh.*


In [10]:
# Resolve coreference in a one-sentence document; the call is the cell's last
# expression, so the raw result dict is displayed as the cell output.
coref_document = "The woman reading a newspaper sat on the bench with her dog."
predictor_coref.predict(document=coref_document)

{'top_spans': [[0, 4], [5, 5], [7, 8], [10, 10], [10, 11]],
 'antecedent_indices': [[0, 1, 2, 3, 4],
  [0, 1, 2, 3, 4],
  [0, 1, 2, 3, 4],
  [0, 1, 2, 3, 4],
  [0, 1, 2, 3, 4]],
 'predicted_antecedents': [-1, -1, -1, 0, -1],
 'document': ['The',
  'woman',
  'reading',
  'a',
  'newspaper',
  'sat',
  'on',
  'the',
  'bench',
  'with',
  'her',
  'dog',
  '.'],
 'clusters': [[[0, 4], [10, 10]]]}

In [None]:
@DatasetReader.register('nli-jsonl')
class NLI_Jsonl_Reader(DatasetReader):
    """Dataset reader that turns JSON-lines NLI records into AllenNLP instances.

    Each record is a dict. The premise, hypothesis and label are looked up
    under the keys named by ``config.pf``, ``config.hf`` and ``config.lf``.

    Args:
        tokenizer: Tokenizer applied to both sentences. Defaults to
            ``WhitespaceTokenizer()``.
        token_indexers: Indexers attached to both text fields. Defaults to a
            single ``SingleIdTokenIndexer`` stored under the key ``'tokens'``.
        max_tokens: If set (and non-zero), each sentence is truncated to at
            most this many tokens.
        **kwargs: Forwarded unchanged to ``DatasetReader.__init__``.
    """

    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = None,
                 **kwargs):
        super().__init__(**kwargs)
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens

    def dict_to_instance(self, jdata: dict, label: str = None) -> Instance:
        """Build an ``Instance`` from one decoded JSON record.

        Args:
            jdata: Record holding the premise under ``config.pf`` and the
                hypothesis under ``config.hf``; may also hold a label under
                ``config.lf``.
            label: Explicit label. When omitted, the label stored in the
                record (if any) is used instead.

        Returns:
            An instance with one ``TextField`` per sentence, keyed by
            ``config.pf`` and ``config.hf``, plus a ``'label'`` ``LabelField``
            when a non-empty label is available.

        Raises:
            KeyError: If the premise or hypothesis key is missing.
        """
        premise, hypothesis = jdata[config.pf], jdata[config.hf]
        # Fall back to the record's own label so `_read` yields labelled
        # instances; `.get` keeps unlabelled records usable for prediction.
        if label is None:
            label = jdata.get(config.lf)

        p_tokens = self.tokenizer.tokenize(premise)
        h_tokens = self.tokenizer.tokenize(hypothesis)
        if self.max_tokens:
            p_tokens = p_tokens[:self.max_tokens]
            h_tokens = h_tokens[:self.max_tokens]

        fields = {
            config.pf: TextField(p_tokens, self.token_indexers),
            config.hf: TextField(h_tokens, self.token_indexers),
        }
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one instance per JSON line found in ``file_path``."""
        with jsonlines.open(file_path, "r") as records:
            for jdata in records:
                yield self.dict_to_instance(jdata)


# Instantiate the reader defined above and use it to read a data file.
# NOTE(review): this cell previously called `ClassificationTsvReader`, which is
# not defined anywhere in this notebook; `NLI_Jsonl_Reader` is the reader
# declared in this cell.
# TODO: the path below still points at a movie-review TSV file. Replace it
# with the NLI JSON-lines training file before running.
reader = NLI_Jsonl_Reader()
dataset = reader.read('quick_start/data/movie_review/train.tsv')

# Returned dataset is a list of Instances by default
print('type of dataset: ', type(dataset))
print('type of its first element: ', type(dataset[0]))
print('size of dataset: ', len(dataset))

In [134]:
# Random float tensor used as the device/dtype reference in the cells below.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gt = torch.rand((1, 2)).to(device)

In [135]:
# Inspect the dtype torch.rand produced for `gt`.
gt.dtype

torch.float32

In [137]:
# Inspect the type (as a string) of the device `gt` currently lives on.
gt.device.type

'cuda'

In [144]:
# Build an integer (long) tensor without naming a device, then display the
# device type it was placed on.
gt2 = torch.ones(1, 2, dtype=torch.int64)
gt2.device.type

'cpu'

In [148]:
# Align gt2 with gt: move it to gt's device and cast it to gt's dtype in a
# single conversion, then report the resulting device type and dtype.
gt2 = gt2.to(device=gt.device, dtype=gt.dtype)
print(gt2.device.type, gt2.dtype)

cuda torch.float32
