# SynNLI Clean Code

In [2]:
%load_ext autoreload
%autoreload 2

In [12]:
import config
import utils 
import data
import model

import stanza

## Load Raw Data, Parse Data, Save Data
- dependency parsing is done by Stanza
- tokenization is done implicitly by the Stanza pipeline

In [13]:
# Download Stanza's default English model package so the pipeline below can load it.
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 120kB [00:00, 903kB/s]                     
2020-08-07 00:28:32 INFO: Downloading default packages for language: en (English)...
2020-08-07 00:28:33 INFO: File exists: /root/stanza_resources/en/default.zip.
2020-08-07 00:28:40 INFO: Finished downloading models and saved to /root/stanza_resources.


In [15]:
# Build the English Stanza pipeline on CPU (use_gpu=False), running every
# processor up to and including dependency parsing.
stanza_processors = 'tokenize,mwt,pos,lemma,depparse'
nlp = stanza.Pipeline(
    lang='en',
    processors=stanza_processors,
    use_gpu=False,
)

2020-08-07 00:30:03 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| depparse  | ewt     |

2020-08-07 00:30:03 INFO: Use device: cpu
2020-08-07 00:30:03 INFO: Loading: tokenize
2020-08-07 00:30:03 INFO: Loading: pos
2020-08-07 00:30:04 INFO: Loading: lemma
2020-08-07 00:30:04 INFO: Loading: depparse
2020-08-07 00:30:05 INFO: Done loading processors!


In [24]:
# Load the raw (unparsed) examples from config.DEV_MA_FILE and show the first
# record to inspect which fields each example carries.
raw_data = utils.read_json_data(file_name=config.DEV_MA_FILE)
print(raw_data[0])

{'annotator_labels': ['neutral', 'entailment', 'neutral', 'neutral', 'neutral'], 'genre': 'slate', 'gold_label': 'neutral', 'pairID': '63735n', 'promptID': '63735', 'sentence1': 'The new rights are nice enough', 'sentence1_binary_parse': '( ( The ( new rights ) ) ( are ( nice enough ) ) )', 'sentence1_parse': '(ROOT (S (NP (DT The) (JJ new) (NNS rights)) (VP (VBP are) (ADJP (JJ nice) (RB enough)))))', 'sentence2': 'Everyone really likes the newest benefits ', 'sentence2_binary_parse': '( Everyone ( really ( likes ( the ( newest benefits ) ) ) ) )', 'sentence2_parse': '(ROOT (S (NP (NN Everyone)) (VP (ADVP (RB really)) (VBZ likes) (NP (DT the) (JJS newest) (NNS benefits)))))'}


## Load Parsed Data and Convert to Graph Data
- edge_attr and node_attr should store edge_type and node_type as text
- GraphDataset should build the vocabulary (voc) and the edge_type set here
- visualize an instance

In [27]:
# Parse the dev set with Stanza and write the result to config.PDEV_MA_FILE.
# The call is made for its side effect only: when the target file already
# exists and force_exe is False it just prints a notice and exits, so its
# return value is deliberately not bound to a name.
utils.parse_data(data_file=config.DEV_MA_FILE, target=config.PDEV_MA_FILE, nlp=nlp, function_test=False, force_exe=False)

# Load the parsed examples back from disk and show the first one.
p_data = utils.read_json_data(file_name=config.PDEV_MA_FILE)
print(p_data[0])

file /work/2020-IIS-NLU-internship/MNLI/data/MNLI_Stanza/pre_multinli_1.0_dev_matched.jsonl already exist
if u still want to procceed, add force_exe=True in function arg
exiting
{'pairID': '63735n', 'sentence1': [[{'id': '1', 'text': 'The', 'lemma': 'the', 'upos': 'DET', 'xpos': 'DT', 'feats': 'Definite=Def|PronType=Art', 'head': 3, 'deprel': 'det', 'misc': 'start_char=0|end_char=3'}, {'id': '2', 'text': 'new', 'lemma': 'new', 'upos': 'ADJ', 'xpos': 'JJ', 'feats': 'Degree=Pos', 'head': 3, 'deprel': 'amod', 'misc': 'start_char=4|end_char=7'}, {'id': '3', 'text': 'rights', 'lemma': 'rights', 'upos': 'NOUN', 'xpos': 'NNS', 'feats': 'Number=Plur', 'head': 6, 'deprel': 'nsubj', 'misc': 'start_char=8|end_char=14'}, {'id': '4', 'text': 'are', 'lemma': 'be', 'upos': 'AUX', 'xpos': 'VBP', 'feats': 'Mood=Ind|Tense=Pres|VerbForm=Fin', 'head': 6, 'deprel': 'cop', 'misc': 'start_char=15|end_char=18'}, {'id': '5', 'text': 'nice', 'lemma': 'nice', 'upos': 'ADJ', 'xpos': 'JJ', 'feats': 'Degree=Pos', '

In [50]:
# Turn the first parsed example into a graph object and print its contents
# (pair id, dependency edges, node tokens, edge index tensor, label tensor).
gdata = data.GraphData(p_data[0])
gdata.print_self()

63735n
[(3, 1, 'nsubj'), (3, 2, 'advmod'), (0, 3, 'root'), (6, 4, 'det'), (6, 5, 'amod'), (3, 6, 'obj')]
['[ROOT]', 'Everyone', 'really', 'likes', 'the', 'newest', 'benefits']
tensor([[3, 3, 0, 6, 6, 3],
        [1, 2, 3, 4, 5, 6]])
tensor([[0., 1., 0.]])


In [37]:
# Build the graph dataset from the parsed dev file; this iterates over every
# example (a progress bar is shown while it runs).
dev_set = data.GraphDataset(data_file=config.PDEV_MA_FILE)

100%|██████████| 9815/9815 [00:19<00:00, 494.47it/s]


In [38]:
# Inspect one dataset item: its repr lists the premise (_p) and hypothesis (_h)
# edge/node attributes, the label and the pair id.
print(dev_set[0])

GraphData(edge_attr_h=[6], edge_attr_p=[6], edge_index_h=[2, 6], edge_index_p=[2, 6], label=[1, 3], node_attr_h=[7], node_attr_p=[7], pid="63735n")


In [58]:
# Batch the dev set three graphs at a time.
# NOTE(review): data.DataLoader is presumably torch_geometric's DataLoader
# re-exported by the data module; follow_batch then requests the extra
# `node_attr_p_batch` / `node_attr_h_batch` attributes on each batch — confirm.
dev_loader = data.DataLoader(dev_set, batch_size=3, follow_batch=['node_attr_p', 'node_attr_h'])

In [59]:
# Pull the first batch from the loader and dump the fields of interest, one per
# line: pair ids, labels, then the hypothesis-side node/edge attributes.
test_batch = next(iter(dev_loader))
inspected_fields = (
    'pid',
    'label',
    'node_attr_h',
    'node_attr_h_batch',
    'edge_attr_h',
    'edge_index_h',
)
for field_name in inspected_fields:
    print(getattr(test_batch, field_name))

['63735n', '91383c', '755e']
tensor([[0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.]])
[['[ROOT]', 'Everyone', 'really', 'likes', 'the', 'newest', 'benefits'], ['[ROOT]', 'The', 'Government', 'Executive', 'articles', 'housed', 'on', 'the', 'website', 'are', 'not', 'able', 'to', 'be', 'searched', '.'], ['[ROOT]', 'I', 'like', 'him', 'for', 'the', 'most', 'part', ',', 'but', 'would', 'still', 'enjoy', 'seeing', 'someone', 'beat', 'him', '.']]
tensor([0, 1, 2])
[[[3, 1, 'nsubj'], [3, 2, 'advmod'], [0, 3, 'root'], [6, 4, 'det'], [6, 5, 'amod'], [3, 6, 'obj']], [[4, 1, 'det'], [4, 2, 'compound'], [4, 3, 'amod'], [11, 4, 'nsubj'], [4, 5, 'acl'], [8, 6, 'case'], [8, 7, 'det'], [5, 8, 'obl'], [11, 9, 'cop'], [11, 10, 'advmod'], [0, 11, 'root'], [14, 12, 'mark'], [14, 13, 'aux:pass'], [11, 14, 'xcomp'], [11, 15, 'punct']], [[2, 1, 'nsubj'], [0, 2, 'root'], [2, 3, 'obj'], [7, 4, 'case'], [7, 5, 'det'], [7, 6, 'advmod'], [2, 7, 'obl'], [12, 8, 'punct'], [12, 9, 'cc'], [12, 10, 'aux'], [1

## Model Testing
- feed a batch to the model here

In [None]:
# Loader that will feed batches to the model. It uses data.DataLoader because
# that is the only DataLoader reachable through this notebook's imports; a bare
# `DataLoader` raises NameError on a fresh kernel.
# NOTE(review): `train_config` is not defined or imported anywhere in this
# notebook — define/import it before running this cell. Also confirm that
# follow_batch=[] is intended here; the exploratory dev_loader earlier in the
# notebook follows 'node_attr_p' and 'node_attr_h'.
loader = data.DataLoader(dev_set, batch_size=train_config.batch_size, follow_batch=[])