# Usage
- direct execution should do
- load trained model for 
- load glove embedding + Stanza NLP Pipeline
- test with my_data_list
    - you can modify data in here
- 0, 1, 2 is contradiction, neutral, entail, respectively

In [1]:
%load_ext autoreload
%autoreload 2

In [223]:
from model import *
import config
import data
import utils

from pathlib import Path
from copy import deepcopy

from torch_geometric.data.data import Data
from torch_geometric.data import DataLoader
## model
import stanza
from stanza.models.common.doc import Document

from collections import defaultdict
from sklearn import metrics
from random import sample
from tqdm import tqdm as tqdm

# Load Model

In [3]:
PATH = config.PARAM_PATH / "SynNLIv0.1_glove_GAT3"
PATH

PosixPath('/work/2020-IIS-NLU-internship/MNLI/param/SynNLIv0.1_glove_GAT3')

In [4]:
PARAM = PATH / "model_epoch4_precision:0.611_recall:0.618_f1:0.612_acc:0.612.m"

In [5]:
nli_config_dict_probe = deepcopy(config.nli_config_dict)
nli_config_dict_probe["embedding"] = None
nli_config_dict_probe = config.Model_Config(nli_config_dict_probe)

In [6]:
model = SynNLI_Model(nli_config=nli_config_dict_probe)

In [7]:
model.load_state_dict(torch.load(PARAM))

<All keys matched successfully>

# Load Stanza nlp and GLOVE data

In [14]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')

2020-08-10 16:58:11 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |
| depparse  | ewt     |

2020-08-10 16:58:11 INFO: Use device: gpu
2020-08-10 16:58:11 INFO: Loading: tokenize
2020-08-10 16:58:11 INFO: Loading: pos
2020-08-10 16:58:13 INFO: Loading: lemma
2020-08-10 16:58:13 INFO: Loading: depparse
2020-08-10 16:58:15 INFO: Done loading processors!


In [24]:
glove_data = utils.load_glove_vector()

100%|██████████| 1917494/1917494 [02:27<00:00, 13016.51it/s]


In [26]:
glove, words, word2idx, idx = glove_data

# Probing

In [18]:
ANLI2 ="""
"Don Wayne Reno (born February 8, 1963 in Roanoke, Virginia) is a bluegrass musician and banjo player, and also an ordained minister.
He is a son of famed bluegrass musician Don Reno.
Reno was for several years a mainstay of Hayseed Dixie with his brother Dale Reno as the mandolinist.
He currently works with his brother and Mitch Harrell in the band Reno and Harrell."
"""

In [33]:
my_data_list = [ {
    config.lf : config.label_to_id["entailment"],
    config.pf : nlp("He is a cat with furries").to_dict(),
    config.hf : nlp("He is a cat").to_dict(),
    config.idf : "0001e"
},
{
    config.lf : config.label_to_id["contradiction"],
    config.pf : nlp("He is a cat").to_dict(),
    config.hf : nlp("He is not a cat").to_dict(),
    config.idf : "0001e"
},
{
    config.lf : config.label_to_id["entailment"],
    config.pf : nlp("He is a good cat").to_dict(),
    config.hf : nlp("He is good").to_dict(),
    config.idf : "0001e"
},
{
    config.lf : config.label_to_id["contradiction"],
    config.pf : nlp("He is a good cat").to_dict(),
    config.hf : nlp("He is a bad cat").to_dict(),
    config.idf : "0001e"
},
{# wrong
    config.lf : config.label_to_id["entailment"],
    config.pf : nlp("He is a good cat").to_dict(),
    config.hf : nlp("He is not a bad cat").to_dict(),
    config.idf : "0001e"
}, 
{# catch neutral in ANLI!

    config.lf : config.label_to_id["neutral"],
    config.pf : nlp("Don Wayne Reno is a musician").to_dict(),
    config.hf : nlp(ANLI2).to_dict(),
    config.idf : "0001e"
}, 
{# catch entail in ANLI!

    config.lf : config.label_to_id["entailment"],
    config.pf : nlp(ANLI2).to_dict(),
    config.hf : nlp("Don Wayne Reno is a musician").to_dict(),
    config.idf : "0001e"
}, 
{# wrong, basketball

    config.lf : config.label_to_id["contradiction"],
    config.pf : nlp(ANLI2).to_dict(),
    config.hf : nlp("Don Wayne Reno is a basketball player").to_dict(),
    config.idf : "0001e"
}, 
{# vcan catch a an no, but cannot catch other adj that can harm maning of words

    config.lf : config.label_to_id["contradiction"],
    config.pf : nlp(ANLI2).to_dict(),
    config.hf : nlp("Don Wayne Reno has no brother").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"
}, 
    {# vcan catch a an no, but cannot catch other adj that can harm maning of words

    config.lf : config.label_to_id["contradiction"],
    config.pf : nlp("He did it from top to bottom").to_dict(),
    config.hf : nlp("He did it from bottom to top").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"
}, 
]

In [34]:
tl = DataLoader([data.GraphData(my_data, word2idx) for my_data in my_data_list], batch_size=32, follow_batch=config.follow_batch)

In [35]:
model._predict(next(iter(tl)))

tensor([2, 0, 2, 0, 0, 1, 2, 2, 0, 2])

In [31]:
config.id_to_label

dict_keys(['contradiction', 'neutral', 'entailment'])

# Probing Functions 

In [50]:
def probe(model=model, data_list=None):
    labels = ["contradiction", "neutral", "entailment"]
    loader = DataLoader([data.GraphData(my_data, word2idx) for my_data in data_list], batch_size=32, follow_batch=config.follow_batch)
    print([labels[data[config.lf]] for data in data_list])
    print([labels[idx]for idx in model._predict(next(iter(loader)))])
    return None

# negation probing
- single negation
    - simple
    - long
    - long and change position (actually trivial for graph?)
- double negation
    - simple 
    - long

In [74]:
data_neg = [
    # simple negation
    {config.lf : config.label_to_id["contradiction"],
    config.pf : nlp("Allen likes to eat pizza.").to_dict(),
    config.hf : nlp("Allen does not like to eat pizza").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
    # long simple negation
    {config.lf : config.label_to_id["contradiction"],
    config.pf : nlp("Tim likes to eat pizza. Tim likes to eat pizza. Tim likes to eat pizza. Allen likes to eat pizza.").to_dict(),
    config.hf : nlp("Allen does not like to eat pizza").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
    # long change pos simple negation
    {config.lf : config.label_to_id["contradiction"],
    config.pf : nlp("Tim likes to eat pizza. Tim likes to eat pizza. Tim likes to eat pizza. Allen likes to eat pizza.").to_dict(),
    config.hf : nlp("Allen does not like to eat pizza. Tim likes to eat pizza. Tim likes to eat pizza. Tim likes to eat pizza.").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
    # long simple entailment
    {config.lf : config.label_to_id["entailment"],
    config.pf : nlp("Tim likes to eat pizza. Tim likes to eat pizza. Tim likes to eat pizza. Allen likes to eat pizza.").to_dict(),
    config.hf : nlp("Allen likes to eat pizza").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
]
probe(model, data_neg)

['contradiction', 'contradiction', 'contradiction', 'entailment']
['contradiction', 'contradiction', 'contradiction', 'entailment']


In [251]:
data_double_neg = [
    # simple negation
    {config.lf : config.label_to_id["entailment"],
    config.pf : nlp("Allen likes to eat pizza.").to_dict(),
    config.hf : nlp("Allen does not hate to eat pizza").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
    # long simple negation
    {config.lf : config.label_to_id["entailment"],
    config.pf : nlp("Allen likes to eat pizza.").to_dict(),
    config.hf : nlp("Allen does not not like to eat pizza").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
    # long simple negation
    {config.lf : config.label_to_id["contradiction"],
    config.pf : nlp("Allen does likes to eat pizza.").to_dict(),
    config.hf : nlp("Allen doesn't like to eat pizza").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
]
probe(model, data_double_neg)

['entailment', 'entailment', 'contradiction']
['contradiction', 'contradiction', 'contradiction']


In [244]:
data_time = [
    # simple negation
    {config.lf : config.label_to_id["entailment"],
    config.pf : nlp("I go to school. After that, Allen not goes to school").to_dict(),
    config.hf : nlp("I go to school. After that, Allen doest not goes to school").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
    # long simple negation
    {config.lf : config.label_to_id["contradiction"],
    config.pf : nlp("I go to school. After that, Allen goes to school").to_dict(),
    config.hf : nlp("Allen goes to school. After that, I go to school").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
    # long change pos simple negation
    {config.lf : config.label_to_id["entailment"],
    config.pf : nlp("Allen is happy").to_dict(),
    config.hf : nlp("Not Allen is not happy").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
]
probe(model, data_time)

['entailment', 'contradiction', 'entailment']
['entailment', 'entailment', 'contradiction']


In [305]:
data_ner = [
    # simple negation
    {config.lf : config.label_to_id["neutral"],
    config.pf : nlp("The Great Wall is a famous building").to_dict(),
    config.hf : nlp("The Great Building is a famous buildingl").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
    # long simple negation
    {config.lf : config.label_to_id["neutral"],
    config.pf : nlp("The Great River of China  is a famous river").to_dict(),
    config.hf : nlp("The Great River of Newyork is a famous river").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
    # long change pos simple negation
    {config.lf : config.label_to_id["neutral"],
    config.pf : nlp("Allen likes to eat pizza.").to_dict(),
    config.hf : nlp("Tim does not likes to eat pizza.").to_dict(), # test a, no , aged, handsome
    config.idf : "0001e"},
]
probe(model, data_ner)

['neutral', 'neutral', 'neutral']
['entailment', 'entailment', 'contradiction']


# MisMatch Data

In [77]:
testset = data.GraphDataset(config.PDEV_MMA_FILE, word2idx=word2idx)

100%|██████████| 9832/9832 [00:17<00:00, 549.37it/s]


In [289]:
def get_report(model=model, dataset=None):
    pred = []
    label = []
    
    dev_loader = DataLoader(dataset, batch_size = 32, follow_batch=config.follow_batch, shuffle=False)
    for batch_i, batch in enumerate(tqdm(dev_loader)):
        #label.extend(torch.argmax(batch[config.label_field].cpu(), dim = 1).numpy().tolist()) # label, one hot to tensor, shape = (batch) now
        label.extend(torch.argmax(batch.label.view([-1, config.NUM_CLASSES]), dim=1, keepdim=False).tolist())
        for k in config.tensor_attr_list:
                batch.__dict__[k] = batch.__dict__[k]
        pred_batch = model._predict(batch).numpy().tolist()
        pred.extend(pred_batch)
        
    report = {
        'report' : metrics.classification_report(pred, label, output_dict=True),
        'confusion_matrix' : metrics.confusion_matrix(pred, label)
    }
    return report

# return list of (id, prediction, label)
def get_errors_from_dataset(model=model, dataset=None):
    loader = DataLoader(dataset, batch_size = 32, follow_batch=config.follow_batch, shuffle=False)
    errors
    from tqdm import tqdm
    for batch in tqdm(loader):
        pred_batch  = model._predict(batch)
        #print(batch.label.size())
        #print(dir(batch))
        label_batch = torch.argmax(batch.label, dim=1, keepdim=False)
        errors.extend([(idx, pred, label) for (idx, pred, label) in zip(batch.pid, pred_batch, label_batch) if pred != label])
    acc = 1 - (len(errors_id) / len(dataset))
    return errors,  acc

def get_instances_by_id(errors_id, dataset):
    instances = []
    errors_id = set(errors_id)
    #print(dir(dataset[0]))
    for data in dataset:
        if data.pid in errors_id:
            instances.append((utils.token2sent(data.x_p, words), utils.token2sent(data.x_h, words), data.label))
    return instances

In [293]:
errors, acc = get_errors_from_dataset(model, testset)
print("accuracy is " + str(acc))

100%|██████████| 308/308 [01:09<00:00,  4.41it/s]

accuracy is 0.624593165174939





In [None]:
report = get_report(model, testset)

In [298]:
report

{'report': {'0': {'precision': 0.5876543209876544,
   'recall': 0.7245053272450532,
   'f1-score': 0.648943421949557,
   'support': 2628},
  '1': {'precision': 0.6241610738255033,
   'recall': 0.541747572815534,
   'f1-score': 0.58004158004158,
   'support': 3605},
  '2': {'precision': 0.6595437481952064,
   'recall': 0.6346207279799945,
   'f1-score': 0.6468422543188898,
   'support': 3599},
  'accuracy': 0.624593165174939,
  'macro avg': {'precision': 0.623786381002788,
   'recall': 0.6336245426801939,
   'f1-score': 0.6252757521033422,
   'support': 9832},
  'weighted avg': {'precision': 0.627354981331473,
   'recall': 0.624593165174939,
   'f1-score': 0.622910748802585,
   'support': 9832}},
 'confusion_matrix': array([[1904,  457,  267],
        [ 740, 1953,  912],
        [ 596,  719, 2284]])}

In [297]:
print(len(errors))

7382


In [None]:
instances = get_instances_by_id(errors[:10][0], devset[:100])

In [291]:
for i, line in enumerate(errors[:10]):
    print("ID: " , errors_id[i])
    print("Premise: " , line[0], sep='\n')
    print("Hypthesis: ", line[1], sep='\n')
    print("Gold Label: ", line[2])
    print("Predicted Label: ", errors[i])

ID:  133794c
Premise: 
[ROOT] the answer has nothing to do with their cause , however , but with the simple fact that dictionaries are not exercises in [UNK] substitutability ; in other words , if one of the senses of run is ` operate ' ( as in she runs an engine factory ) , that does not make it valid to assume that one can substitute operate for run in we run in the marathon every year [UNK] [ROOT] although recognizing this as a shortcoming of dictionaries and assigning it arbitrarily to what , for lack of a better term , we might call the genius of the language , might seem trivial to the casual observer , it is a valid matter for concern in the realm of lexicology .
Hypthesis: 
[ROOT] dictionaries are indeed exercises in [UNK] substitutability .
Gold Label:  tensor([[1., 0., 0.]])
Predicted Label:  tensor(1)
ID:  25267e
Premise: 
[ROOT] for ` family hold back , ' an exhortation ensuring ample provender for guests .
Hypthesis: 
[ROOT] ' family hold back , ' an emphatic command to gu