In [1]:
import argparse
import glob
import logging
import os
import random

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tensorboardX import SummaryWriter
from tqdm import tqdm, trange
from XLNet import XLNetForMultiSequenceClassification

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    XLNetTokenizer,
    XLNetConfig,
    get_linear_schedule_with_warmup
)

from processor import SnliProcessor as processors
from processor import convert_examples_to_features
from metrics import snli_compute_metrics as compute_metrics
from statsmodels.sandbox.stats.runs import mcnemar

In [2]:
# Module-level logger used by the evaluation helper for its progress messages.
logger = logging.getLogger(__name__)
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
# Instantiate the SNLI processor (imported as `processors`); later cells call
# its `get_dev_examples` method to read the evaluation examples.
processor = processors()

In [65]:
# Load the tokenizer in this cell: it previously relied on a `tokenizer` left in
# the kernel by a cell that appears later in the notebook, so a fresh
# Restart & Run All failed here with NameError.
# NOTE(review): assumes both evaluated checkpoints were fine-tuned with this
# same tokenizer/vocabulary — confirm against the training runs.
tokenizer = XLNetTokenizer.from_pretrained('models/SNLI-1%/1t827')

# Read the examples returned by `get_dev_examples` for the data/ directory and
# convert them to model features, passing max_length=64.
examples = processor.get_dev_examples('data/')
features = convert_examples_to_features(
    examples,
    tokenizer,
    max_length=64,
)

# Stack each per-example field into one long tensor, then bundle the four
# tensors in the order the evaluation loop indexes them:
# (input_ids, attention_mask, token_type_ids, labels).
all_inputs_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
all_labels = torch.tensor([int(f.label) for f in features], dtype=torch.long)
eval_dataset = TensorDataset(all_inputs_ids, all_attention_mask, all_token_type_ids, all_labels)

In [8]:
def evalute(eval_dataset, model, tokenizer, prefix=""):
    """Run `model` over `eval_dataset` and collect predictions and gold labels.

    Args:
        eval_dataset: Dataset whose items are 4-tuples of tensors ordered as
            (input_ids, attention_mask, token_type_ids, labels).
        model: Callable model invoked with those tensors plus ``task=0``; the
            first two elements of its output are taken as (loss, logits).
        tokenizer: Not used by this function; kept so existing call sites
            keep working.
        prefix: Text inserted into the "Running evaluation" log line.

    Returns:
        ``(pred, label)``: two lists holding one NumPy array per evaluated
        task (currently only "snli"). ``pred[i]`` contains the argmax class
        index for every example and ``label[i]`` the matching gold labels.
        Both arrays are empty when the dataset yields no batches.
    """
    eval_task_names = ("snli",)
    # The trailing comma matters: ('data/') is a plain string, and zipping over
    # it paired the task with the single character 'd'.
    eval_outputs_dirs = ('data/',)
    eval_batch_size = 16

    pred = []
    label = []
    # Switching to eval mode once is enough; it does not change between batches.
    model.eval()
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        # Collect per-batch arrays and concatenate once at the end; growing an
        # array with np.append on every batch copies all previous data again.
        logits_chunks = []
        label_chunks = []
        for batch in tqdm(eval_dataloader, desc="Evaluating", position=0, leave=True, ncols=100):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2],
                          'labels':         batch[3],
                          'task':                 0,
                          }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            logits_chunks.append(logits.detach().cpu().numpy())
            label_chunks.append(inputs['labels'].detach().cpu().numpy())

        if not logits_chunks:
            # An empty dataset produces no batches; return empty results
            # instead of crashing in np.argmax(None, axis=1).
            logger.warning("  No evaluation batches were produced")
            pred.append(np.empty(0, dtype=np.int64))
            label.append(np.empty(0, dtype=np.int64))
            continue

        logger.info("  Mean eval loss = %f", eval_loss / nb_eval_steps)
        # Reduce the logits to the index of the highest-scoring class.
        preds = np.argmax(np.concatenate(logits_chunks, axis=0), axis=1)
        out_label_ids = np.concatenate(label_chunks, axis=0)

        pred.append(preds)
        label.append(out_label_ids)

    return pred, label

In [66]:
# First checkpoint: load its tokenizer and weights, move the model to the
# evaluation device, and collect predictions plus gold labels.
tokenizer = XLNetTokenizer.from_pretrained('models/SNLI-1%/1t827')
model = XLNetForMultiSequenceClassification.from_pretrained('models/SNLI-1%/1t821-1')
model.to(device)
pred_1_1t, label = evalute(eval_dataset=eval_dataset, model=model, tokenizer=tokenizer)

# Second checkpoint: same procedure on the same dataset. `label` is reassigned
# with the labels returned by this second run.
tokenizer = XLNetTokenizer.from_pretrained('models/SNLI-1%/2t833')
model = XLNetForMultiSequenceClassification.from_pretrained('models/SNLI-1%/2t829-1')
model.to(device)
pred_1_2t, label = evalute(eval_dataset=eval_dataset, model=model, tokenizer=tokenizer)

Evaluating: 100%|█████████████████████████████████████████████████| 616/616 [00:45<00:00, 13.54it/s]
Evaluating: 100%|█████████████████████████████████████████████████| 616/616 [00:45<00:00, 13.49it/s]


In [10]:
def simple_accuracy(preds, labels):
    """Return the fraction of positions where `preds` equals `labels`.

    Both arguments are converted with ``np.asarray``. They must have the same
    length along their first axis, which is checked with an assertion.
    """
    pred_arr, label_arr = np.asarray(preds), np.asarray(labels)
    assert len(pred_arr) == len(label_arr)
    matches = pred_arr == label_arr
    return matches.mean()
def convert_bi(preds, labels):
    """Return a flat boolean array marking where predictions match the labels.

    Both inputs are converted to arrays and flattened to one dimension before
    the element-wise comparison.
    """
    flat_preds = np.asarray(preds).reshape(-1)
    flat_labels = np.asarray(labels).reshape(-1)
    is_correct = flat_preds == flat_labels
    return is_correct

In [60]:
# Accuracy of the `pred_1_1t` predictions against the gold labels.
simple_accuracy(pred_1_1t, label)

0.8131107491856677

In [61]:
# Accuracy of the `pred_1_2t` predictions against the same gold labels.
simple_accuracy(pred_1_2t, label)

0.8189128664495114

In [62]:
# Flat boolean vector: True where a `pred_1_1t` prediction equals its label.
result_1t = convert_bi(pred_1_1t, label)

In [63]:
# Flat boolean vector: True where a `pred_1_2t` prediction equals its label.
result_2t = convert_bi(pred_1_2t, label)

In [64]:
# Compare the two paired correctness vectors with statsmodels' sandbox
# `mcnemar`, passing exact=False and correction=False.
# NOTE(review): the recorded output is presumably (statistic, p-value) — confirm
# against the statsmodels documentation for this sandbox function.
mcnemar(result_1t, result_2t, exact=False, correction=False)

(3.7216494845360826, 0.05371128596527664)