In [1]:
# Put the directory one level above the current one at the front of sys.path
# so that the imports below are also resolved against it.
# NOTE(review): presumably that is where the project-local XLNet,
# span_detection_metrics and utils modules live — confirm.
import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir) 
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, SequentialSampler, DataLoader
from transformers import XLNetTokenizer, XLNetForSequenceClassification, XLNetPreTrainedModel, XLNetModel
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from XLNet import (Dataset_Span_Detection,
                   XLNetForMultiSequenceClassification,
                   SpanDetectionResult, 
                   SquadExample,
                   SquadFeatures,
                   squad_convert_example_to_features)
# Explicit form of the wildcard import below, kept for reference:
# from span_detection_metrics import compute_predictions_log_probs, span_evaluate
from span_detection_metrics import *

# NOTE(review): wildcard import — names used later in this notebook without a
# visible import (e.g. attention_weight_span) presumably come from here;
# confirm and prefer explicit imports.
from utils import *



import pandas as pd
import numpy as np
import random
from tqdm.notebook import tqdm, trange
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Evaluation data: iterated in dataset order, one item per step.
dataset = Dataset_Span_Detection("RTE5_test_span", tokenizer=tokenizer)
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=1)

# Alternative model sources kept for reference:
#   XLNetForMultiSequenceClassification.from_pretrained("xlnet-base-cased", output_attentions=True)
#   '../2multi_task/2multi_0.61, 22.2.pkl'
#   '../single_task/0512_single_task_0.61, 17.8.pkl'
#   '../3multi_task/multi_0.6, 25, 15.pkl'
#
# The checkpoint is mapped onto `device` (not hard-coded to the CPU) because
# the evaluation loop moves every input tensor to `device`; a CPU-resident
# model fed CUDA inputs raises a device-mismatch error.
# NOTE: torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
model = torch.load('../3multi_task/0513-3multi_0.61, 38.9, 7.pkl', map_location=device)


In [3]:
# Per-example accumulators consumed by the prediction/metric step.
all_results = []
all_examples = []
all_features = []

# The inputs below are moved to `device`, so the model has to live there too;
# a checkpoint loaded with map_location='cpu' would otherwise hit a device
# mismatch on a CUDA machine. Module.to() works in place and does nothing
# when the model is already on `device`.
model.to(device)
# Evaluation mode does not change between iterations, so set it once.
model.eval()

# Task id for the second forward pass, whose argmax is used as the label
# prediction (named "3way" below — presumably the 3-way classification head;
# confirm against the model definition).
eval_task = 0

with torch.no_grad():
    for data in tqdm(eval_dataloader, desc="Evaluating"):
        # Batch layout as consumed here: [0] task, [1:6] model inputs,
        # [6] example index, [7] unique id, last five entries = raw example fields.
        task = data[0]
        example_index = data[6]
        unique_id = data[7]
        # Drop the DataLoader's batch dimension (batch_size=1) and move to `device`.
        input_ids, attention_mask, token_type_ids, cls_index, p_mask = [
            t.squeeze(0).to(device) for t in data[1:6]
        ]

        question_text, context_text, answer_text, start_position_character, label = [
            t[0] for t in data[-5:]
        ]

        # Forward pass for the task this example belongs to.
        output = model(input_ids=input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask,
                       cls_index=cls_index,
                       p_mask=p_mask,
                       task=task)

        # Second forward pass with task=eval_task to obtain a label prediction.
        outputs_3way = model(input_ids=input_ids,
                             token_type_ids=token_type_ids,
                             attention_mask=attention_mask,
                             task=eval_task,
                             )
        logits = outputs_3way[0]
        pred = logits.argmax(dim=1)
        # Keep the predicted label only when it matches the gold label;
        # otherwise the example carries pred=None.
        pred = pred if pred == label else None

        example = SquadExample(
            question_text=question_text,
            context_text=context_text,
            answer_text=answer_text,
            start_position_character=start_position_character,
            unique_id=unique_id,
            pred=pred,
        )

        feature = squad_convert_example_to_features(example,
                                                    max_seq_length=384,
                                                    doc_stride=128,
                                                    max_query_length=64,
                                                    is_training=False,
                                                    example_index=example_index,
                                                    unique_id=unique_id,
                                                    )

        # An earlier revision read these fields straight from `output`
        # (indices 0-4: start_logits, start_top_index, end_logits,
        # end_top_index, cls_logits); they now come from attention_weight_span,
        # which additionally returns top_n.
        start_logits, start_top_index, end_logits, end_top_index, cls_logits, top_n = attention_weight_span(data, feature, output)

        result = SpanDetectionResult(
            unique_id,
            start_logits.unsqueeze(0),
            end_logits,
            start_top_index=start_top_index.unsqueeze(0),
            end_top_index=end_top_index,
            cls_logits=cls_logits,
            top_n=top_n
        )

        all_results.append(result)
        all_examples.append(example)
        all_features.append(feature)

HBox(children=(IntProgress(value=0, description='Evaluating', max=600, style=ProgressStyle(description_width='…






In [4]:
# Post-processing settings passed to compute_predictions_log_probs below.
start_n_top = 5
end_n_top = 10
n_best_size = 5
max_answer_length = 50
min_answer_length = 5
do_lower_case=False

output_dir = "../evaluation/"
# Tag embedded in the three output file names. Other tags seen in this
# notebook: '50-5, 3Tasks_3-46', '3tasks_1%'.
# NOTE(review): the checkpoint loaded earlier in this notebook sits under
# '../3multi_task/'; confirm that 'pretrained' is the intended tag for it.
prefix = 'pretrained'

# Create the output directory up front so that files can be written into it
# on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

output_prediction_file = os.path.join(output_dir, "prediction_{}.json".format(prefix))
output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
output_best_answers_file = os.path.join(output_dir, "best_answers_{}.json".format(prefix))

# Turn the accumulated per-example results into predictions.
predictions = compute_predictions_log_probs(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    min_answer_length,
    output_prediction_file,
    output_nbest_file,
    start_n_top,
    end_n_top,
    tokenizer,
    verbose_logging=True,
)

# Score the predictions against the examples; displayed as the cell output.
result = span_evaluate(all_examples, predictions, output_best_answers_file)
result

OrderedDict([('exact', 16.0),
             ('f1', 65.19),
             ('total', 600),
             ('Entailment_exact', 12.82),
             ('Entailment_f1', 70.44),
             ('Entailment_accuracy', 26.0),
             ('Neutral_exact', 6.17),
             ('Neutral_f1', 50.34),
             ('Neutral_accuracy', 38.57),
             ('Contradiction_exact', 21.95),
             ('Contradiction_f1', 58.45),
             ('Contradiction_accuracy', 45.56)])

In [None]:
# Alternative checkpoint paths kept for reference:
#   '../single_task/0512_single_task_0.61, 17.8.pkl'
#   '../3multi_task/multi_0.6, 25, 15.pkl'
#
# The checkpoint is mapped onto `device` (not hard-coded to the CPU) because
# the loop below moves every input tensor to `device`; a CPU-resident model
# fed CUDA inputs raises a device-mismatch error.
# NOTE: torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
model = torch.load('../2multi_task/2multi_0.61, 22.2.pkl', map_location=device)
# Evaluation mode does not change between iterations, so set it once.
model.eval()

# Per-example accumulators consumed by the prediction/metric step below.
all_results = []
all_examples = []
all_features = []

# Task id for the second forward pass, whose argmax is used as the label
# prediction (named "3way" below — presumably the 3-way classification head;
# confirm against the model definition).
eval_task = 0

with torch.no_grad():
    for data in tqdm(eval_dataloader, desc="Evaluating"):
        # Batch layout as consumed here: [0] task, [1:6] model inputs,
        # [6] example index, [7] unique id, last five entries = raw example fields.
        task = data[0]
        example_index = data[6]
        unique_id = data[7]
        # Drop the DataLoader's batch dimension and move to `device`.
        input_ids, attention_mask, token_type_ids, cls_index, p_mask = [
            t.squeeze(0).to(device) for t in data[1:6]
        ]

        question_text, context_text, answer_text, start_position_character, label = [
            t[0] for t in data[-5:]
        ]

        # Forward pass for the task this example belongs to.
        output = model(input_ids=input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask,
                       cls_index=cls_index,
                       p_mask=p_mask,
                       task=task)

        # Second forward pass with task=eval_task to obtain a label prediction.
        outputs_3way = model(input_ids=input_ids,
                             token_type_ids=token_type_ids,
                             attention_mask=attention_mask,
                             task=eval_task,
                             )
        logits = outputs_3way[0]
        pred = logits.argmax(dim=1)
        # Keep the predicted label only when it matches the gold label;
        # otherwise the example carries pred=None.
        pred = pred if pred == label else None

        example = SquadExample(
            question_text=question_text,
            context_text=context_text,
            answer_text=answer_text,
            start_position_character=start_position_character,
            unique_id=unique_id,
            pred=pred,
        )

        feature = squad_convert_example_to_features(example,
                                                    max_seq_length=384,
                                                    doc_stride=128,
                                                    max_query_length=64,
                                                    is_training=False,
                                                    example_index=example_index,
                                                    unique_id=unique_id,
                                                    )

        # An earlier revision read these fields straight from `output`
        # (indices 0-4: start_logits, start_top_index, end_logits,
        # end_top_index, cls_logits); they now come from attention_weight_span,
        # which additionally returns top_n.
        start_logits, start_top_index, end_logits, end_top_index, cls_logits, top_n = attention_weight_span(data, feature, output)

        result = SpanDetectionResult(
            unique_id,
            start_logits.unsqueeze(0),
            end_logits,
            start_top_index=start_top_index.unsqueeze(0),
            end_top_index=end_top_index,
            cls_logits=cls_logits,
            top_n=top_n
        )

        all_results.append(result)
        all_examples.append(example)
        all_features.append(feature)

# Post-processing settings passed to compute_predictions_log_probs below.
start_n_top = 5
end_n_top = 10
n_best_size = 5
max_answer_length = 50
min_answer_length = 5
do_lower_case=False

output_dir = "../evaluation/"
# Tag embedded in the three output file names. Other tags seen in this
# notebook: '50-5, 3Tasks_3-46', '3tasks_1%'.
prefix = '2tasks'

# Create the output directory up front so that files can be written into it
# on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

output_prediction_file = os.path.join(output_dir, "prediction_{}.json".format(prefix))
output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
output_best_answers_file = os.path.join(output_dir, "best_answers_{}.json".format(prefix))

# Turn the accumulated per-example results into predictions.
predictions = compute_predictions_log_probs(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    min_answer_length,
    output_prediction_file,
    output_nbest_file,
    start_n_top,
    end_n_top,
    tokenizer,
    verbose_logging=True,
)

# Score the predictions against the examples; displayed as the cell output.
result = span_evaluate(all_examples, predictions, output_best_answers_file)
result

In [None]:
# Alternative checkpoint path kept for reference:
#   '../3multi_task/multi_0.6, 25, 15.pkl'
#
# The checkpoint is mapped onto `device` (not hard-coded to the CPU) because
# the loop below moves every input tensor to `device`; a CPU-resident model
# fed CUDA inputs raises a device-mismatch error.
# NOTE: torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
model = torch.load('../single_task/0512_single_task_0.61, 17.8.pkl', map_location=device)
# Evaluation mode does not change between iterations, so set it once.
model.eval()

# Per-example accumulators consumed by the prediction/metric step below.
all_results = []
all_examples = []
all_features = []

# Task id for the second forward pass, whose argmax is used as the label
# prediction (named "3way" below — presumably the 3-way classification head;
# confirm against the model definition).
eval_task = 0

with torch.no_grad():
    for data in tqdm(eval_dataloader, desc="Evaluating"):
        # Batch layout as consumed here: [0] task, [1:6] model inputs,
        # [6] example index, [7] unique id, last five entries = raw example fields.
        task = data[0]
        example_index = data[6]
        unique_id = data[7]
        # Drop the DataLoader's batch dimension and move to `device`.
        input_ids, attention_mask, token_type_ids, cls_index, p_mask = [
            t.squeeze(0).to(device) for t in data[1:6]
        ]

        question_text, context_text, answer_text, start_position_character, label = [
            t[0] for t in data[-5:]
        ]

        # Forward pass for the task this example belongs to.
        output = model(input_ids=input_ids,
                       token_type_ids=token_type_ids,
                       attention_mask=attention_mask,
                       cls_index=cls_index,
                       p_mask=p_mask,
                       task=task)

        # Second forward pass with task=eval_task to obtain a label prediction.
        outputs_3way = model(input_ids=input_ids,
                             token_type_ids=token_type_ids,
                             attention_mask=attention_mask,
                             task=eval_task,
                             )
        logits = outputs_3way[0]
        pred = logits.argmax(dim=1)
        # Keep the predicted label only when it matches the gold label;
        # otherwise the example carries pred=None.
        pred = pred if pred == label else None

        example = SquadExample(
            question_text=question_text,
            context_text=context_text,
            answer_text=answer_text,
            start_position_character=start_position_character,
            unique_id=unique_id,
            pred=pred,
        )

        feature = squad_convert_example_to_features(example,
                                                    max_seq_length=384,
                                                    doc_stride=128,
                                                    max_query_length=64,
                                                    is_training=False,
                                                    example_index=example_index,
                                                    unique_id=unique_id,
                                                    )

        # An earlier revision read these fields straight from `output`
        # (indices 0-4: start_logits, start_top_index, end_logits,
        # end_top_index, cls_logits); they now come from attention_weight_span,
        # which additionally returns top_n.
        start_logits, start_top_index, end_logits, end_top_index, cls_logits, top_n = attention_weight_span(data, feature, output)

        result = SpanDetectionResult(
            unique_id,
            start_logits.unsqueeze(0),
            end_logits,
            start_top_index=start_top_index.unsqueeze(0),
            end_top_index=end_top_index,
            cls_logits=cls_logits,
            top_n=top_n
        )

        all_results.append(result)
        all_examples.append(example)
        all_features.append(feature)

# Post-processing settings passed to compute_predictions_log_probs below.
start_n_top = 5
end_n_top = 10
n_best_size = 5
max_answer_length = 50
min_answer_length = 5
do_lower_case=False

output_dir = "../evaluation/"
# Tag embedded in the three output file names. Other tags seen in this
# notebook: '50-5, 3Tasks_3-46', '3tasks_1%'.
prefix = '1task'

# Create the output directory up front so that files can be written into it
# on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

output_prediction_file = os.path.join(output_dir, "prediction_{}.json".format(prefix))
output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
output_best_answers_file = os.path.join(output_dir, "best_answers_{}.json".format(prefix))

# Turn the accumulated per-example results into predictions.
predictions = compute_predictions_log_probs(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    min_answer_length,
    output_prediction_file,
    output_nbest_file,
    start_n_top,
    end_n_top,
    tokenizer,
    verbose_logging=True,
)

# Score the predictions against the examples; displayed as the cell output.
result = span_evaluate(all_examples, predictions, output_best_answers_file)
result