<a href="https://colab.research.google.com/github/guyez/NLP/blob/main/DistilBertQA_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
'''
Question Answering (QA) System using NLP with SQuAD
EE562 Group3 Project
Megha Chandra Nandyala
Amisha Himanshu Somaiya


APPROACH                   : DistilBERT Pretrained + Additional Head

ADDITIONAL HEAD            : 768->512->32->2
ACTIVATION FUNCTION        : GeLU_new

Evaluation on best model from training file

REFERENCES :
https://arxiv.org/abs/1810.04805
https://arxiv.org/abs/1910.01108
https://rajpurkar.github.io/SQuAD-explorer/
https://huggingface.co/models
https://huggingface.co/nlpunibo
https://huggingface.co/docs/transformers/model_doc/auto
https://huggingface.co/docs/transformers/main_classes/data_collator
https://discuss.huggingface.co/t/squad-bert-why-max-length-384-by-default-and-not-512/11693



'''


# Install required packages
!pip install transformers
!pip install datasets
!pip install accelerate -U

# Import required libraries
import numpy as np
import pandas as pd
import torch
import json
import sys
import time
import datetime
import random
import collections
from pathlib import Path
import transformers
import datasets
from datasets import load_dataset
# Connect Drive
from google.colab import drive
drive.mount("/content/drive")

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [

In [2]:
#connect GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
!nvidia-smi

Tue Dec 12 07:41:26 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
# #Path to get SQuAD   megha
# FOLDER_NAME = "Question-Answering-SQUAD-main/data"
# JSON_TEST_FILE = "test_set.json"
# data_path = "drive/My Drive/Colab Notebooks/" + FOLDER_NAME +"/"
# file_path = data_path + JSON_TEST_FILE
# checkpoint_path = data_path



#Path to get SQuAD    amisha
FOLDER_NAME = "Question-Answering-SQUAD-main/data"
JSON_TEST_FILE = "test_set.json"
data_path = "drive/My Drive/EE562_Group3_Project/distilled_bert/" + FOLDER_NAME +"/"
file_path = data_path + JSON_TEST_FILE
checkpoint_path = data_path

In [4]:
'''
Load SQuAD version 1.1 since our system is closed-domain.
Future Work is to add system functionality to work with open questions in SQuAD version 2.
'''
class LoadData():
    def __init__(self,
                 path_to_json_file: str,
                 checkpoint_path: str,
                 train_file: str = 'train.json',
                 val_file: str = 'val.json') -> None:

        self.path_to_json_file = path_to_json_file #specify paths to laoad
        self.checkpoint_path = checkpoint_path
        self.train_file = train_file
        self.val_file = val_file
        self.data = self.load_data()

    def load_data(self): #load data
        with open(self.path_to_json_file, 'r') as f:
            squad_data = json.load(f)
        version = squad_data.get("version", "")
        train_data, val_data, errors = self.load_squad_data(squad_data)
        with open(Path(self.checkpoint_path) / Path(self.train_file), 'w') as file:
            json.dump({"data": train_data}, file)
        with open(Path(self.checkpoint_path) / Path(self.val_file), 'w') as file:
            json.dump({"data": val_data}, file)
        return squad_data

    def load_squad_data(self, data, split=0.2):
        errors = 0
        flattened_data_train = []  #initialize empty lists for training and validation data
        flattened_data_val = []
        train_range = int(len(data['data']) * (1 - split)) #calculate split index
        for i, article in enumerate(data["data"]):
            title = article.get("title", "").strip()  #separate title of the article
            for paragraph in article["paragraphs"]:
                context = paragraph["context"].strip() #context of paragraph
                for qa in paragraph["qas"]:
                    question = qa["question"].strip() #separate the question
                    id_ = qa["id"]
                    answer_starts = [answer["answer_start"] for answer in qa["answers"]] #answer start position
                    answers = [answer["text"].strip() for answer in qa["answers"]] #answer text
                    flattened_data = {"title": title, #store all of the above in a dictionary for current record
                                      "context": context,
                                      "question": question,
                                      "id": id_,
                                      "answers": {
                                          "answer_start": answer_starts,
                                          "text": answers}
                                      }
                    if i <= train_range:
                        flattened_data_train.append(flattened_data) #repeat for all
                    else:
                        flattened_data_val.append(flattened_data)
        return flattened_data_train, flattened_data_val, errors
_ = LoadData(file_path, checkpoint_path)

In [5]:
#load data
from datasets import load_dataset
test_data = load_dataset('json', data_files=data_path+"test.json", field='data')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
def get_text(answer: list) -> str:
    '''
    Extract only the text from the answers.text column
    '''
    return answer[0]

def get_json_data(json_path: str) -> dict:
    """Get the json data in form of a dictionary
    """
    with open(json_path, 'r') as f:
        json_data = json.load(f)
    return json_data

test_dataframe = pd.json_normalize(get_json_data(data_path+"test.json"), record_path='data') #load data and normalize
test_dataframe["answers.text"] = test_dataframe["answers.text"].apply(get_text)

test_dataframe

Unnamed: 0,title,context,question,id,answers.answer_start,answers.text
0,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,56be4db0acb8001400a502ec,"[177, 177, 177]",Denver Broncos
1,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,56be4db0acb8001400a502ed,"[249, 249, 249]",Carolina Panthers
2,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,56be4db0acb8001400a502ee,"[403, 355, 355]","Santa Clara, California"
3,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,56be4db0acb8001400a502ef,"[177, 177, 177]",Denver Broncos
4,Super_Bowl_50,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...,56be4db0acb8001400a502f0,"[488, 488, 521]",gold
...,...,...,...,...,...,...
10565,Force,"The pound-force has a metric counterpart, less...",What is the metric term less used than the New...,5737aafd1c456719005744fb,"[82, 4, 82, 82, 78]",kilogram-force
10566,Force,"The pound-force has a metric counterpart, less...",What is the kilogram-force sometimes reffered ...,5737aafd1c456719005744fc,"[114, 114, 114, 114, 114]",kilopond
10567,Force,"The pound-force has a metric counterpart, less...",What is a very seldom used unit of mass in the...,5737aafd1c456719005744fd,"[274, 267, 267, 267, 263]",slug
10568,Force,"The pound-force has a metric counterpart, less...",What seldom used term of a unit of force equal...,5737aafd1c456719005744fe,"[712, 712, 712, 712, 712]",kip


In [7]:
'''
Tokenization steps using NLTK punkt in ML model are performed by huggingface AutoTokenizer here
'''
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer, DistilBertConfig

# Path to the directory containing the saved model files
model_path = data_path + "test-squad-trained"

# Load the configuration
config = DistilBertConfig.from_pretrained(model_path)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

# Load the model
model = DistilBertForQuestionAnswering.from_pretrained(model_path, config=config,ignore_mismatched_sizes=True)

Some weights of the model checkpoint at drive/My Drive/Colab Notebooks/Question-Answering-SQUAD-main/data/test-squad-trained were not used when initializing DistilBertForQuestionAnswering: ['qa_outputs_0.bias', 'LayerNorm.bias', 'qa_outputs_1.bias', 'qa_outputs_1.weight', 'qa_outputs_0.weight', 'LayerNorm.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at drive/My Drive/Colab Notebooks/Question-Answering-SQUAD-main/data/test-squad-trained and are

In [8]:
'''
https://discuss.huggingface.co/t/squad-bert-why-max-length-384-by-default-and-not-512/11693
'''
max_length = 384 #max length as per model capability
doc_stride = 128 #authorized overlap between two parts of the context when splitting it is needed.
pad_on_right = True #regular model with padding on right

In [9]:
import math
from transformers.modeling_outputs import QuestionAnsweringModelOutput
from transformers import DistilBertPreTrainedModel, DistilBertModel
from torch import nn
from torch.nn import CrossEntropyLoss

def gelu(x):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created.
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

def gelu_new(x):
    """
    Implementation of the GELU activation function currently in Google BERT repo.
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.distilbert = DistilBertModel(config)   #distilbert
        self.qa_outputs_0 = nn.Linear(config.dim, 512) #additional head
        self.qa_outputs_1 = nn.Linear(512, 32)
        self.qa_outputs = nn.Linear(32, config.num_labels)
        assert config.num_labels == 2
        self.dropout = nn.Dropout(config.qa_dropout) #dropout
        self.LayerNorm = nn.LayerNorm(normalized_shape=[384, 2]) #output
        self.init_weights()
    def forward(    #forward pass
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        distilbert_output = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = distilbert_output.last_hidden_state  #last hidden state
        hidden_states = self.dropout(hidden_states)   #dropout
        logits = gelu_new(self.qa_outputs_0(hidden_states))  #apply activation function
        logits = gelu_new(self.qa_outputs_1(logits))
        logits = self.qa_outputs(logits)
        logits = self.LayerNorm(logits) #layer normalization
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)  # (bs, max_query_len)
        end_logits = end_logits.squeeze(-1)  # (bs, max_query_len)
        total_loss = None
        if start_positions is not None and end_positions is not None: #calculate loss if start and end position are valid
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)
            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
        if not return_dict:
            output = (start_logits, end_logits) + distilbert_output[1:]
            return ((total_loss,) + output) if total_loss is not None else output #return output
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions
        )

In [10]:
#instantiate the model
from transformers import TrainingArguments, Trainer
if torch.cuda.is_available():
  model.cuda()

In [11]:
'''
If there is more than 1 prediction for a query then the query with higher similarity score is outputted as the prediction
'''
import collections
from tqdm import tqdm, tqdm_notebook
def postprocess_qa_predictions(examples: datasets.arrow_dataset.Dataset,
                               features: datasets.arrow_dataset.Dataset,
                               raw_predictions: tuple,
                               n_best_size: int = 20,
                               max_answer_length: int = 50) -> collections.OrderedDict:
    '''
    Function used to select the best answer from the raw predictions
    '''
    all_start_logits, all_end_logits = raw_predictions   #unpack

    example_id_to_index = {k: i for i, k in enumerate(examples["id"])} #map id to index
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features): #group features
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()  #dictionary to store final predictions
    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]
        valid_answers = []
        context = example["context"]  #get context
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]  #get start and end logits for current feature
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]  #get offset mapping for current feature
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index] #calculate feature null score
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() #indices of top-n start and end logits
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (    #invalid indices or null offset mappings check
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length: #invalid answer span or exceeding maximum answer length check
                        continue
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(  #list of valid answers
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0] #answer with highest score is best answer
        else:
            best_answer = {"text": "", "score": 0.0}    #if no valid answers then empty string
        predictions[example["id"]] = best_answer["text"]
    return predictions

In [12]:
def prepare_validation_features(examples: collections.OrderedDict or dict) -> transformers.tokenization_utils_base.BatchEncoding:
    '''
    To check a given span is inside the context (and not the question) and to get back the text inside.
    '''
    tokenized_examples = tokenizer(                         #tokenize
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples["offset_mapping"]
    tokenized_examples["start_positions"] = []  #lists to store start and end positions
    tokenized_examples["end_positions"] = []
    for i, offsets in enumerate(offset_mapping):
        # CLS index
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)  #CLS index
        sequence_ids = tokenized_examples.sequence_ids(i)    #sequence ids from tokenized examples
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        start_char = answers["answer_start"][0]       #start and end character positions from answers
        end_char = start_char + len(answers["text"][0])
        token_start_index = 0                         #token start and end indices
        while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
            token_start_index += 1
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
            token_end_index -= 1
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)
    tokenized_examples["example_id"] = []
    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])
        tokenized_examples["offset_mapping"][i] = [             #modify offset mappings based on context_index
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]
    return tokenized_examples

In [13]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
# Your existing code for loading data
test_features = test_data['train'].map(prepare_validation_features, batched=True, remove_columns=test_data['train'].column_names)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [14]:
args = TrainingArguments(
    output_dir='./results',
    label_names=["start_positions", "end_positions"]
)
trainer = Trainer(model, args)
# Get final predictions
with torch.no_grad():
    pred = trainer.predict(test_features)
# The Trainer hides the columns that are not used by the model (here example_id and offset_mapping which we will need for our post-processing), so we set them back
test_features.set_format(type=test_features.format["type"],
                          columns=list(test_features.features.keys()))

# To get the final predictions we can apply our post-processing function to our raw predictions
final_predictions = dict(postprocess_qa_predictions(test_data['train'], test_features, pred.predictions))

100%|██████████| 10570/10570 [00:35<00:00, 298.13it/s]


In [15]:
with open(data_path + "predictions.json", 'w') as file:
    json.dump(final_predictions, file)

In [16]:
formatted_predictions = {k: v for k, v in final_predictions.items()}
metric = datasets.load_metric("squad")
test_dataframe['prediction_text'] = list(formatted_predictions.values())

# Function to get the questions' head
def get_5w(question: str) -> str:
    """Extract only the "head" froma a question

    Args:
        question: the question.
    """
    return question.split()[0].strip().lower()
test_dataframe['question'] = test_dataframe['question'].apply(get_5w)

  metric = datasets.load_metric("squad")


Downloading builder script:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

In [17]:
total = test_dataframe.shape[0]
top_qheads = test_dataframe['question'].value_counts().sort_values(ascending=False)[0:25]
qheads = list(top_qheads.index)
count = top_qheads.to_list()

print("Label\t\tCount\t\tPercentage\n")
for i, j in enumerate(top_qheads):
  print("{: <15} {: <15} {:.1f}%".format(qheads[i],j,j/total*100))

Label		Count		Percentage

what            4749            44.9%
how             1090            10.3%
who             1059            10.0%
when            696             6.6%
which           454             4.3%
in              443             4.2%
where           433             4.1%
the             237             2.2%
why             151             1.4%
on              44              0.4%
to              43              0.4%
by              38              0.4%
along           36              0.3%
at              35              0.3%
whose           34              0.3%
if              32              0.3%
after           31              0.3%
besides         30              0.3%
a               29              0.3%
other           29              0.3%
for             27              0.3%
from            26              0.2%
approximately   26              0.2%
according       23              0.2%
during          22              0.2%


In [18]:
f1_list = []
em_list = []

for q in qheads:
  temp = test_dataframe
  temp['answers'] = test_data["train"]['answers']
  temp = test_dataframe[test_dataframe['question'].isin([q])]
  final_predictions = temp[['id','prediction_text']].to_dict('records')
  references = temp[['answers','id']].to_dict('records')
  metrics = metric.compute(predictions=final_predictions, references=references)
  em_list.append(metrics['exact_match'])
  f1_list.append(metrics['f1'])

In [19]:
def get_char_diff(row: pd.core.series.Series) -> int:
    '''
    Compute the difference, in terms of the number of different charecters
       between the real answer and the predicted one
    '''
    return sum(1 for a, b in zip(row["answers.text"], row.prediction_text) if a != b) + abs(len(row["answers.text"]) - len(row.prediction_text))

test_dataframe['difference'] = test_dataframe.apply(get_char_diff, axis=1)
test_dataframe

Unnamed: 0,title,context,question,id,answers.answer_start,answers.text,prediction_text,answers,difference
0,Super_Bowl_50,Super Bowl 50 was an American football game to...,which,56be4db0acb8001400a502ec,"[177, 177, 177]",Denver Broncos,to earn their third Super Bowl title,"{'answer_start': [177, 177, 177], 'text': ['De...",35
1,Super_Bowl_50,Super Bowl 50 was an American football game to...,which,56be4db0acb8001400a502ed,"[249, 249, 249]",Carolina Panthers,to earn their third Super Bowl title. The game...,"{'answer_start': [249, 249, 249], 'text': ['Ca...",102
2,Super_Bowl_50,Super Bowl 50 was an American football game to...,where,56be4db0acb8001400a502ee,"[403, 355, 355]","Santa Clara, California",to earn their third Super Bowl title. The game...,"{'answer_start': [403, 355, 355], 'text': ['Sa...",101
3,Super_Bowl_50,Super Bowl 50 was an American football game to...,which,56be4db0acb8001400a502ef,"[177, 177, 177]",Denver Broncos,to earn their third Super Bowl title. The game...,"{'answer_start': [177, 177, 177], 'text': ['De...",102
4,Super_Bowl_50,Super Bowl 50 was an American football game to...,what,56be4db0acb8001400a502f0,"[488, 488, 521]",gold,to earn their third Super Bowl title. The game...,"{'answer_start': [488, 488, 521], 'text': ['go...",102
...,...,...,...,...,...,...,...,...,...
10565,Force,"The pound-force has a metric counterpart, less...",what,5737aafd1c456719005744fb,"[82, 4, 82, 82, 78]",kilogram-force,", is","{'answer_start': [82, 4, 82, 82, 78], 'text': ...",14
10566,Force,"The pound-force has a metric counterpart, less...",what,5737aafd1c456719005744fc,"[114, 114, 114, 114, 114]",kilopond,"f) (sometimes kilopond), is","{'answer_start': [114, 114, 114, 114, 114], 't...",26
10567,Force,"The pound-force has a metric counterpart, less...",what,5737aafd1c456719005744fd,"[274, 267, 267, 267, 263]",slug,"f) (sometimes kilopond), is","{'answer_start': [274, 267, 267, 267, 263], 't...",27
10568,Force,"The pound-force has a metric counterpart, less...",what,5737aafd1c456719005744fe,"[712, 712, 712, 712, 712]",kip,"f) (sometimes kilopond), is","{'answer_start': [712, 712, 712, 712, 712], 't...",27


In [20]:
total = test_dataframe['difference'][test_dataframe['difference'] != 0].shape[0]
sorted_series = test_dataframe['difference'][test_dataframe['difference'] != 0].value_counts().sort_values(ascending=False)[0:25]
numbers = list(sorted_series.index)

print("# Diff.Chars\tCount\t\tPercentage\n")
for i, j in enumerate(sorted_series):
  print("{: <15} {: <15} {:.1f}%".format(numbers[i],j,j/total*100))

# Diff.Chars	Count		Percentage

19              122             1.2%
15              122             1.2%
12              121             1.1%
22              120             1.1%
16              120             1.1%
21              118             1.1%
11              118             1.1%
18              114             1.1%
13              111             1.1%
23              111             1.1%
17              110             1.0%
32              109             1.0%
25              107             1.0%
20              106             1.0%
10              105             1.0%
41              104             1.0%
30              104             1.0%
28              102             1.0%
27              99              0.9%
26              99              0.9%
14              98              0.9%
8               97              0.9%
29              95              0.9%
9               95              0.9%
40              94              0.9%


In [23]:
def postprocess_qa_5predictions(examples: datasets.arrow_dataset.Dataset,
                               features: datasets.arrow_dataset.Dataset,
                               raw_predictions: tuple,
                               n_best_size: int = 20,
                               max_answer_length: int = 50) -> collections.OrderedDict:
    '''
    Function used to select the best answer from the raw predictions
    '''
    all_start_logits, all_end_logits = raw_predictions  #unpack

    example_id_to_index = {k: i for i, k in enumerate(examples["id"])} #map id to index
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features): #group features
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict() #dictionary to store final predictions
    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]
        valid_answers = []
        context = example["context"] #get context
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]  #get start and end logits for the current feature
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue
                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": 0.0}
        predictions[example["id"]] = best_answer["text"]
    return predictions

In [24]:
# The Trainer hides the columns that are not used by the model (here example_id and offset_mapping which we will need for our post-processing), so we set them back
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))

# To get the final predictions we can apply our post-processing function to our raw predictions
final_predictions = postprocess_qa_5predictions(test_data['train'], test_features, pred.predictions)

formatted_5predictions = {k : v for k, v in final_predictions.items()}

# Hide again the columns that are not used by the model
test_features.set_format(type=test_features.format["type"], columns=['attention_mask', 'end_positions', 'input_ids', 'start_positions'])

100%|██████████| 10570/10570 [00:34<00:00, 310.35it/s]


In [38]:
count = 0
for id in test_dataframe['id']:
  if test_dataframe['answers.text'][test_dataframe.id == id].values[0] in formatted_5predictions[id]:
    count += 1

final_predictions = test_dataframe[['id','prediction_text']].to_dict('records')
references = test_dataframe[['answers','id']].to_dict('records')

metrics = metric.compute(predictions=final_predictions, references=references)
print(f"Exact Match:{metrics['exact_match']}, F1 :{metrics['f1']}")


Exact Match:75.31220435193946, F1 :83.93165328200715
