In [1]:
from __future__ import print_function
import json
import os
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
from collections import Counter
import string
import re
import argparse
import sys



def read_dataset(path):
    """Load a SQuAD-style JSON file into parallel context/question/answer lists.

    Only the first answer of every question is used. Each returned answer
    dict is the object loaded from the file, annotated in place with an
    ``answer_end`` key (exclusive character offset into the context).

    Some annotations are shifted by one or two characters; when the answer
    text is not found at the annotated offset, the offsets one and two
    characters to the left are tried and ``answer_start`` is corrected if
    one of them matches.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists;
        the context is repeated for every question asked about it.
    """
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                # Questions without any answer cannot be turned into a span
                # label, so they are skipped instead of raising IndexError.
                if not qa['answers']:
                    continue
                answer = qa['answers'][0]
                answer_text = answer['text']
                start_idx = answer['answer_start']
                end_idx = start_idx + len(answer_text)

                # Always provide 'answer_end', even when no offset matches,
                # so downstream code never hits a missing key.
                answer['answer_end'] = end_idx
                if context[start_idx:end_idx] != answer_text:
                    for n in (1, 2):
                        if start_idx - n < 0:
                            # A negative start would wrap around the string.
                            break
                        if context[start_idx - n:end_idx - n] == answer_text:
                            answer['answer_start'] = start_idx - n
                            answer['answer_end'] = end_idx - n
                            # Prefer the smallest shift that matches.
                            break

                contexts.append(context)
                questions.append(qa['question'])
                answers.append(answer)

    return contexts, questions, answers

# Load the train and test splits from JSON files in the working directory
# (the file names suggest Spoken-SQuAD v1.1 -- confirm).
train_contexts, train_questions, train_answers = read_dataset('spoken_train-v1.1.json')
test_contexts, test_questions, test_answers = read_dataset('spoken_test-v1.1.json')


# Tokenize (context, question) pairs with the tokenizer of the checkpoint that
# is loaded as the model further down. The context is passed first and the
# question second.
# NOTE(review): padding=True on a whole split presumably pads every example to
# the longest sequence in that split, which increases memory use per batch --
# confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained("rein5/bert-base-uncased-finetuned-spoken-squad")
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True)


# Encode and add token positions
def encode_token_positions(encodings, answers):
    """Add ``start_positions`` / ``end_positions`` token labels to ``encodings``.

    Args:
        encodings: Tokenizer output for the (context, question) pairs. It must
            provide ``char_to_token(batch_index, char_index)`` and ``update``.
            It is modified in place.
        answers: One dict per example with the character offsets
            ``answer_start`` (inclusive) and ``answer_end`` (exclusive) of the
            answer inside the context.

    The end label is the token holding the *last* character of the answer.
    ``answer_end`` itself points one character past the answer, so looking it
    up directly would select the following token whenever the answer is
    immediately followed by a non-space character.

    A position that cannot be mapped to a token is labelled with the
    module-level ``tokenizer.model_max_length``.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        answer_start = answer['answer_start']
        answer_end = answer['answer_end']

        start_token = encodings.char_to_token(i, answer_start)
        if start_token is None:
            start_token = tokenizer.model_max_length

        # Walk backwards from the last character of the answer until a
        # character that belongs to a token is found. The search never leaves
        # the answer span, so it always terminates and never probes negative
        # offsets.
        first_char = max(answer_start, 0)
        last_char = max(answer_end - 1, first_char)
        end_token = None
        for char_idx in range(last_char, first_char - 1, -1):
            end_token = encodings.char_to_token(i, char_idx)
            if end_token is not None:
                break
        if end_token is None:
            # No character of the answer maps to a token of this example.
            end_token = tokenizer.model_max_length

        start_positions.append(start_token)
        end_positions.append(end_token)
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Attach start/end token labels to both splits; the encodings are updated in place.
encode_token_positions(train_encodings, train_answers)
encode_token_positions(test_encodings, test_answers)


class ProcessDataset(torch.utils.data.Dataset):
    """Map-style dataset exposing column-wise encodings as per-example tensors.

    ``encodings`` is any mapping from field name to a sequence with one entry
    per example; it must contain an ``'input_ids'`` field, which defines the
    dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples."""
        # Key access (rather than attribute access) works for plain dicts as
        # well as for tokenizer outputs.
        return len(self.encodings['input_ids'])

# Wrap the encodings (which by now carry the start/end labels) as torch datasets.
train_dataset = ProcessDataset(train_encodings)
test_dataset = ProcessDataset(test_encodings)


# Load the question-answering checkpoint that matches the tokenizer and move it
# to the GPU when one is available, otherwise keep it on the CPU.
model = AutoModelForQuestionAnswering.from_pretrained("rein5/bert-base-uncased-finetuned-spoken-squad")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)



# Training of model
def train_model(train_loader, optimizer, max_epochs):
    """Fine-tune the module-level ``model`` on ``train_loader``.

    Args:
        train_loader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask``, ``start_positions`` and
            ``end_positions`` tensors.
        optimizer: Optimizer that updates the model parameters.
        max_epochs: Number of passes over ``train_loader``.
    """
    field_names = ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
    model.train()
    for epoch in range(max_epochs):
        progress = tqdm(train_loader, leave=True)
        for batch in progress:
            optimizer.zero_grad()
            # Move the four fields the model consumes onto the target device.
            ids, mask, starts, ends = (batch[name].to(device) for name in field_names)
            result = model(ids, attention_mask=mask, start_positions=starts, end_positions=ends)
            # The first output is the loss because labels were supplied.
            batch_loss = result[0]
            batch_loss.backward()
            optimizer.step()
            progress.set_description(f'Epoch {epoch}')
            progress.set_postfix(loss=batch_loss.item())

# Fine-tune for a single epoch over shuffled batches of 16 examples.
# NOTE(review): the output recorded for this cell stops at 0/2320 batches with
# "CUDA out of memory" on a 7.92 GiB GPU; a smaller batch_size or shorter
# padded sequences are presumably needed on such hardware -- confirm.
optimizer = AdamW(model.parameters(), lr=2e-6)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
train_model(train_loader, optimizer, max_epochs=1)



# Testing of model
def test_model(test_loader):
    """Predict an answer span for every example of ``test_loader``.

    Uses the module-level ``model``, ``device`` and ``tokenizer``.

    Args:
        test_loader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask``, ``start_positions`` and
            ``end_positions`` tensors.

    Returns:
        ``(answers, references)``: two equally long lists of strings. Both
        are produced by decoding a slice of the input token ids -- the
        predicted span for ``answers`` and the labelled span for
        ``references`` -- so the two sides are directly comparable.
    """
    model.eval()
    answers = []
    references = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)

            # Convert to plain Python ints once per batch instead of indexing
            # device tensors element by element.
            predicted_starts = torch.argmax(outputs['start_logits'], dim=1).tolist()
            predicted_ends = torch.argmax(outputs['end_logits'], dim=1).tolist()
            actual_starts = batch['start_positions'].tolist()
            actual_ends = batch['end_positions'].tolist()
            token_ids = batch['input_ids'].tolist()

            rows = zip(token_ids, predicted_starts, predicted_ends, actual_starts, actual_ends)
            for ids, pred_start, pred_end, gold_start, gold_end in rows:
                # End indices are inclusive; an end before its start (or a
                # start beyond the sequence) yields an empty string.
                answers.append(tokenizer.decode(ids[pred_start:pred_end + 1]))
                references.append(tokenizer.decode(ids[gold_start:gold_end + 1]))
    return answers, references

# Run inference over the test split (no shuffle argument is passed) and collect
# the predicted and the reference answer strings.
test_loader = DataLoader(test_dataset, batch_size=16)
answers, references = test_model(test_loader)


def normalize_answer(s):
    """Return ``s`` lowercased, without ASCII punctuation or articles.

    The steps run in this order: lowercase, delete every character of
    ``string.punctuation``, replace the standalone words ``a``/``an``/``the``
    with a space, then collapse all whitespace runs to single spaces.
    """
    lowered = s.lower()
    punctuation_free = lowered.translate(str.maketrans('', '', string.punctuation))
    article_free = re.sub(r'\b(a|an|the)\b', ' ', punctuation_free)
    return ' '.join(article_free.split())


def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Return the best ``metric_fn(prediction, truth)`` over ``ground_truths``.

    Returns 0 when ``ground_truths`` is empty.
    """
    candidate_scores = [metric_fn(prediction, truth) for truth in ground_truths]
    if not candidate_scores:
        return 0
    return max(candidate_scores)


def f1_score(prediction, ground_truth):
    """Return the token-level F1 between two answers after normalization.

    Tokens are the whitespace-separated words of ``normalize_answer``'s
    output; overlap is counted as a multiset intersection. Returns 0 when the
    two answers share no token.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()
    shared = Counter(predicted) & Counter(expected)
    overlap = sum(shared.values())
    if overlap == 0:
        return 0
    precision = float(overlap) / len(predicted)
    recall = float(overlap) / len(expected)
    return (2 * precision * recall) / (precision + recall)


def evaluate(gold_answers, predictions):
    """Score predictions against gold answers with exact match and F1.

    Args:
        gold_answers: One entry per example: either a single reference string
            or an iterable of acceptable reference strings.
        predictions: One predicted answer string per example, in the same
            order as ``gold_answers``.

    Returns:
        ``{'exact_match': ..., 'f1': ...}`` with both values as percentages
        averaged over the examples; both are 0.0 when there are no examples.
    """
    exact_match_total = 0.0
    f1_total = 0.0
    total = 0
    for ground_truths, prediction in zip(gold_answers, predictions):
        # A bare string must be wrapped: iterating it directly would compare
        # the prediction against each single character of the reference.
        if isinstance(ground_truths, str):
            ground_truths = [ground_truths]
        total += 1
        exact_match_total += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
        f1_total += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return {'exact_match': 0.0, 'f1': 0.0}
    return {
        'exact_match': 100.0 * exact_match_total / total,
        'f1': 100.0 * f1_total / total,
    }


# Score the predicted answers against the reference answers.
evaluate(references,answers)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/2320 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 7.92 GiB total capacity; 7.05 GiB already allocated; 93.25 MiB free; 7.09 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF