In [1]:
import json

def load_squad_data(file_path):
    # Open and load the JSON file into a dictionary
    with open(file_path, 'rb') as f:
        squad_data = json.load(f)

    # Create lists to store contexts, questions, and answers
    contexts = []
    questions = []
    answers = []

    # Traverse through each section of data in the SQuAD file
    for section in squad_data['data']:
        for paragraph in section['paragraphs']:
            context_text = paragraph['context']
            for qa_pair in paragraph['qas']:
                question_text = qa_pair['question']
                # Determine if 'answers' or 'plausible_answers' should be accessed
                answer_type = 'plausible_answers' if 'plausible_answers' in qa_pair else 'answers'
                for answer in qa_pair[answer_type]:
                    # Add the extracted data to their respective lists
                    contexts.append(context_text)
                    questions.append(question_text)
                    answers.append(answer)
    
    # Return the lists with collected contexts, questions, and answers
    return contexts, questions, answers

# Use the function to load training and validation datasets
train_contexts, train_questions, train_answers = load_squad_data('spoken_train-v1.1.json')
val_contexts, val_questions, val_answers = load_squad_data('spoken_test-v1.1.json')


In [2]:
def set_end_idx(answers, contexts):
    # Iterate through each pair of answer and context
    for answer, context in zip(answers, contexts):
        # Extract the expected answer text and its starting index
        target_text = answer['text']
        start_index = answer['answer_start']
        # Calculate the tentative end index
        end_index = start_index + len(target_text)

        # Check if the answer aligns perfectly within the context
        if context[start_index:end_index] == target_text:
            # If it matches, assign the end index
            answer['answer_end'] = end_index
        else:
            # Adjust for cases where the answer is slightly shifted
            for offset in [1, 2]:
                if context[start_index - offset:end_index - offset] == target_text:
                    answer['answer_start'] = start_index - offset
                    answer['answer_end'] = end_index - offset

# Apply the function to both the training and validation answer lists
set_end_idx(train_answers, train_contexts)
set_end_idx(val_answers, val_contexts)


In [3]:
from transformers import DistilBertTokenizerFast

# Set up the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the training and validation datasets
train_encodings = tokenizer(
    train_contexts,
    train_questions,
    truncation=True,
    padding=True
)
val_encodings = tokenizer(
    val_contexts,
    val_questions,
    truncation=True,
    padding=True
)




In [4]:
def map_token_positions(encodings, answers):
    # Create lists to store the token indices for start and end positions
    start_positions = []
    end_positions = []

    for idx in range(len(answers)):
        # Append start and end token positions using char_to_token method
        start_positions.append(encodings.char_to_token(idx, answers[idx]['answer_start']))
        end_positions.append(encodings.char_to_token(idx, answers[idx]['answer_end']))

        # Handle cases where the start position is None due to truncation
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length

        # Adjust end position in case char_to_token returns None
        offset = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(idx, answers[idx]['answer_end'] - offset)
            offset += 1

    # Update the encodings object to include the new token-based start and end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Use the function to process token positions for training and validation sets
map_token_positions(train_encodings, train_answers)
map_token_positions(val_encodings, val_answers)


In [5]:
import torch

class QADataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, index):
        # Retrieve each encoding component as a tensor for a given index
        return {key: torch.tensor(val[index]) for key, val in self.encodings.items()}

    def __len__(self):
        # Return the total number of samples in the dataset
        return len(self.encodings.input_ids)

# Create datasets for training and validation data
train_dataset = QADataset(train_encodings)
val_dataset = QADataset(val_encodings)


In [6]:
from transformers import DistilBertForQuestionAnswering

# Load the DistilBERT model for question answering
qa_model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from accelerate import Accelerator

# Determine the device (GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
qa_model.to(device)
qa_model.train()

# Initialize Adam optimizer with learning rate
optimizer = AdamW(qa_model.parameters(), lr=2e-6)

# Set up DataLoader for the training dataset
train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Initialize scheduler to adjust learning rate over training
total_training_steps = len(train_data_loader) * 30
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_training_steps
)

# Prepare the model, optimizer, and dataloader for multi-GPU/multi-CPU
accelerator = Accelerator()
model, optimizer, train_data_loader, lr_scheduler = accelerator.prepare(qa_model, optimizer, train_data_loader, lr_scheduler)

# Training loop for 5 epochs
for epoch in range(5):
    model.train()
    progress_bar = tqdm(train_data_loader, leave=True)

    # Iterate through batches
    for batch in progress_bar:
        # Zero out previous gradients
        optimizer.zero_grad()

        # Move batch data to the correct device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Perform a forward pass and calculate loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions, end_positions=end_positions)
        
        loss = outputs[0]

        # Perform backward pass and update parameters
        accelerator.backward(loss)
        optimizer.step()

        # Update the learning rate schedule
        lr_scheduler.step()

        # Update the progress bar with loss and learning rate
        progress_bar.set_description(f'Epoch {epoch}')
        progress_bar.set_postfix(loss=loss.item(), lr=optimizer.param_groups[0]['lr'])


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Epoch 0: 100%|██████████| 2320/2320 [06:58<00:00,  5.54it/s, loss=3.49, lr=1.93e-6]
Epoch 1: 100%|██████████| 2320/2320 [06:59<00:00,  5.52it/s, loss=1.64, lr=1.87e-6]
Epoch 2: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=2.14, lr=1.8e-6]  
Epoch 3: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=2.95, lr=1.73e-6] 
Epoch 4: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=2.6, lr=1.67e-6]  


In [17]:
import os
if not os.path.exists('models'):
   os.makedirs('models')
model_path = 'models/distilbert-custom'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('models/distilbert-custom/tokenizer_config.json',
 'models/distilbert-custom/special_tokens_map.json',
 'models/distilbert-custom/vocab.txt',
 'models/distilbert-custom/added_tokens.json',
 'models/distilbert-custom/tokenizer.json')

In [18]:
from transformers import DistilBertForQuestionAnswering

# Load the pre-trained model from the specified directory
qa_model = DistilBertForQuestionAnswering.from_pretrained("models/HW3 Dataset for BERT Base Model")

# Move the model to the appropriate device (GPU or CPU)
qa_model.to(device)


  return torch.load(checkpoint_file, map_location=map_location)


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [19]:
# Set the model to evaluation mode
model.eval()

# Initialize data loader for validation data
val_loader = DataLoader(val_dataset, batch_size=16)

# Initialize list to store accuracy scores
accuracy_scores = []

# Set up the progress bar loop
loop = tqdm(val_loader)

# Initialize lists for storing predictions and references
predicted_answers = []
ground_truth_answers = []

# Iterate through the validation dataset
for batch in loop:
    # No need for gradients since we're not training
    with torch.no_grad():
        # Extract the required tensors from the batch
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        
        # Generate predictions from the model
        outputs = model(input_ids, attention_mask=attention_mask)
        
        # Get the start and end position predictions
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        
        # Calculate accuracy for both start and end positions
        accuracy_scores.append(((start_pred == start_true).sum() / len(start_pred)).item())
        accuracy_scores.append(((end_pred == end_true).sum() / len(end_pred)).item())
        
        # For each batch element, get the predicted and reference answers
        for i in range(start_pred.shape[0]):
            all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
            predicted_answer = ' '.join(all_tokens[start_pred[i]: end_pred[i] + 1])
            reference_answer = ' '.join(all_tokens[start_true[i]: end_true[i] + 1])
            
            # Decode predicted answer tokens back to a string
            predicted_answer_ids = tokenizer.convert_tokens_to_ids(predicted_answer.split())
            decoded_answer = tokenizer.decode(predicted_answer_ids)
            
            # Append both the predicted and reference answers to their respective lists
            predicted_answers.append(decoded_answer)
            ground_truth_answers.append(reference_answer)

# Compute the average accuracy


100%|██████████| 993/993 [01:12<00:00, 13.77it/s]


In [20]:
from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def process_answer(s):
    """Normalize text by converting to lowercase, removing punctuation, articles, and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def fix_whitespace(text):
        return ' '.join(text.split())

    def eliminate_punctuation(text):
        punctuation_set = set(string.punctuation)
        return ''.join(char for char in text if char not in punctuation_set)

    def to_lowercase(text):
        return text.lower()

    return fix_whitespace(remove_articles(eliminate_punctuation(to_lowercase(s))))


def calculate_exact_match(prediction, ground_truth):
    return (process_answer(prediction) == process_answer(ground_truth))


def get_max_metric_for_truths(metric_fn, prediction, ground_truths):
    scores = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores.append(score)
    if len(scores) == 0:
        return 0
    return max(scores)


def calculate_f1(prediction, ground_truth):
    pred_tokens = process_answer(prediction).split()
    gt_tokens = process_answer(ground_truth).split()
    common_tokens = Counter(pred_tokens) & Counter(gt_tokens)
    num_common = sum(common_tokens.values())
    if num_common == 0:
        return 0
    precision = num_common / len(pred_tokens)
    recall = num_common / len(gt_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def evaluate(gold_answers, predicted_answers):
    total_f1 = total_exact_match = num_examples = 0

    for ground_truths, prediction in zip(gold_answers, predicted_answers):
        num_examples += 1
        total_exact_match += get_max_metric_for_truths(calculate_exact_match, prediction, ground_truths)
        total_f1 += get_max_metric_for_truths(calculate_f1, prediction, [ground_truths])

    exact_match_percentage = 100.0 * total_exact_match / num_examples
    f1_percentage = 100.0 * total_f1 / num_examples

    return {'f1': f1_percentage}


In [22]:
evaluate(ground_truth_answers, predicted_answers)


{'f1': 47.93227405210848}