In [40]:
import json 
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertTokenizerFast, AdamW
from torch.utils.data import Dataset, DataLoader
from evaluate import load
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [41]:
import json

def load_squad_data(filepath):
    # Load the JSON file into a dictionary
    with open(filepath, 'rb') as file:
        squad_data = json.load(file)

    # Prepare lists to store contexts, questions, and answers
    contexts, questions, answers = [], [], []

    # Iterate through each data entry in the SQuAD dataset
    for group in squad_data['data']:
        for paragraph in group['paragraphs']:
            context_text = paragraph['context']
            for qa_pair in paragraph['qas']:
                question_text = qa_pair['question']
                
                # Determine if 'plausible_answers' or 'answers' should be used
                answer_key = 'plausible_answers' if 'plausible_answers' in qa_pair else 'answers'
                
                # Append context, question, and each answer to the respective lists
                for answer in qa_pair[answer_key]:
                    contexts.append(context_text)
                    questions.append(question_text)
                    answers.append(answer)
    
    # Return the lists of contexts, questions, and answers
    return contexts, questions, answers

# Load the training and validation datasets
train_contexts, train_questions, train_answers = load_squad_data('spoken_train-v1.1.json')
val_contexts, val_questions, val_answers = load_squad_data('spoken_test-v1.1.json')


In [42]:
def adjust_end_index(answers, contexts):
    # Iterate over each answer-context pair
    for answer, context in zip(answers, contexts):
        # Get the expected answer text from the context
        expected_text = answer['text']
        # Capture the provided start index of the answer
        start_idx = answer['answer_start']
        # Calculate the initial end index
        end_idx = start_idx + len(expected_text)

        # Check if the context substring matches the answer exactly
        if context[start_idx:end_idx] == expected_text:
            # If it matches, set the end index directly
            answer['answer_end'] = end_idx
        else:
            # Adjust in case the answer position is slightly off
            for offset in [1, 2]:
                if context[start_idx - offset:end_idx - offset] == expected_text:
                    answer['answer_start'] = start_idx - offset
                    answer['answer_end'] = end_idx - offset

# Apply the function to both training and validation answer lists
adjust_end_index(train_answers, train_contexts)
adjust_end_index(val_answers, val_contexts)


In [43]:
from transformers import DistilBertTokenizerFast

# Initialize the tokenizer using the pre-trained DistilBERT model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the training and validation sets with truncation and padding
train_encodings = tokenizer(
    train_contexts, train_questions, padding=True, truncation=True
)
val_encodings = tokenizer(
    val_contexts, val_questions, padding=True, truncation=True
)


In [44]:
def map_token_positions(encodings, answers):
    # Lists to store token start and end indices for each answer
    start_positions, end_positions = [], []

    # Iterate through each answer to determine token-based start and end indices
    for idx in range(len(answers)):
        # Use char_to_token method to get token positions of answer start and end
        start_positions.append(encodings.char_to_token(idx, answers[idx]['answer_start']))
        end_positions.append(encodings.char_to_token(idx, answers[idx]['answer_end']))

        # If the start position is None, it indicates the answer context was truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length

        # Adjust end position if None by shifting until a valid token position is found
        shift_offset = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(idx, answers[idx]['answer_end'] - shift_offset)
            shift_offset += 1

    # Update the encodings object with calculated start and end token positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Apply the function to training and validation encodings
map_token_positions(train_encodings, train_answers)
map_token_positions(val_encodings, val_answers)


In [45]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # Retrieve each encoding entry and convert it to a tensor
        return {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}

    def __len__(self):
        # Return the length of the dataset based on input IDs
        return len(self.encodings.input_ids)

# Create dataset objects for the training and validation sets
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)


In [46]:
from transformers import DistilBertForQuestionAnswering

# Load the pre-trained DistilBERT model for question-answering tasks
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# Determine the computing device: GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the selected device
model.to(device)

# Enable training mode for the model
model.train()

# Set up AdamW optimizer with weight decay to mitigate overfitting
optimizer = AdamW(model.parameters(), lr=2e-6)

# Create data loader for training with a batch size of 16 and shuffling
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Training loop for 5 epochs
for epoch in range(5):
    model.train()
    # Initialize the progress bar with tqdm
    progress_bar = tqdm(train_loader, leave=True)
    
    for batch in progress_bar:
        # Zero out any previously computed gradients
        optimizer.zero_grad()

        # Retrieve batch tensors and move them to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # Forward pass through the model to get outputs, including loss
        outputs = model(input_ids, 
                        attention_mask=attention_mask, 
                        start_positions=start_positions, 
                        end_positions=end_positions)

        # Extract the loss from the model's output
        loss = outputs.loss

        # Backpropagate the loss to compute gradients
        loss.backward()

        # Apply optimizer step to update model parameters
        optimizer.step()

        # Update progress bar with current loss
        progress_bar.set_description(f'Epoch {epoch}')
        progress_bar.set_postfix(loss=loss.item())


Epoch 0: 100%|██████████| 2320/2320 [06:57<00:00,  5.55it/s, loss=2.87]
Epoch 1: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=1.64]
Epoch 2: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=1.69]
Epoch 3: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=2.5]  
Epoch 4: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=1.08] 


In [48]:
import os

# Create a directory in your home directory
home_dir = os.path.expanduser("~")
model_dir = os.path.join(home_dir, "models")

if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save the model
model_path = os.path.join(model_dir, "distilbert-custom")
model.save_pretrained(model_path)


In [49]:
model = DistilBertForQuestionAnswering.from_pretrained("models/HW3 Dataset for BERT Base Model")
model.to(device)

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [50]:
# Switch model to evaluation mode
model.eval()

# Set up data loader for the validation set with a batch size of 16
val_loader = DataLoader(val_dataset, batch_size=16)

# Initialize list to store accuracy values
accuracy_scores = []

# Initialize progress bar loop
progress_bar = tqdm(val_loader)

# Lists to store predicted answers and reference answers
predicted_answers = []
reference_answers = []

# Iterate through each batch in the validation set
for batch in progress_bar:
    # Disable gradient calculation for evaluation
    with torch.no_grad():
        # Transfer batch data to the specified device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        true_start_positions = batch['start_positions'].to(device)
        true_end_positions = batch['end_positions'].to(device)

        # Get model predictions
        output = model(input_ids, attention_mask=attention_mask)

        # Extract start and end predictions using argmax
        predicted_start = torch.argmax(output['start_logits'], dim=1)
        predicted_end = torch.argmax(output['end_logits'], dim=1)

        # Calculate accuracy for both start and end predictions, add to list
        accuracy_scores.append((predicted_start == true_start_positions).sum().item() / len(predicted_start))
        accuracy_scores.append((predicted_end == true_end_positions).sum().item() / len(predicted_end))

        # Process predictions and references for each item in the batch
        for idx in range(predicted_start.shape[0]):
            tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][idx])
            pred_answer = ' '.join(tokens[predicted_start[idx]:predicted_end[idx] + 1])
            ref_answer = ' '.join(tokens[true_start_positions[idx]:true_end_positions[idx] + 1])
            # Decode the predicted answer to text
            predicted_answer_ids = tokenizer.convert_tokens_to_ids(pred_answer.split())
            predicted_answer = tokenizer.decode(predicted_answer_ids)
            predicted_answers.append(predicted_answer)
            reference_answers.append(ref_answer)

# Calculate the average accuracy over all batches


100%|██████████| 993/993 [01:12<00:00, 13.67it/s]


In [51]:
from __future__ import print_function
from collections import Counter
import string
import re
import json
import sys

def normalize_text(text):
    """Convert text to lowercase, remove punctuation, articles, and extra whitespace."""
    
    def remove_articles(input_text):
        return re.sub(r'\b(a|an|the)\b', ' ', input_text)

    def remove_punctuation(input_text):
        punctuation_set = set(string.punctuation)
        return ''.join(char for char in input_text if char not in punctuation_set)

    def fix_whitespace(input_text):
        return ' '.join(input_text.split())

    def to_lowercase(input_text):
        return input_text.lower()

    # Apply all transformations to the text
    return fix_whitespace(remove_articles(remove_punctuation(to_lowercase(text))))

def compute_exact_match(prediction, reference):
    return normalize_text(prediction) == normalize_text(reference)

def compute_f1_score(prediction, reference):
    prediction_tokens = normalize_text(prediction).split()
    reference_tokens = normalize_text(reference).split()
    common_tokens = Counter(prediction_tokens) & Counter(reference_tokens)
    num_common = sum(common_tokens.values())
    
    if num_common == 0:
        return 0.0
    
    precision = num_common / len(prediction_tokens)
    recall = num_common / len(reference_tokens)
    return (2 * precision * recall) / (precision + recall)

def best_score_over_references(metric_fn, prediction, references):
    scores = [metric_fn(prediction, ref) for ref in references]
    return max(scores) if scores else 0

def evaluate_metrics(true_answers, model_predictions):
    f1_total, exact_match_total, count = 0, 0, 0
    
    for ref_answers, pred_answer in zip(true_answers, model_predictions):
        count += 1
        exact_match_total += best_score_over_references(compute_exact_match, pred_answer, ref_answers)
        f1_total += best_score_over_references(compute_f1_score, pred_answer, [ref_answers])
    
    exact_match_avg = 100.0 * exact_match_total / count
    f1_avg = 100.0 * f1_total / count

    return {'f1': f1_avg, 'exact_match': exact_match_avg}


In [52]:
# Call the evaluate function with references and answers as inputs
evaluate_metrics(reference_answers, predicted_answers)


{'f1': 52.887876274954415, 'exact_match': 7.332283464566929}