In [1]:
import json
import torch
import string
import re
import os
from tqdm import tqdm
from collections import Counter
from torch.utils.data import DataLoader
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, AdamW

2024-11-08 20:00:39.443455: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-08 20:00:39.461506: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-08 20:00:39.466975: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 20:00:39.480548: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Define functions for data preprocessing and answer positioning

def preprocess_json(path):
    """Loads a JSON file and extracts contexts, questions, and answers, adjusting answer indices if needed."""
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    
    contexts, questions, answers = [], [], []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                access = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
                for answer in qa[access]:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        
        # Adjust answer indices if necessary
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        else:
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
    return contexts, questions, answers


def add_token_positions(encodings, answers):
    """Adds token-based start and end positions to encodings, handling cases where answers are truncated."""
    start_positions, end_positions = [], []
    for i in range(len(answers)):
        start = encodings.char_to_token(i, answers[i]['answer_start'])
        end = encodings.char_to_token(i, answers[i]['answer_end'])
        
        if start is None:
            start = tokenizer.model_max_length
        shift = 1
        while end is None:
            end = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1
        start_positions.append(start)
        end_positions.append(end)
    
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})




In [3]:
# Load and preprocess data
train_contexts, train_questions, train_answers = preprocess_json('squad/spoken_test-v1.1.json')
val_contexts, val_questions, val_answers = preprocess_json('squad/spoken_train-v1.1.json')

# Initialize model and tokenizer
model_name = "rein5/bert-base-uncased-finetuned-spoken-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and add positions for answers
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)


  torch.utils._pytree._register_pytree_node(


In [4]:
# Create Dataset and DataLoader classes
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

In [5]:
# Build datasets and dataloaders
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Set up device and optimizer
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-6)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [6]:
# Training Loop
for epoch in range(3):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 993/993 [07:54<00:00,  2.09it/s, loss=2.09]
Epoch 1: 100%|██████████| 993/993 [07:54<00:00,  2.09it/s, loss=1.95] 
Epoch 2: 100%|██████████| 993/993 [07:54<00:00,  2.09it/s, loss=1.37] 


In [7]:
# Save the fine-tuned model
os.makedirs('models', exist_ok=True)
model_path = 'models/distilbert-custom2'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('models/distilbert-custom2/tokenizer_config.json',
 'models/distilbert-custom2/special_tokens_map.json',
 'models/distilbert-custom2/vocab.txt',
 'models/distilbert-custom2/added_tokens.json',
 'models/distilbert-custom2/tokenizer.json')

In [13]:
# Evaluation functions
def normalize_answer(s):
    """Lower text and remove punctuation, articles, and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)

def evaluate(gold_answers, predictions):
    total, f1, exact_match = 0, 0, 0
    for ground_truths, prediction in zip(gold_answers, predictions):
        total += 1
        exact_match += exact_match_score(prediction, ground_truths)
        f1 += f1_score(prediction, ground_truths)
    return {'exact_match': 100 * exact_match / total, 'f1': 100 * f1 / total}


In [14]:
# Validation Loop
model.eval()
answers, references = [], []
with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        
        for i in range(start_pred.shape[0]):
            all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
            answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(all_tokens[start_pred[i]: end_pred[i]+1]))
            ref = tokenizer.decode(tokenizer.convert_tokens_to_ids(all_tokens[start_true[i]: end_true[i]+1]))
            answers.append(answer)
            references.append(ref)

# Evaluate performance
metrics = evaluate(references, answers)
print(metrics)

100%|██████████| 2320/2320 [06:41<00:00,  5.77it/s]


{'exact_match': 69.82296354180701, 'f1': 77.08683807250743}
