In [6]:
import torch
import numpy as np
import json
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizer, 
    DistilBertForQuestionAnswering,
    AdamW,
    get_linear_schedule_with_warmup
)
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import warnings

In [7]:
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=384):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = self.preprocess_data(data)
    
    def preprocess_data(self, data):
        examples = []
        for entry in data['data']:
            for paragraph in entry['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    if not qa['is_impossible']:
                        answer = qa['answers'][0]
                        # Calculate answer end position properly
                        answer_start = answer['answer_start']
                        answer_text = answer['text']
                        answer_end = answer_start + len(answer_text)
                        
                        examples.append({
                            'context': context,
                            'question': qa['question'],
                            'answer_text': answer_text,
                            'answer_start': answer_start,
                            'answer_end': answer_end
                        })
        return examples[:10000]
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        example = self.examples[idx]
        
        # Tokenize question and context together
        encoding = self.tokenizer(
            example['question'],
            example['context'],
            max_length=self.max_length,
            truncation='only_second',  # Only truncate the context, not the question
            stride=128,  # Add sliding window for long contexts
            padding='max_length',
            return_tensors='pt',
            return_offsets_mapping=True,  # Get character mappings
            return_overflowing_tokens=True  # Handle long sequences properly
        )
        
        # Convert char positions to token positions
        offset_mapping = encoding.pop('offset_mapping').squeeze()
        
        # Find the token positions that correspond to the answer
        start_positions = torch.tensor([0])  # Default to 0 if answer not found
        end_positions = torch.tensor([0])
        
        for idx, (start, end) in enumerate(offset_mapping):
            if start <= example['answer_start'] <= end:
                start_positions = torch.tensor([idx])
            if start <= example['answer_end'] <= end:
                end_positions = torch.tensor([idx])
                break
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'start_positions': start_positions,
            'end_positions': end_positions
        }


In [8]:
class ModelTrainer:
    def __init__(self, model_name='distilbert-base-uncased', device='cuda'):
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        self.model = DistilBertForQuestionAnswering.from_pretrained(
            model_name,
            return_dict=True
        ).to(self.device)
        
        # Suppress specific warnings
        warnings.filterwarnings('ignore', category=FutureWarning)
    
    def train(self, train_dataloader, eval_dataloader, epochs=3, lr=2e-5):
        # Use PyTorch's native AdamW implementation
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr, weight_decay=0.01)
        
        total_steps = len(train_dataloader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=total_steps // 10,  # 10% warmup
            num_training_steps=total_steps
        )
        
        training_stats = []
        best_eval_loss = float('inf')
        
        for epoch in range(epochs):
            print(f'\nEpoch {epoch + 1}/{epochs}')
            self.model.train()
            total_loss = 0
            
            progress_bar = tqdm(train_dataloader, desc=f'Training')
            for batch in progress_bar:
                # Move batch to device
                batch = {k: v.to(self.device) for k, v in batch.items()}
                
                # Forward pass
                outputs = self.model(**batch)
                loss = outputs.loss
                
                # Backward pass with gradient clipping
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                
                total_loss += loss.item()
                progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
            
            avg_train_loss = total_loss / len(train_dataloader)
            eval_loss = self.evaluate(eval_dataloader)
            
            # Save best model
            if eval_loss < best_eval_loss:
                best_eval_loss = eval_loss
                torch.save(self.model.state_dict(), 'best_model.pt')
            
            training_stats.append({
                'epoch': epoch + 1,
                'training_loss': avg_train_loss,
                'eval_loss': eval_loss,
                'learning_rate': scheduler.get_last_lr()[0]
            })
            
            print(f'Average training loss: {avg_train_loss:.4f}')
            print(f'Evaluation loss: {eval_loss:.4f}')
        
        return training_stats
    
    def evaluate(self, eval_dataloader):
        self.model.eval()
        total_eval_loss = 0
        
        with torch.no_grad():
            for batch in tqdm(eval_dataloader, desc='Evaluating'):
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch)
                total_eval_loss += outputs.loss.item()
        
        return total_eval_loss / len(eval_dataloader)


In [4]:
def plot_attention_heatmap(attention_weights, layer_idx=0, head_idx=0):
    """Plot attention heatmap for a specific layer and head"""
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        attention_weights[layer_idx][0, head_idx].cpu().numpy(),
        cmap='Blues'
    )
    plt.title(f'Attention Heatmap - Layer {layer_idx}, Head {head_idx}')
    plt.xlabel('Key tokens')
    plt.ylabel('Query tokens')
    plt.show()


In [2]:
def main():
    # Set random seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)
    
    # Load data
    with open('train-v2.0.json', 'r') as f:
        train_data = json.load(f)
    with open('dev-v2.0.json', 'r') as f:
        eval_data = json.load(f)
    
    # Initialize trainer
    trainer = ModelTrainer()
    
    # Create datasets with larger batch size and num_workers
    train_dataset = QADataset(train_data, trainer.tokenizer)
    eval_dataset = QADataset(eval_data, trainer.tokenizer)
    
    train_dataloader = DataLoader(
        train_dataset, 
        batch_size=32,  # Increased batch size
        shuffle=True,
        num_workers=4,  # Parallel data loading
        pin_memory=True  # Faster data transfer to GPU
    )
    
    eval_dataloader = DataLoader(
        eval_dataset, 
        batch_size=32,
        num_workers=4,
        pin_memory=True
    )
    
    # Train model
    training_stats = trainer.train(train_dataloader, eval_dataloader)
    
    # Plot training results
    stats_df = pd.DataFrame(training_stats)
    plt.figure(figsize=(10, 6))
    plt.plot(stats_df['epoch'], stats_df['training_loss'], label='Training Loss')
    plt.plot(stats_df['epoch'], stats_df['eval_loss'], label='Evaluation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Evaluation Loss')
    plt.legend()
    plt.show()

if __name__ == "__main__":
    main()


Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 371/371 [02:23<00:00,  2.58it/s] 
  with amp.autocast(enabled=torch.cuda.is_available()):
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 371/371 [02:23<00:00,  2.58it/s] 
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 371/371 [02:23<00:00,  2.58it/s] 
Average training loss: 3.3860
Average training loss: 3.3860
Evaluation loss: 2.3685
Evaluation loss: 2.3685

Epoch 2/3
Epoch 2/3
Training: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [22:28<00:00,  2.16s/it, loss=1.8109] 
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 371/371 [02:11<00:00,  2.82it/s] 
Aver

In [1]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering

class QAInference:
    def __init__(self, model_path='results/best_model.pt', model_name='distilbert-base-uncased'):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.model = DistilBertForQuestionAnswering.from_pretrained(model_name).to(self.device)
        
        # Load the trained model weights
        checkpoint = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

    @torch.no_grad()
    def get_answer(self, question, context, max_length=384):
        # Tokenize input
        inputs = self.tokenizer(
            question,
            context,
            max_length=max_length,
            truncation='only_second',
            stride=128,
            padding='max_length',
            return_tensors='pt',
            return_offsets_mapping=True
        )
        
        # Get offset mapping and send input_ids and attention_mask to device
        offset_mapping = inputs.pop('offset_mapping').squeeze(0)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Get model predictions
        outputs = self.model(**inputs)
        
        # Get the most likely beginning and end of answer
        start_logits = outputs.start_logits.cpu().squeeze(0)
        end_logits = outputs.end_logits.cpu().squeeze(0)
        
        # Get the most likely answer span
        start_idx = torch.argmax(start_logits)
        end_idx = torch.argmax(end_logits)
        
        # Convert to actual text span using offset mapping
        start_char = int(offset_mapping[start_idx][0])
        end_char = int(offset_mapping[end_idx][1])
        
        # Get the answer text
        answer = context[start_char:end_char]
        
        # Calculate confidence scores
        start_prob = torch.softmax(start_logits, dim=0)[start_idx].item()
        end_prob = torch.softmax(end_logits, dim=0)[end_idx].item()
        confidence = (start_prob + end_prob) / 2
        
        return {
            'answer': answer,
            'confidence': confidence,
            'start_char': start_char,
            'end_char': end_char
        }

# Example usage
def main():
    # Initialize the QA system
    qa_system = QAInference()
    
    # Example context and question
    context = "The Python programming language was created by Guido van Rossum and was released in 1991. Python's name comes from Monty Python."
    question = "Who created Python?"
    
    # Get the answer
    result = qa_system.get_answer(question, context)
    
    # Print results
    print(f"Question: {question}")
    print(f"Answer: {result['answer']}")
    print(f"Confidence: {result['confidence']:.2%}")
    print(f"Answer span: characters {result['start_char']} to {result['end_char']}")

if __name__ == '__main__':
    main()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load(model_path, map_location=self.device)


Question: Who created Python?
Answer: Guido van Rossum
Confidence: 86.35%
Answer span: characters 47 to 63
