**Cell 1**

In [1]:
import torch
print(torch.__version__)

2.4.0


In [2]:
!pip install torch transformers datasets



**Cell 2**

In [3]:
from transformers import AutoTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch import nn, optim
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn.functional as F
from tqdm import tqdm
import time
from datasets import load_dataset
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

**Cell 3**

In [4]:
dataset = load_dataset("squad")
train_dataset = dataset['train']
eval_dataset = dataset['validation']

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

**Cell 4**

In [5]:
from transformers import T5TokenizerFast
tokenizer = T5TokenizerFast.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

**Cell 5**

In [6]:
def preprocess_data(dataset, tokenizer, max_length):
    input_ids = []
    attention_masks = []
    start_positions = []
    end_positions = []

    for item in tqdm(dataset, desc="Preprocessing Data"):
        question = item['question']
        context = item['context']
        start_char_idx = item['answers']['answer_start'][0]
        end_char_idx = start_char_idx + len(item['answers']['text'][0])

        encoding = tokenizer(
            question,
            context,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        offsets = encoding['offset_mapping'][0]
        start_token_idx = None
        end_token_idx = None

        for idx, (start, end) in enumerate(offsets):
            if start <= start_char_idx < end:
                start_token_idx = idx
            if start < end_char_idx <= end:
                end_token_idx = idx

        if start_token_idx is not None and end_token_idx is not None:
            input_ids.append(encoding['input_ids'].squeeze(0))
            attention_masks.append(encoding['attention_mask'].squeeze(0))
            start_positions.append(start_token_idx)
            end_positions.append(end_token_idx)

    return {
        'input_ids': torch.stack(input_ids),
        'attention_masks': torch.stack(attention_masks),
        'start_positions': torch.tensor(start_positions),
        'end_positions': torch.tensor(end_positions)
}

max_length = 512
train_data = preprocess_data(train_dataset, tokenizer, max_length)
eval_data = preprocess_data(eval_dataset, tokenizer, max_length)


Preprocessing Data: 100%|██████████| 87599/87599 [25:29<00:00, 57.29it/s]
Preprocessing Data: 100%|██████████| 10570/10570 [03:05<00:00, 57.00it/s]


**Cell 6**

In [7]:
train_dataset = TensorDataset(
    train_data['input_ids'], 
    train_data['attention_masks'], 
    train_data['start_positions'], 
    train_data['end_positions']
)

eval_dataset = TensorDataset(
    eval_data['input_ids'], 
    eval_data['attention_masks'], 
    eval_data['start_positions'], 
    eval_data['end_positions']
)

batch_size = 4
num_workers = 4



**cell 7**

In [8]:
# Define collate function to handle padding
def collate_fn(batch):
    input_ids = torch.stack([item[0] for item in batch])
    attention_masks = torch.stack([item[1] for item in batch])
    start_positions = torch.tensor([item[2] for item in batch])
    end_positions = torch.tensor([item[3] for item in batch])
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_masks,
        'start_positions': start_positions,
        'end_positions': end_positions
    }

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)

**Cell 8**

In [9]:
batch_size = 16
num_workers = 4  # Optimize data loading

# Assuming your dataset is already tokenized in `train_loader` and `eval_loader`
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)

**Cell 9**

In [10]:
vocab_size = tokenizer.vocab_size
embedding_dim = 512

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerDecoderModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, num_layers, hidden_dim):
        super(TransformerDecoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim, dropout=0.1)
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=embedding_dim, nhead=num_heads),
            num_layers=num_layers
        )
        self.start_linear = nn.Linear(embedding_dim, vocab_size)
        self.end_linear = nn.Linear(embedding_dim, vocab_size)
        self.dropout = nn.Dropout(0.1)
        self.layer_norm = nn.LayerNorm(embedding_dim)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        tgt_emb = self.embedding(tgt)
        memory_emb = self.embedding(memory)

        tgt_emb = self.positional_encoding(tgt_emb)
        memory_emb = self.positional_encoding(memory_emb)

        output = self.decoder(
            tgt=tgt_emb.transpose(0, 1),
            memory=memory_emb.transpose(0, 1),
            tgt_mask=tgt_mask, memory_mask=memory_mask,
            tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask
        )
        
        output = output.transpose(0, 1)
        output = self.layer_norm(output)
        
        start_logits = self.start_linear(self.dropout(output))
        end_logits = self.end_linear(self.dropout(output))

        return start_logits, end_logits

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.to(device)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

**Cell 10**

In [11]:
from torch.optim.lr_scheduler import ReduceLROnPlateau



def validate_target_indices(target, num_classes):
    invalid_indices = (target < 0) | (target >= num_classes)
    if invalid_indices.any():
        raise ValueError(f"Target contains invalid indices: {target[invalid_indices]}")

class LabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing=0.1, ignore_index=-100):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.ignore_index = ignore_index

    def forward(self, pred, target):
        num_classes = pred.size(-1)
        mask = target != self.ignore_index
        target = target[mask]
        pred = pred[mask]
        
        confidence = 1.0 - self.smoothing
        low_confidence = self.smoothing / num_classes
        target_smooth = torch.full_like(pred, low_confidence, dtype=torch.float)
        target_smooth.scatter_(1, target.unsqueeze(1), confidence)
        log_probs = F.log_softmax(pred, dim=-1)
        loss = torch.sum(-target_smooth * log_probs, dim=-1)
        return loss.mean()

# **Cell 11**

In [12]:
from torch.cuda.amp import GradScaler, autocast
num_epochs = 5
loss_fn = nn.CrossEntropyLoss()
scaler = GradScaler()

  scaler = GradScaler()


### **Cell 12**

In [13]:
def exact_match(predictions, labels):
    start_predictions, end_predictions = predictions
    start_labels, end_labels = labels

    # Convert logits to predictions
    start_predictions = torch.argmax(start_predictions, dim=-1)
    end_predictions = torch.argmax(end_predictions, dim=-1)

    start_match = (start_predictions == start_labels).float().mean().item()
    end_match = (end_predictions == end_labels).float().mean().item()

    return (start_match + end_match) / 2

**Cell 13**

In [14]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Calculate total steps
total_steps = len(train_loader) * num_epochs

# Define the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=total_steps)



**Cell 14**

In [15]:
def compute_em(predicted, actual):
    """Computes the Exact Match (EM) score between the predicted and actual answer."""
    return int(predicted.strip() == actual.strip())

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    total_em = 0
    total_count = 0

    print(f"Starting Epoch  {epoch + 1}")  # Add this to see when the epoch starts

    for step, batch in enumerate(train_loader):
        print(f"Processing batch {step + 1}/{len(train_loader)}")  # Log each batch

        input_ids = batch['input_ids'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        target_texts = [tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i] + 1]) for i in range(len(input_ids))]
        labels = tokenizer(target_texts, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        with torch.no_grad():
            generated_ids = model.generate(input_ids, max_length=50)
            predicted_answers = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]

        em_batch = sum(compute_em(pred, actual) for pred, actual in zip(predicted_answers, target_texts))
        total_em += em_batch
        total_count += len(target_texts)

        # Print loss and EM score periodically
        if step % 10 == 0:  # Adjust frequency as needed
            print(f"Batch {step + 1} | Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    avg_em = (total_em / total_count) * 100

    print(f"Epoch {epoch + 1} | Loss: {avg_loss:.4f} | EM: {avg_em:.2f}%")



Starting Epoch -1-1-1-1--1-1-1-1-1 1
Processing batch 1/5474
Batch 1 | Loss: 5.2175
Processing batch 2/5474
Processing batch 3/5474
Processing batch 4/5474
Processing batch 5/5474
Processing batch 6/5474
Processing batch 7/5474
Processing batch 8/5474
Processing batch 9/5474
Processing batch 10/5474
Processing batch 11/5474
Batch 11 | Loss: 4.6332
Processing batch 12/5474
Processing batch 13/5474
Processing batch 14/5474
Processing batch 15/5474
Processing batch 16/5474
Processing batch 17/5474
Processing batch 18/5474
Processing batch 19/5474
Processing batch 20/5474
Processing batch 21/5474
Batch 21 | Loss: 3.0145
Processing batch 22/5474
Processing batch 23/5474
Processing batch 24/5474
Processing batch 25/5474
Processing batch 26/5474
Processing batch 27/5474
Processing batch 28/5474
Processing batch 29/5474
Processing batch 30/5474
Processing batch 31/5474
Batch 31 | Loss: 1.4587
Processing batch 32/5474
Processing batch 33/5474
Processing batch 34/5474
Processing batch 35/5474
Pr

**Cell 15**

In [16]:
def generate_answer(input_text, model, tokenizer):
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    generated_ids = model.generate(input_ids, max_length=50)
    answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return answer

example_question = "What is the capital of France?"
answer = generate_answer(example_question, model, tokenizer)
print(f"Predicted Answer: {answer}")

Predicted Answer: capital


### **Cell 16**

In [18]:
def evaluate_model(data_loader, model, tokenizer):
    model.eval()
    total_em = 0
    total_count = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        target_texts = [tokenizer.decode(input_ids[i][start_positions[i]:end_positions[i] + 1]) for i in range(len(input_ids))]

        with torch.no_grad():
            generated_ids = model.generate(input_ids, max_length=50)
            predicted_answers = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]

        em_batch = sum(compute_em(pred, actual) for pred, actual in zip(predicted_answers, target_texts))
        total_em += em_batch
        total_count += len(target_texts)

    avg_em = (total_em / total_count) * 100
    print(f"Evaluation EM: {avg_em:.2f}%")
    return avg_em

# Evaluate on validation set
evaluate_model(eval_loader, model, tokenizer)

Evaluation EM: 58.63%


58.626243486499284