###**Modifying BERT for Q/A Squad dataset**

In [1]:
!pip install torch transformers datasets tqdm

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [2]:
!pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (54.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.0.5


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertModel, BertConfig
from datasets import load_dataset
from tqdm import tqdm

####**Define subjects and load dataset**

In [4]:
subjects = ['Science', 'Literature', 'Computation', 'History']
subject_to_id = {subj: idx for idx, subj in enumerate(subjects)}

# Load dataset
dataset = load_dataset('squad')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

####**Simulate a subject and tokenize the text**

In [5]:
class QADataset(torch.utils.data.Dataset):
    def __init__(self, dataset_split):
        self.data = dataset_split

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['context']
        question = item['question']
        answer_text = item['answers']['text'][0]
        answer_start = item['answers']['answer_start'][0]

        # Simulate a subject randomly
        subject = torch.tensor(subject_to_id[subjects[idx % len(subjects)]])

        # Tokenize
        inputs = tokenizer(context, question, return_tensors='pt', truncation=True, padding='max_length', max_length=384)

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        # Find start and end token positions
        start_positions = inputs.char_to_token(0, answer_start)
        end_positions = inputs.char_to_token(0, answer_start + len(answer_text) - 1)

        # If answer position is None (out of bounds), assign a default value
        # instead of creating a tensor directly
        if start_positions is None:
            start_positions = 0
        if end_positions is None:
            end_positions = 0

        # Convert to tensors after handling None values
        start_positions = torch.tensor(start_positions)
        end_positions = torch.tensor(end_positions)

        return input_ids, attention_mask, start_positions, end_positions, subject

####**BERT**

In [6]:
class CustomBertForQA(nn.Module):
    def __init__(self, model_name='bert-base-uncased', num_subjects=10):
        super(CustomBertForQA, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.subject_embedding = nn.Embedding(num_subjects, hidden_size)
        self.attention_pool = nn.MultiheadAttention(hidden_size, num_heads=8, batch_first=True)
        self.qa_outputs = nn.Linear(hidden_size, 2)  # start and end

    def forward(self, input_ids, attention_mask, subject_id):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        subject_embeds = self.subject_embedding(subject_id).unsqueeze(1)
        sequence_output = sequence_output + subject_embeds
        pooled_output, _ = self.attention_pool(sequence_output, sequence_output, sequence_output)
        logits = self.qa_outputs(pooled_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        return start_logits, end_logits

####**Training and Evaluation Functions**

In [7]:
def train(model, dataloader, optimizer, device):
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    total_loss = 0
    for input_ids, attention_mask, start_positions, end_positions, subject_id in tqdm(dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        start_positions = start_positions.to(device)
        end_positions = end_positions.to(device)
        subject_id = subject_id.to(device)

        optimizer.zero_grad()
        start_logits, end_logits = model(input_ids, attention_mask, subject_id)

        loss_start = loss_fn(start_logits, start_positions)
        loss_end = loss_fn(end_logits, end_positions)
        loss = (loss_start + loss_end) / 2

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Training Loss: {total_loss/len(dataloader)}")

# Evaluation function (Simple)
def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    for input_ids, attention_mask, start_positions, end_positions, subject_id in tqdm(dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        subject_id = subject_id.to(device)

        with torch.no_grad():
            start_logits, end_logits = model(input_ids, attention_mask, subject_id)
            start_pred = torch.argmax(start_logits, dim=1)
            end_pred = torch.argmax(end_logits, dim=1)

        correct += ((start_pred == start_positions.to(device)) & (end_pred == end_positions.to(device))).sum().item()
        total += input_ids.size(0)
    print(f"Evaluation Accuracy: {correct/total:.4f}")

####**Train and evaluate the model**

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create datasets and dataloaders
train_dataset = QADataset(dataset['train'].select(range(300)))  # Use smaller subset for fast training
val_dataset = QADataset(dataset['validation'].select(range(300)))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Model, optimizer
model = CustomBertForQA(num_subjects=len(subjects)).to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(3):  # Train for 3 epochs
    print(f"Epoch {epoch+1}")
    train(model, train_loader, optimizer, device)
    evaluate(model, val_loader, device)

print("Training Finished!")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1


100%|██████████| 19/19 [00:20<00:00,  1.08s/it]


Training Loss: 5.436970259013929


100%|██████████| 19/19 [00:06<00:00,  3.05it/s]


Evaluation Accuracy: 0.0533
Epoch 2


100%|██████████| 19/19 [00:19<00:00,  1.04s/it]


Training Loss: 4.041204402321263


100%|██████████| 19/19 [00:06<00:00,  2.99it/s]


Evaluation Accuracy: 0.0700
Epoch 3


100%|██████████| 19/19 [00:20<00:00,  1.08s/it]


Training Loss: 3.2182365844124243


100%|██████████| 19/19 [00:06<00:00,  2.87it/s]

Evaluation Accuracy: 0.0433
Training Finished!





In [9]:
torch.save(model.state_dict(), "custom_bert_qa.pth")

####**Additional rephrase**

In [10]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load T5-small
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

def rephrase_answer(answer_text):
    input_text = f"Rephrase: {answer_text}"
    input_ids = t5_tokenizer(input_text, return_tensors='pt').input_ids.to(device)
    outputs = t5_model.generate(input_ids, max_length=50)
    return t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example:
answer = "The capital of France is Paris."
print(rephrase_answer(answer))

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Rephrase: La capitale de France est Paris.


###**A trial on simple paragraph**

In [14]:
def answer_question(question, context, subject_idx, model, tokenizer, device):
    model.eval()
    inputs = tokenizer(
        context,
        question,
        add_special_tokens=True,
        return_tensors="pt",
        truncation=True,
        max_length=512
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Add subject index tensor
    subject_idx_tensor = torch.tensor([subject_idx]).to(device)

    with torch.no_grad():
        start_logits, end_logits = model(input_ids, attention_mask, subject_idx_tensor)

    start_idx = torch.argmax(start_logits, dim=1).item()
    end_idx = torch.argmax(end_logits, dim=1).item()

    if start_idx > end_idx:
        return "Sorry, could not find a good answer."

    answer_tokens = input_ids[0][start_idx:end_idx+1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    return answer

In [16]:
# Assume tokenizer is same as the one you used inside QADataset
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Example context (small science paragraph)
context = """
The Earth revolves around the Sun once every 365.25 days. This revolution, along with the axial tilt of Earth, results in seasons. The gravitational pull between the Earth and the Moon causes ocean tides.
"""

# Example questions
question1 = "How long does the Earth take to revolve around the Sun?"
question2 = "What causes ocean tides?"

subject_idx = 0  # Example: Science subject index

# Get answers
answer1 = answer_question(question1, context, subject_idx, model, tokenizer, device)
answer2 = answer_question(question2, context, subject_idx, model, tokenizer, device)

print(f"Q1: {question1}")
print(f"A1: {answer1}\n")

print(f"Q2: {question2}")
print(f"A2: {answer2}\n")

Q1: How long does the Earth take to revolve around the Sun?
A1: 365. 25 days

Q2: What causes ocean tides?
A2: 365. 25 days

