In [None]:
!pip install torch



In [None]:
import torch
from transformers import LongformerTokenizer, LongformerForQuestionAnswering, LongformerTokenizerFast
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# **1. Tải Tokenizer và mô hình**

In [None]:
# Checkpoint identifier shared by the tokenizer and the model below.
model_name = "allenai/longformer-base-4096"
# Fast tokenizer variant: preprocess_function asks it for offset mappings.
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)
# The load log reports qa_outputs.* as newly initialized, i.e. the QA head
# is untrained until the fine-tuning loop has been run.
model = LongformerForQuestionAnswering.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForQuestionAnswering were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **2. Load dataset (SQuAD v1.1)**

In [None]:
# Load the "squad" dataset; the printed summary shows train and validation splits.
dataset = load_dataset("squad")

README.md: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/597M [00:00<?, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Display the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
# Inspect one raw training record: 'answers' holds parallel lists of answer
# texts and their character start offsets into 'context'.
dataset["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [None]:
def _find_answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Map a character span of the context onto token indices.

    Args:
        offsets: Per-token (start_char, end_char) pairs for one encoded example.
        sequence_ids: Per-token sequence id for the same example: 0 for
            question tokens, 1 for context tokens, None for special/padding.
        start_char: Inclusive character offset of the answer in the context.
        end_char: Exclusive character offset of the answer in the context.

    Returns:
        (token_start, token_end), both inclusive token indices. (0, 0) is
        returned when the span cannot be located completely inside the context
        tokens (for example because the answer was truncated away).
    """
    token_start = None
    token_end = None

    for idx, ((tok_start, tok_end), seq_id) in enumerate(zip(offsets, sequence_ids)):
        # Offsets are relative to the sequence a token came from, so they
        # restart at 0 for the context. Skipping everything that is not a
        # context token prevents question tokens from matching the answer's
        # character range.
        if seq_id != 1:
            continue
        if token_start is None and tok_start <= start_char < tok_end:
            token_start = idx
        if tok_start < end_char <= tok_end:
            token_end = idx
            break

    # A half-found span is not a usable label; fall back to the first token.
    if token_start is None or token_end is None:
        return 0, 0
    return token_start, token_end


def preprocess_function(rows):
    """Tokenize a batch of SQuAD rows and attach answer token positions.

    Args:
        rows: Batch dict with "question", "context" and "answers" columns.
            Each entry of "answers" holds "text" and "answer_start" lists;
            only the first answer of each row is used.

    Returns:
        The tokenizer output (input_ids, attention_mask, offset_mapping, ...)
        extended with "start_position" and "end_position": one inclusive token
        index per row. Rows whose answer is missing or cannot be located in
        the encoded context get (0, 0).
    """
    questions = [q.strip() for q in rows["question"]]
    contexts = rows["context"]
    answers = rows["answers"]

    # Encode question/context pairs. Only the context may be truncated, and
    # every example is padded to the same length.
    # Example output shape:
    #   'input_ids':      [[0, 1129, 3124, 16, 5, 2232, 35, 2, 2, ...]]
    #   'attention_mask': [[1, 1, 1, 0, 0, 0]]   (1 = token, 0 = padding)
    #   'offset_mapping': [[(0, 0), (0, 4), (5, 10), (11, 13)]]
    encodings = tokenizer(
        questions,
        contexts,
        max_length=4096,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True
    )

    start_position = []
    end_position = []

    # For each example, convert the answer's character span to token indices.
    for i, offset_mapping in enumerate(encodings["offset_mapping"]):
        answer = answers[i]

        # Rows without an annotated answer are labelled with the first token.
        if not answer["answer_start"]:
            start_position.append(0)
            end_position.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        token_start_index, token_end_index = _find_answer_token_span(
            offset_mapping,
            encodings.sequence_ids(i),
            start_char,
            end_char,
        )

        start_position.append(token_start_index)
        end_position.append(token_end_index)

    encodings.update(
        {
            "start_position": start_position,
            "end_position": end_position
        }
    )

    return encodings

# I love Machine Learning
# (0,1), (2,6), (7,14), (15,23)

In [None]:
# Run preprocess_function over every split, one batch of rows at a time.
encoded_dataset = dataset.map(preprocess_function, batched=True)
# Expose only the four columns the training/evaluation loops read, in "torch" format.
encoded_dataset.set_format("torch", columns=["input_ids", "attention_mask", "start_position", "end_position"])

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Separate handles for the encoded train and validation splits.
train_dataset = encoded_dataset["train"]
val_dataset = encoded_dataset["validation"]

In [None]:
# Only a subset is used: the first 4000 training rows (shuffled each epoch)
# and the first 1000 validation rows, both in batches of 4.
train_dataloader = DataLoader(train_dataset.select(range(4000)), batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset.select(range(1000)), batch_size=4)

# **3. Huấn luyện mô hình**

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to that device (as the last expression, the cell also displays the model).
model.to(device)

LongformerForQuestionAnswering(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (

In [None]:
# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Fine-tune the model on the training subset.
num_epochs = 5
model.train()

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Running sum of the per-batch losses for this epoch.
    total_loss = 0

    for batch in tqdm(train_dataloader):
        # Move the four tensors of the batch to the training device.
        input_ids, attention_mask, start_position, end_position = (
            batch[column].to(device)
            for column in ("input_ids", "attention_mask", "start_position", "end_position")
        )

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass; passing the label positions makes the model return a loss.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            start_positions=start_position,
            end_positions=end_position
        )
        loss = outputs.loss

        # Backpropagate, then update the weights.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Average loss: {average_loss:.2f}")

Epoch 1/5


 19%|█▉        | 192/1000 [04:10<17:28,  1.30s/it]

In [None]:
# Exact-match evaluation: a prediction counts as correct only when both the
# predicted start and end token indices equal the labelled ones.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in tqdm(val_dataloader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # The label columns are named "start_position"/"end_position" (singular)
        # by preprocess_function and set_format; the plural keys do not exist.
        start_positions = batch["start_position"].to(device)
        end_positions = batch["end_position"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Argmax over the sequence dimension of start_logits gives one
        # predicted start token index per example in the batch.
        predicted_start = torch.argmax(outputs.start_logits, dim=-1)
        # Same for the end index.
        predicted_end = torch.argmax(outputs.end_logits, dim=-1)

        correct += ((predicted_start == start_positions) & (predicted_end == end_positions)).sum().item()
        total += start_positions.size(0)

# Guard against an empty validation loader.
accuracy = correct / total if total else 0.0
print(f"Validation Accuracy: {accuracy:.4f}")

In [None]:
# Make sure the model is on the target device and in evaluation mode before
# running predict_answer.
model.to(device)
model.eval()

In [None]:
def predict_answer(question, context):
    """Extract the answer to `question` from `context` with the QA model.

    The span is chosen as the pair (start, end) with start <= end that
    maximizes start_logit + end_logit, considering only context tokens.
    Picking start and end independently can yield end < start (an empty
    answer) or positions inside the question or the padding.

    Args:
        question: Question text.
        context: Passage that is searched for the answer.

    Returns:
        Tuple (answer, start_index, end_index): the decoded answer text and
        the inclusive token indices of the selected span in the encoded input.
        ("", 0, 0) is returned when the encoded input has no context tokens.
    """
    # Encode the question/context pair
    inputs = tokenizer(
        question,
        context,
        max_length=4096,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Move the inputs to the model's device
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # True only at positions that belong to the context (sequence id 1);
    # question tokens have id 0, special and padding tokens have None.
    context_mask = torch.tensor(
        [seq_id == 1 for seq_id in inputs.sequence_ids(0)],
        dtype=torch.bool,
        device=device,
    )
    if not context_mask.any():
        return "", 0, 0

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Exclude every non-context position from the search.
        start_logits = outputs.start_logits[0].masked_fill(~context_mask, float("-inf"))
        end_logits = outputs.end_logits[0].masked_fill(~context_mask, float("-inf"))

        # best_start_score[j] is the highest start logit at any position <= j,
        # so best_start_score[j] + end_logits[j] is the best span score ending
        # at j. This enforces start <= end in a single linear pass.
        best_start_score, best_start_index = torch.cummax(start_logits, dim=0)
        end_index = torch.argmax(best_start_score + end_logits).item()
        start_index = best_start_index[end_index].item()

        # Convert the selected tokens back to text
        answer_tokens = input_ids[0, start_index:end_index + 1]
        answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer, start_index, end_index

In [None]:
# 4. Test with a sample input
question = "Who is the president of the United States in 2023?"
context = (
    "In 2023, the president of the United States was Joe Biden. He took office on January 20, 2021, "
    "following his victory in the 2020 presidential election. His administration focused on issues such as "
    "climate change, healthcare reform, and economic recovery following the COVID-19 pandemic."
)

In [None]:
# Run the sample through predict_answer and report the result, one item per line.
answer, start_index, end_index = predict_answer(question, context)
print(
    f"Question: {question}",
    f"Context: {context}",
    f"Answer: {answer}",
    f"Start index: {start_index}, End index: {end_index}",
    sep="\n",
)