In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

# **Cấu trúc Longformer Self Attention**

In [None]:
class LongformerSelfAttention(nn.Module):
    """Multi-head self-attention limited to a sliding window plus global tokens.

    Each query position ``i`` may attend to key position ``j`` when
    ``|i - j| <= window_size // 2``. In addition, the first
    ``num_global_tokens`` positions attend to every position and are attended
    to by every position.

    Note: the full ``seq_len x seq_len`` score matrix is still computed and
    then masked, so this reproduces the attention *pattern* only.

    Args:
        hidden_size: Size of the input/output feature dimension.
        num_heads: Number of attention heads; must divide ``hidden_size``.
        window_size: Total window width (half is applied on each side).
        num_global_tokens: Number of leading positions given global attention.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``num_heads``.
    """

    def __init__(self, hidden_size, num_heads, window_size=512, num_global_tokens=1):
        super().__init__()
        if hidden_size % num_heads != 0:
            # Otherwise the per-head reshape in forward() cannot split the
            # feature dimension evenly.
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
            )
        # Embedding size (dimension of the input/output features)
        self.hidden_size = hidden_size
        # Number of heads in multi-head attention
        self.num_heads = num_heads
        # Feature size handled by each head
        self.head_size = hidden_size // num_heads
        self.window_size = window_size
        self.num_global = num_global_tokens

        # Linear projections producing Q, K, V and the final output projection
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, hidden_size)

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape (batch, seq, hidden) into (batch, heads, seq, head_size)."""
        return projected.view(batch_size, seq_len, self.num_heads, self.head_size).transpose(1, 2)

    def forward(self, hidden_states):
        """Apply windowed + global self-attention.

        Args:
            hidden_states: Tensor of shape ``(batch, seq_len, hidden_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden_size)``.
        """
        batch_size = hidden_states.size(0)
        seq_len = hidden_states.size(1)
        Q = self._split_heads(self.query(hidden_states), batch_size, seq_len)
        K = self._split_heads(self.key(hidden_states), batch_size, seq_len)
        V = self._split_heads(self.value(hidden_states), batch_size, seq_len)

        # Scaled dot-product scores: (batch, heads, seq_len, seq_len)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_size ** 0.5)

        # Half of the window on each side (e.g. 512 // 2 => 256 before/after)
        half_w = self.window_size // 2
        positions = torch.arange(seq_len, device=scores.device)
        # distance[i, j] = |i - j|; True where key j lies inside the window of query i
        distance = (positions.view(seq_len, 1) - positions.view(1, seq_len)).abs()
        allowed = distance <= half_w

        # Global attention for the leading tokens (e.g. <cls>). Clamp to the
        # sequence length so short inputs do not index out of range.
        num_global = min(self.num_global, seq_len)
        if num_global > 0:
            # Every query may attend to the global columns ...
            allowed[:, :num_global] = True
            # ... and the global rows may attend to every key.
            allowed[:num_global, :] = True

        # Disallowed positions become -inf, so softmax gives them weight 0.
        # masked_fill keeps the dtype of `scores` and broadcasts the
        # (seq_len, seq_len) mask over batch and heads.
        scores = scores.masked_fill(~allowed, float('-inf'))
        attn_weights = F.softmax(scores, dim=-1)
        context = (
            torch.matmul(attn_weights, V)
            .transpose(1, 2)
            .contiguous()
            .view(batch_size, seq_len, self.hidden_size)
        )
        return self.output(context)

# **Longformer Layer**

In [None]:
class LongformerLayer(nn.Module):
    """One encoder layer: windowed self-attention followed by a feed-forward block.

    Both sub-blocks use a residual connection followed by LayerNorm
    (post-norm ordering).

    Args:
        hidden_size: Feature dimension of the layer input and output.
        num_heads: Number of attention heads passed to the attention module.
        window_size: Attention window width passed to the attention module.
    """

    def __init__(self, hidden_size, num_heads, window_size):
        super().__init__()
        self.attention = LongformerSelfAttention(hidden_size, num_heads, window_size)

        # Position-wise feed-forward network with a 4x expansion; GELU
        # supplies the non-linearity between the two projections.
        intermediate_size = hidden_size * 4
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, intermediate_size),
            nn.GELU(),
            nn.Linear(intermediate_size, hidden_size),
        )

        # Layer norms applied after each residual sum to stabilise training
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

    def forward(self, hidden_states):
        """Return the layer output for ``hidden_states`` (same shape as the input)."""
        # Attention sub-block: residual add, then normalise
        attended = self.norm1(hidden_states + self.attention(hidden_states))
        # Feed-forward sub-block: residual add, then normalise
        return self.norm2(attended + self.ffn(attended))

In [None]:
class Longformer(nn.Module):
    """Small Longformer-style encoder with a classification head.

    Token and learned position embeddings are summed, passed through
    ``num_layers`` ``LongformerLayer`` blocks, and the representation at
    position 0 is projected to ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the token embedding table.
        hidden_size: Embedding / hidden feature dimension.
        num_layers: Number of stacked ``LongformerLayer`` blocks.
        num_heads: Attention heads per layer.
        window_size: Attention window width per layer.
        max_position_embeddings: Longest sequence length supported.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, hidden_size=256, num_layers=2, num_heads=4, window_size=512, max_position_embeddings=1024, num_classes=2):
        super().__init__()
        # Token embedding: maps each token id to a hidden_size vector
        self.embeddings = nn.Embedding(vocab_size, hidden_size)
        # Position embedding: maps position ids 0..max_position_embeddings-1
        # to hidden_size vectors
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        # Each layer refines the representation produced by the previous one
        self.layers = nn.ModuleList([LongformerLayer(hidden_size, num_heads, window_size) for _ in range(num_layers)])
        # Projects the final position-0 representation to class logits
        self.pooler = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids):
        """Compute class logits.

        Args:
            input_ids: Long tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Tensor of logits with shape ``(batch, num_classes)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the position embedding table.
        """
        seq_len = input_ids.size(1)
        max_positions = self.position_embeddings.num_embeddings
        if seq_len > max_positions:
            # Fail with a clear message instead of an out-of-range embedding
            # lookup.
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_position_embeddings ({max_positions})"
            )

        # Shape (1, seq_len); broadcasts over the batch when added below
        position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device).unsqueeze(0)
        # Out-of-place sum so the token embedding output is not mutated
        hidden_states = self.embeddings(input_ids) + self.position_embeddings(position_ids)

        for layer in self.layers:
            hidden_states = layer(hidden_states)

        # Classify from the first position (the CLS token)
        return self.pooler(hidden_states[:, 0, :])

# **Dataset**

In [None]:
# Custom Dataset for IMDB
class IMDBDataset(Dataset):
    """Map-style dataset that tokenizes one review per item on access.

    Args:
        texts: Indexable collection of review strings.
        labels: Indexable collection of integer class labels, aligned with ``texts``.
        tokenizer: Callable accepting ``(text, truncation, padding, max_length,
            return_tensors)`` and returning a mapping with an ``'input_ids'``
            tensor of shape ``(1, max_len)``.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{'input_ids': (max_len,) tensor, 'label': scalar long tensor}``."""
        encoding = self.tokenizer(self.texts[idx], truncation=True, padding='max_length', max_length=self.max_len, return_tensors='pt')
        return {
            # Drop only the leading batch dimension; a bare squeeze() would
            # also remove the sequence dimension when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [None]:
# Load the IMDB dataset and pull out the text/label columns of each split
dataset = load_dataset('imdb')
train_texts, train_labels = dataset["train"]["text"], dataset["train"]["label"]
test_texts, test_labels = dataset["test"]["text"], dataset["test"]["label"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# Size the embedding table with len(tokenizer): unlike `tokenizer.vocab_size`
# it also counts any added tokens, so every id the tokenizer can emit has a row.
vocab_size = len(tokenizer)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Datasets and DataLoaders
train_dataset = IMDBDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=512)
test_dataset = IMDBDataset(texts=test_texts, labels=test_labels, tokenizer=tokenizer, max_len=512)

# Small batches to save memory; only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8, shuffle=False)

In [None]:
# Model, optimizer and loss
# Seed the global torch RNG before the model is built so the weight
# initialisation (and any later draws from the global RNG) is repeatable
# across Restart & Run All.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Longformer(vocab_size=vocab_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train (3 epochs for the demo; a real run needs 10+)
model.train()
for epoch in range(3):
    # Per-batch loss values, averaged once the epoch finishes
    batch_losses = []
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        logits = model(batch['input_ids'].to(device))
        loss = criterion(logits, batch['label'].to(device))
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    avg_loss = sum(batch_losses) / len(train_loader)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

Epoch 1: 100%|██████████| 3125/3125 [02:13<00:00, 23.48it/s]


Epoch 1, Average Loss: 0.5621


Epoch 2: 100%|██████████| 3125/3125 [02:04<00:00, 25.01it/s]


Epoch 2, Average Loss: 0.4447


Epoch 3: 100%|██████████| 3125/3125 [02:05<00:00, 24.97it/s]

Epoch 3, Average Loss: 0.3521





In [None]:
# Prediction helper for arbitrary input text
def predict_text(model, tokenizer, text, device, max_len=512):
    """Classify a single text as positive or negative.

    Puts ``model`` in eval mode, tokenizes ``text`` (truncated/padded to
    ``max_len``), runs one forward pass without gradients and reads the
    result for the first (only) row.

    Args:
        model: Callable module mapping ``input_ids`` to logits of shape (1, 2).
        tokenizer: Callable returning a mapping with an ``'input_ids'`` tensor.
        text: The text to classify.
        device: Device the input ids are moved to.
        max_len: Sequence length used for truncation and padding.

    Returns:
        Tuple ``(label_text, prob_positive)`` where ``label_text`` is
        ``"positive"`` when class 1 has the highest logit, else ``"negative"``,
        and ``prob_positive`` is the softmax probability of class 1.
    """
    model.eval()

    # Tokenize the input and move the ids to the target device
    encoded = tokenizer(text, truncation=True, padding='max_length', max_length=max_len, return_tensors='pt')
    token_ids = encoded['input_ids'].to(device)

    # Forward pass without tracking gradients
    with torch.no_grad():
        logits = model(token_ids)
        class_probs = F.softmax(logits, dim=-1)  # logits -> probabilities
        predicted_class = torch.max(logits, 1).indices  # index of the largest logit

    # Class 1 is positive; anything else is reported as negative
    if predicted_class.item() == 1:
        label_text = "positive"
    else:
        label_text = "negative"

    # Probability assigned to the positive class for the first row
    return label_text, class_probs[0][1].item()

In [None]:
# Sanity check on a hand-written, clearly favourable review: run it through
# predict_text and print the predicted label with the positive-class probability.
sample_text = """
Christopher Nolan's Inception is nothing short of a cinematic triumph, a film that redefines the boundaries of imagination, storytelling,
and visual spectacle. Released in 2010, this mind-bending sci-fi thriller starring Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen Page,
and a stellar ensemble cast, has captivated audiences worldwide with its intricate plot, breathtaking visuals, and profound exploration of
the human subconscious. As someone who has revisited this film multiple times, I can confidently say that Inception isn't just a movie—it's
an experience that lingers long after the credits roll, inspiring endless discussions and rewatches. In this extensive review, I'll delve into
why this film deserves its place as one of the greatest achievements in modern cinema, highlighting its narrative brilliance, technical mastery,
character depth, thematic richness, and enduring cultural impact.
"""
label, prob = predict_text(model, tokenizer, sample_text, device)
print(f"Predicted label: {label} (Probability of positive: {prob})")

Predicted label: positive (Probability of positive: 0.8178398609161377)


In [None]:
# Sanity check on a hand-written, clearly unfavourable review of the same film.
# Note: this rebinds `sample_text`, `label` and `prob` from the previous cell.
sample_text = """
Christopher Nolan's Inception from 2010 promised to be a groundbreaking sci-fi thriller, but instead, it delivers a convoluted mess that prioritizes
style over substance, leaving audiences more confused than captivated. Starring Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen Page, and a cast that
seems wasted on underdeveloped roles, this film attempts to explore the depths of the subconscious but ends up drowning in its own overly complicated
plot twists and pseudo-intellectual babble. As a viewer who approached it with high expectations only to be let down repeatedly, I find Inception to
be an overrated spectacle that exemplifies Nolan's worst tendencies: pretentious world-building, emotional shallowness, and a reliance on gimmicks that
fail to mask the narrative's glaring flaws. In this detailed critique, I'll unpack why this movie falls short in nearly every aspect, from its muddled
storytelling and lackluster characters to its overhyped visuals and thematic superficiality, ultimately making it a frustrating watch that doesn't live
up to the hype.
"""
label, prob = predict_text(model, tokenizer, sample_text, device)
print(f"Predicted label: {label} (Probability of positive: {prob})")

Predicted label: negative (Probability of positive: 0.010153206996619701)
