# 1. Cài đặt Thư viện

In [1]:
!pip install transformers datasets torch numpy tqdm tensorboard

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# 2. Import các thư viện

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, disable_caching
from tqdm.notebook import tqdm
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

# Seed every random number generator so that runs are reproducible
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # Local import: the stdlib `random` module is not imported at the top of the notebook.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every CUDA device; PyTorch silently ignores the call when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)

set_seed()

# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


# 3. Load dataset

In [3]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import torch

# Initialise the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Parquet file of each split, relative to the dataset root on the Hugging Face Hub
splits = dict(
    train='data/train-00000-of-00001.parquet',
    validation='data/validation-00000-of-00001.parquet',
    test='data/test-00000-of-00001.parquet',
)

# Read the train split first, then the validation split (the test split is not loaded)
train_df, val_df = (
    pd.read_parquet("hf://datasets/stanfordnlp/sst2/" + splits[split_name])
    for split_name in ("train", "validation")
)

# Custom Dataset class
class SST2Dataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per access.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (the
    tokenizer is asked to pad/truncate to ``max_length``) plus ``labels``,
    the row's label as a ``torch.long`` tensor.

    Args:
        dataframe: Table with a ``sentence`` column and a ``label`` column.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')`` whose
            result is indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Target sequence length passed to the tokenizer.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx`` and return it as a dict of tensors."""
        # Look the row up once instead of once per column.
        row = self.data.iloc[idx]

        # Tokenize text
        encoding = self.tokenizer(
            row['sentence'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # Drop only the leading batch axis; a bare squeeze() would also
            # collapse the sequence axis whenever it has length 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(row['label'], dtype=torch.long)
        }

# Wrap each DataFrame in the tokenizing dataset
train_dataset = SST2Dataset(train_df, tokenizer)
val_dataset = SST2Dataset(val_df, tokenizer)

# Batch both datasets; only the training loader shuffles
BATCH_SIZE = 16
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Inspect one batch
sample_batch = next(iter(train_dataloader))
print("\nSample batch shape:")
print(f"Input IDs shape: {sample_batch['input_ids'].shape}")
print(f"Attention mask shape: {sample_batch['attention_mask'].shape}")
print(f"Labels shape: {sample_batch['labels'].shape}")

# Show the first few raw examples
print("\nSample data examples:")
for i in range(3):
    row = train_df.iloc[i]
    text, label = row['sentence'], row['label']
    sentiment = 'Positive' if label == 1 else 'Negative'
    print(f"\nText: {text}")
    print(f"Label: {sentiment}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Training samples: 67349
Validation samples: 872

Sample batch shape:
Input IDs shape: torch.Size([16, 128])
Attention mask shape: torch.Size([16, 128])
Labels shape: torch.Size([16])

Sample data examples:

Text: hide new secretions from the parental units 
Label: Negative

Text: contains no wit , only labored gags 
Label: Negative

Text: that loves its characters and communicates something rather beautiful about human nature 
Label: Positive


# 4. Định nghĩa model

In [4]:
class _BertSequenceClassifier(nn.Module):
    """Wrapper around a pretrained sequence-classification model that returns raw logits.

    Args:
        num_labels: Number of output classes of the classification head.
        model_name: Checkpoint identifier passed to ``from_pretrained``.
    """

    def __init__(self, num_labels=2, model_name="bert-base-uncased"):
        super().__init__()
        # The attribute stays named `bert` so state_dict keys remain `bert.*`.
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only the ``logits`` of its output."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        return outputs.logits


class PPOModel(_BertSequenceClassifier):
    """Classifier used by the PPO training loop; its logits are soft-maxed into action probabilities there."""


class DQNModel(_BertSequenceClassifier):
    """Classifier used by the DQN training loop; its logits are read as one Q-value per class there."""

# 5. PPO Training

In [5]:
class PPOTrainer:
    """Bundles a model and optimizer and computes the PPO clipped surrogate loss.

    Args:
        model: Module to optimise; it is moved to the module-level ``device``.
        optimizer: Optimizer stored alongside the model.
        clip_ratio: Epsilon of the clipping interval ``[1 - eps, 1 + eps]``.
    """

    def __init__(self, model, optimizer, clip_ratio=0.2):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.clip_ratio = clip_ratio

    def compute_ppo_loss(self, old_logits, new_logits, advantages, actions):
        """Return the negated mean of the clipped PPO surrogate objective.

        Args:
            old_logits: Logits of the old policy, indexed as (sample, action).
            new_logits: Logits of the current policy, same layout.
            advantages: One advantage value per sample.
            actions: Integer index of the action taken for each sample.

        Returns:
            Scalar tensor ``-mean(min(r * A, clip(r, 1 - eps, 1 + eps) * A))``
            where ``r`` is the new/old probability ratio of the taken action.
        """
        taken = actions.unsqueeze(1)

        # Work in log space: dividing raw probabilities yields 0 (not 1) when
        # both probabilities underflow, whereas the log-ratio stays exact.
        # squeeze(1) keeps the batch axis even for a batch of one sample.
        old_log_probs = torch.log_softmax(old_logits, dim=-1).gather(1, taken).squeeze(1)
        new_log_probs = torch.log_softmax(new_logits, dim=-1).gather(1, taken).squeeze(1)

        # The old policy is a constant; never let gradients flow through it.
        ratio = torch.exp(new_log_probs - old_log_probs.detach())

        # Calculate surrogate losses
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * advantages

        return -torch.min(surr1, surr2).mean()

def train_ppo(model, train_loader, val_loader, num_epochs=3):
    """Fine-tune ``model`` with a PPO-style clipped objective and log to TensorBoard.

    Every sample is treated as a one-step episode: the action is the class the
    model currently predicts (argmax), the reward is 1 when that class equals
    the label and 0 otherwise, and the advantage is the reward minus a constant
    0.5 baseline. After each epoch the accuracy on ``val_loader`` is measured.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns logits.
        train_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: Iterable of batches with the same keys, used for validation.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, updated in place by the optimizer.
    """
    optimizer = optim.AdamW(model.parameters(), lr=1e-5)
    trainer = PPOTrainer(model, optimizer)
    writer = SummaryWriter(f'runs/ppo_{datetime.now().strftime("%Y%m%d-%H%M%S")}')

    try:
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            correct = 0
            total = 0

            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
                batch = {k: v.to(device) for k, v in batch.items()}

                # "Old" policy: the same weights evaluated without gradients.
                # NOTE(review): the model is in train mode here, so these logits
                # presumably differ from new_logits only through dropout - confirm.
                with torch.no_grad():
                    old_logits = model(batch['input_ids'], batch['attention_mask'])

                # Current policy, with gradients
                new_logits = model(batch['input_ids'], batch['attention_mask'])

                # Reward the predicted class; 0.5 acts as a simple constant baseline
                predictions = torch.argmax(new_logits, dim=1)
                rewards = (predictions == batch['labels']).float()
                advantages = rewards - 0.5

                # The advantage belongs to the predicted class, so the ratio must
                # be taken for that same class. Using the label instead would pair
                # a negative advantage with the correct class and push its
                # probability down whenever the model is wrong.
                loss = trainer.compute_ppo_loss(
                    old_logits,
                    new_logits,
                    advantages,
                    predictions
                )

                # Optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                correct += (predictions == batch['labels']).sum().item()
                total += batch['labels'].size(0)

            # Validation
            model.eval()
            val_correct = 0
            val_total = 0

            with torch.no_grad():
                for batch in val_loader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(batch['input_ids'], batch['attention_mask'])
                    predictions = torch.argmax(outputs, dim=1)
                    val_correct += (predictions == batch['labels']).sum().item()
                    val_total += batch['labels'].size(0)

            train_accuracy = correct / total
            val_accuracy = val_correct / val_total

            print(f'Epoch {epoch+1}:')
            print(f'Training Loss: {total_loss/len(train_loader):.4f}')
            print(f'Training Accuracy: {train_accuracy:.4f}')
            print(f'Validation Accuracy: {val_accuracy:.4f}')

            writer.add_scalar('Loss/train', total_loss/len(train_loader), epoch)
            writer.add_scalar('Accuracy/train', train_accuracy, epoch)
            writer.add_scalar('Accuracy/val', val_accuracy, epoch)
    finally:
        # Flush pending events and release the log file even if training fails.
        writer.close()

    return model

# 6. DQN Training

In [6]:
class DQNTrainer:
    """Bundles a model and optimizer and computes a mean-squared TD loss.

    Args:
        model: Module to optimise; it is moved to the module-level ``device``.
        optimizer: Optimizer stored alongside the model.
        gamma: Discount factor applied to the bootstrapped value.
    """

    def __init__(self, model, optimizer, gamma=0.99):
        self.model = model.to(device)
        self.optimizer = optimizer
        self.gamma = gamma
        self.criterion = nn.MSELoss()

    def compute_dqn_loss(self, current_q_values, target_q_values, actions, rewards, dones=None):
        """Return the MSE between Q(s, a) and ``r + gamma * max_a' Q_target``.

        Args:
            current_q_values: Q-values of the current state, indexed as (sample, action).
            target_q_values: Q-values used for bootstrapping, same layout.
            actions: Integer index of the action taken for each sample.
            rewards: One reward per sample.
            dones: Optional per-sample flags (1/True = terminal). Where set, the
                bootstrapped term is dropped so the target is just the reward.
                ``None`` (the default) bootstraps for every sample.

        Returns:
            Scalar loss tensor.
        """
        current_q = current_q_values.gather(1, actions.unsqueeze(1))

        # The TD target is a constant: detach so no gradient reaches it even
        # when the caller did not compute it under torch.no_grad().
        bootstrap = target_q_values.detach().max(1)[0].unsqueeze(1)
        if dones is not None:
            bootstrap = bootstrap * (1.0 - dones.float().unsqueeze(1))

        target_q = rewards.unsqueeze(1) + self.gamma * bootstrap
        return self.criterion(current_q, target_q)

def train_dqn(model, train_loader, val_loader, num_epochs=3):
    """Fine-tune ``model`` by regressing Q-values onto rewards and log to TensorBoard.

    Every sample is treated as a one-step episode: the action is the class with
    the highest current Q-value, and the reward is 1 when that class equals the
    label and 0 otherwise. After each epoch the accuracy on ``val_loader`` is
    measured.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            one value per class.
        train_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: Iterable of batches with the same keys, used for validation.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, updated in place by the optimizer.
    """
    optimizer = optim.AdamW(model.parameters(), lr=1e-5)
    trainer = DQNTrainer(model, optimizer)
    writer = SummaryWriter(f'runs/dqn_{datetime.now().strftime("%Y%m%d-%H%M%S")}')

    try:
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            correct = 0
            total = 0

            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
                batch = {k: v.to(device) for k, v in batch.items()}

                # Get Q-values
                current_q_values = model(batch['input_ids'], batch['attention_mask'])

                # Greedy action and the reward it earns
                predictions = torch.argmax(current_q_values, dim=1)
                rewards = (predictions == batch['labels']).float()

                # Classifying a sentence ends the episode, so there is no next
                # state to bootstrap from: zero target Q-values make the TD
                # target equal to the reward. Bootstrapping from the same state
                # would drive the greedy Q-value towards r / (1 - gamma).
                terminal_q_values = torch.zeros_like(current_q_values)

                # The reward was earned by the predicted class, so that is the
                # Q-value to update - not the Q-value of the label.
                loss = trainer.compute_dqn_loss(
                    current_q_values,
                    terminal_q_values,
                    predictions,
                    rewards
                )

                # Optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                correct += (predictions == batch['labels']).sum().item()
                total += batch['labels'].size(0)

            # Validation
            model.eval()
            val_correct = 0
            val_total = 0

            with torch.no_grad():
                for batch in val_loader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(batch['input_ids'], batch['attention_mask'])
                    predictions = torch.argmax(outputs, dim=1)
                    val_correct += (predictions == batch['labels']).sum().item()
                    val_total += batch['labels'].size(0)

            train_accuracy = correct / total
            val_accuracy = val_correct / val_total

            print(f'Epoch {epoch+1}:')
            print(f'Training Loss: {total_loss/len(train_loader):.4f}')
            print(f'Training Accuracy: {train_accuracy:.4f}')
            print(f'Validation Accuracy: {val_accuracy:.4f}')

            writer.add_scalar('Loss/train', total_loss/len(train_loader), epoch)
            writer.add_scalar('Accuracy/train', train_accuracy, epoch)
            writer.add_scalar('Accuracy/val', val_accuracy, epoch)
    finally:
        # Flush pending events and release the log file even if training fails.
        writer.close()

    return model

# 7. Bắt đầu training

In [None]:
# Initialize models
ppo_model = PPOModel().to(device)
dqn_model = DQNModel().to(device)

# Train PPO and save its weights right away, so that a failure during the
# DQN run below cannot discard the finished PPO training.
print("Training PPO model...")
trained_ppo = train_ppo(ppo_model, train_dataloader, val_dataloader)
torch.save(trained_ppo.state_dict(), 'ppo_sst2.pth')

# Train DQN and save its weights
print("\nTraining DQN model...")
trained_dqn = train_dqn(dqn_model, train_dataloader, val_dataloader)
torch.save(trained_dqn.state_dict(), 'dqn_sst2.pth')

# 8. Model Testing

In [None]:
def predict_sentiment(model, text):
    """Classify one sentence as ``"Positive"`` or ``"Negative"``.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that returns
            logits indexed as (sample, class); it is switched to eval mode.
        text: The sentence to classify (a single string).

    Returns:
        ``"Positive"`` when the arg-max class is 1, otherwise ``"Negative"``.
    """
    model.eval()
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Pass only the tensors the model's forward() accepts. The tokenizer output
    # can carry extra keys (e.g. token_type_ids), and unpacking it with ** would
    # raise TypeError on forward(input_ids, attention_mask).
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        prediction = torch.argmax(outputs, dim=1)

    return "Positive" if prediction.item() == 1 else "Negative"

# Sentences used to spot-check both trained models
test_sentences = [
    "This movie was absolutely fantastic!",
    "I really didn't enjoy this film at all.",
    "The acting was decent but the plot was confusing."
]

# Same report for each model: a heading, then one prediction per sentence
for heading, fitted in (
    ("PPO Model Predictions:", trained_ppo),
    ("\nDQN Model Predictions:", trained_dqn),
):
    print(heading)
    for sentence in test_sentences:
        print(f"\nSentence: {sentence}")
        print(f"Prediction: {predict_sentiment(fitted, sentence)}")