In [1]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
import json

# Kiểm tra GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
# Đường dẫn model và dataset
model_path = r"D:\Shopee\Data_expl\fine_tuned_sentiment_classifier.pth"
new_dataset_path = r"D:\Shopee\Data_expl\new_sentiment_dataset_1.json"

In [3]:
# Tokenizer (PhoBERT)
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [4]:
# Mô hình
class SentimentClassifier(nn.Module):
    def __init__(self, pretrained_model_name, num_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.dropout(cls_output)
        logits = self.fc(cls_output)
        return logits

In [5]:
# Khởi tạo mô hình và tải trọng số
num_classes = 3  # Negative, Neutral, Positive
model = SentimentClassifier(pretrained_model_name="vinai/phobert-base", num_classes=num_classes)
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)

  model.load_state_dict(torch.load(model_path, map_location=device))


In [6]:
# Hàm dự đoán cảm xúc
def predict_sentiment(text, model, tokenizer, device):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, prediction = torch.max(outputs, dim=1)
    return prediction.item()

In [7]:
# Dataset cho fine-tuning
class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item['text']
        label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
        label = label_map[item['correct_label']]
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [8]:
# Hàm huấn luyện
def train_epoch(model, data_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct_predictions = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)

    accuracy = correct_predictions.double() / len(data_loader.dataset)
    return total_loss / len(data_loader), accuracy

In [9]:
# Hàm đánh giá
def eval_model(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct_predictions.double() / len(data_loader.dataset)
    return total_loss / len(data_loader), accuracy, all_preds, all_labels

In [10]:
# Tải dữ liệu từ file JSON
with open(new_dataset_path, 'r', encoding='utf-8') as f:
    new_data = json.load(f)

# Tạo DataLoader
fine_tune_dataset = SentimentDataset(new_data, tokenizer, max_length=128)
fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=16, shuffle=True)

In [None]:
# Huấn luyện tiếp tục
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs = 5

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss, train_acc = train_epoch(model, fine_tune_loader, optimizer, criterion, device)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

# Lưu lại mô hình sau khi fine-tune
fine_tuned_model_path = r"D:\Shopee\Data_expl\fine_tuned_sentiment_classifier_1.pth"
torch.save(model.state_dict(), fine_tuned_model_path)
print(f"Fine-tuned model saved to {fine_tuned_model_path}")


Epoch 1/5
Train Loss: 1.8505, Train Accuracy: 0.6190
Epoch 2/5
Train Loss: 1.2279, Train Accuracy: 0.7381
Epoch 3/5
Train Loss: 0.6607, Train Accuracy: 0.7619
Epoch 4/5
Train Loss: 0.2405, Train Accuracy: 0.8810
Epoch 5/5
Train Loss: 0.1810, Train Accuracy: 0.9048
Fine-tuned model saved to D:\Shopee\Data_expl\fine_tuned_sentiment_classifier.pth
