In [29]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

# Seed the RNGs up front so weight initialisation, dropout masks and the
# shuffled DataLoader order are repeatable on a fresh "Restart & Run All".
# NOTE(review): some CUDA kernels are non-deterministic, so GPU runs may still
# differ slightly between executions.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [30]:
# Locations of the training and test CSV files consumed by load_data().
# NOTE(review): these are absolute paths on a local Windows drive, so the
# notebook only runs on a machine with the same directory layout — consider a
# configurable data directory instead.
train_path = r"D:\Shopee\Data_expl\train_vietnamese_students_feedback.csv"
test_path = r"D:\Shopee\Data_expl\test_vietnamese_students_feedback.csv"

def load_data(path, text_col="sentence", label_col="sentiment"):
    """Read a labelled-text CSV and return its texts and labels as lists.

    Args:
        path: CSV location, or any object accepted by ``pandas.read_csv``.
        text_col: Name of the column holding the input text.
        label_col: Name of the column holding the class label.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length Python lists, in file
        order.

    Raises:
        KeyError: If ``text_col`` or ``label_col`` is not a column of the file.
        ValueError: If either column contains missing values. Failing here
            gives a clear message instead of a later failure inside the
            tokenizer or the loss function.
    """
    data = pd.read_csv(path)

    absent = [col for col in (text_col, label_col) if col not in data.columns]
    if absent:
        raise KeyError(f"{path!r} is missing required column(s): {absent}")

    # Rows where the text or the label is empty cannot be used for training.
    incomplete_rows = int(data[[text_col, label_col]].isna().any(axis=1).sum())
    if incomplete_rows:
        raise ValueError(
            f"{path!r} has {incomplete_rows} row(s) with a missing "
            f"{text_col!r} or {label_col!r} value"
        )

    return data[text_col].tolist(), data[label_col].tolist()

# Load both splits; each call returns a (texts, labels) pair of lists.
train_texts, train_labels = load_data(train_path)
test_texts, test_labels = load_data(test_path)


In [31]:
# Tokenizer (PhoBERT)
# Loaded from the same "vinai/phobert-base" checkpoint name that the model uses.
# NOTE(review): the PhoBERT model card recommends word-segmented input text; no
# segmentation step is visible in this notebook — confirm the CSV sentences are
# already segmented.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [32]:
# Dataset
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one (text, label) pair per access.

    Each item is a dict with keys ``'input_ids'``, ``'attention_mask'`` and
    ``'label'``. Tokenization happens lazily in ``__getitem__``, so nothing is
    pre-computed at construction time.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        """Store the raw samples and the tokenization settings.

        Args:
            texts: Sequence of input strings.
            labels: Sequence of integer class ids, parallel to ``texts``.
            tokenizer: Callable accepting ``(text, padding=..., truncation=...,
                max_length=..., return_tensors=...)`` and returning a mapping
                with ``'input_ids'`` and ``'attention_mask'``.
            max_length: Length every sequence is padded/truncated to.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Catch misaligned inputs here rather than via a late IndexError (or
        # silently ignored extra labels) in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading dimension squeezed away) and ``'label'``
            (a ``torch.long`` scalar tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # squeeze(0) drops the leading dimension of the single-text encoding so
        # the DataLoader can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
        }


In [33]:
# Build the datasets and wrap them in DataLoaders.
# Only the training loader is shuffled; the test loader keeps file order.
train_dataset = SentimentDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=128,
)
test_dataset = SentimentDataset(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
    max_length=128,
)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)


In [34]:
# Model
class SentimentClassifier(nn.Module):
    """Pretrained transformer encoder followed by dropout and a linear head.

    The classification head reads the last-layer hidden state of the first
    token of each sequence.
    """

    def __init__(self, pretrained_model_name, num_classes, dropout_rate=0.3):
        """Load the encoder and create the classification head.

        Args:
            pretrained_model_name: Name or path passed to
                ``AutoModel.from_pretrained``.
            num_classes: Number of output logits.
            dropout_rate: Dropout probability applied to the pooled
                representation before the linear layer (default 0.3).
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Logits tensor whose last dimension has ``num_classes`` entries.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 along the sequence axis is used as the sentence summary.
        cls_output = outputs.last_hidden_state[:, 0, :]
        return self.fc(self.dropout(cls_output))


In [35]:
# Instantiate the classifier and place it on the selected device
num_classes = 3  # Negative, Neutral, Positive
model = SentimentClassifier(
    pretrained_model_name="vinai/phobert-base",
    num_classes=num_classes,
).to(device)


In [36]:
# Loss function and optimizer (every model parameter is fine-tuned)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=3e-5)

In [37]:
# TensorBoard writer used as a module-level global by train_epoch() and
# eval_model(); it is constructed with the "runs/sentiment_analysis" directory.
writer = SummaryWriter("runs/sentiment_analysis")

In [38]:
# Training function
def train_epoch(model, data_loader, optimizer, criterion, device, epoch, summary_writer=None):
    """Run one optimisation pass over ``data_loader`` and log epoch metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss callable taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.
        epoch: Zero-based epoch index (shown as ``epoch + 1`` in the progress
            bar and used as the TensorBoard step).
        summary_writer: Object exposing ``add_scalar(tag, value, step)``.
            Defaults to the module-level ``writer``.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over batches as a Python
        float, and the fraction of correctly classified samples as a 0-dim
        double tensor.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        # Without this guard the epoch ends in an opaque AttributeError /
        # ZeroDivisionError when the metrics are computed.
        raise ValueError("train_epoch received an empty data_loader")

    log_writer = writer if summary_writer is None else summary_writer

    model.train()
    total_loss = 0.0
    samples_seen = 0
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    loop = tqdm(data_loader, desc=f"Epoch {epoch + 1}")
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        preds = outputs.argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        samples_seen += labels.size(0)

        # Show the latest batch loss in the progress bar
        loop.set_postfix(loss=batch_loss)

    mean_loss = total_loss / num_batches
    # Divide by the samples actually processed so the figure stays correct
    # when the loader skips data (e.g. drop_last=True or a subset sampler).
    accuracy = correct_predictions.double() / samples_seen
    log_writer.add_scalar("Train/Loss", mean_loss, epoch)
    log_writer.add_scalar("Train/Accuracy", accuracy, epoch)

    return mean_loss, accuracy


In [39]:
# Evaluation function
def eval_model(model, data_loader, criterion, device, epoch, summary_writer=None):
    """Evaluate ``model`` on ``data_loader`` without gradients and log metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches, each a dict with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        criterion: Loss callable taking ``(logits, labels)``.
        device: Device the batch tensors are moved to.
        epoch: Zero-based epoch index, used as the TensorBoard step.
        summary_writer: Object exposing ``add_scalar(tag, value, step)``.
            Defaults to the module-level ``writer``.

    Returns:
        ``(mean_loss, accuracy, all_preds, all_labels)``: the loss averaged
        over batches (float), the fraction of correct predictions (0-dim
        double tensor), and two flat lists with the predicted and true class
        id of every evaluated sample, in loader order.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        # Without this guard the function ends in an opaque AttributeError /
        # ZeroDivisionError when the metrics are computed.
        raise ValueError("eval_model received an empty data_loader")

    log_writer = writer if summary_writer is None else summary_writer

    model.eval()
    total_loss = 0.0
    samples_seen = 0
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            total_loss += criterion(outputs, labels).item()

            preds = outputs.argmax(dim=1)
            correct_predictions += (preds == labels).sum()
            samples_seen += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    mean_loss = total_loss / num_batches
    # Divide by the samples actually evaluated so the figure stays correct
    # when the loader skips data (e.g. drop_last=True or a subset sampler).
    accuracy = correct_predictions.double() / samples_seen
    log_writer.add_scalar("Validation/Loss", mean_loss, epoch)
    log_writer.add_scalar("Validation/Accuracy", accuracy, epoch)

    return mean_loss, accuracy, all_preds, all_labels

In [40]:
# Plot and save the confusion matrix
def plot_confusion_matrix(y_true, y_pred, classes, epoch):
    """Render the confusion matrix as a heatmap and save it as a PNG.

    Args:
        y_true: Ground-truth class ids, expected to be integers in
            ``range(len(classes))``.
        y_pred: Predicted class ids, same convention as ``y_true``.
        classes: Display names, ordered by class id.
        epoch: Zero-based epoch index; ``epoch + 1`` appears in the title and
            in the output file name.

    The figure is written to ``runs/confusion_matrix_epoch_<epoch + 1>.png``;
    the ``runs`` directory is created when it does not exist yet.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    # Pin the label set so the matrix is always len(classes) x len(classes).
    # Otherwise a class absent from both y_true and y_pred shrinks the matrix
    # and the tick labels no longer line up with the rows/columns.
    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(classes))))

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix - Epoch {epoch + 1}")

    output_path = f"runs/confusion_matrix_epoch_{epoch + 1}.png"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    try:
        fig.savefig(output_path)
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)

In [41]:
# Train the model
epochs = 10
class_names = ["Negative", "Neutral", "Positive"]

# NOTE(review): test_loader is passed to eval_model, so the "Validation/*"
# metrics are measured on the test split. Consider carving a separate
# validation split out of the training data. The recorded run also shows
# validation loss lowest at epoch 2 and rising afterwards, which suggests
# fewer epochs or early stopping.
try:
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, device, epoch)
        val_loss, val_acc, val_preds, val_labels = eval_model(model, test_loader, criterion, device, epoch)

        print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")

        # Plot the confusion matrix for this epoch
        plot_confusion_matrix(val_labels, val_preds, class_names, epoch)

    # Save the weights of the final epoch
    model_save_path = "sentiment_classifier.pth"
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")
finally:
    # Close TensorBoard even if training is interrupted, so the events
    # logged so far are flushed to disk.
    writer.close()

Epoch 1/10


Epoch 1: 100%|██████████| 358/358 [01:04<00:00,  5.53it/s, loss=0.00933]


Train Loss: 0.2915, Train Accuracy: 0.8987
Val Loss: 0.2623, Val Accuracy: 0.9223
Epoch 2/10


Epoch 2: 100%|██████████| 358/358 [01:04<00:00,  5.57it/s, loss=0.146] 


Train Loss: 0.1681, Train Accuracy: 0.9455
Val Loss: 0.2397, Val Accuracy: 0.9286
Epoch 3/10


Epoch 3: 100%|██████████| 358/358 [01:04<00:00,  5.52it/s, loss=0.00885]


Train Loss: 0.1215, Train Accuracy: 0.9630
Val Loss: 0.2439, Val Accuracy: 0.9302
Epoch 4/10


Epoch 4: 100%|██████████| 358/358 [01:04<00:00,  5.56it/s, loss=0.417]  


Train Loss: 0.0897, Train Accuracy: 0.9738
Val Loss: 0.2883, Val Accuracy: 0.9264
Epoch 5/10


Epoch 5: 100%|██████████| 358/358 [01:04<00:00,  5.57it/s, loss=0.00636]


Train Loss: 0.0819, Train Accuracy: 0.9768
Val Loss: 0.2895, Val Accuracy: 0.9289
Epoch 6/10


Epoch 6: 100%|██████████| 358/358 [01:03<00:00,  5.62it/s, loss=0.0128] 


Train Loss: 0.0622, Train Accuracy: 0.9828
Val Loss: 0.2994, Val Accuracy: 0.9283
Epoch 7/10


Epoch 7: 100%|██████████| 358/358 [01:04<00:00,  5.59it/s, loss=0.00216]


Train Loss: 0.0425, Train Accuracy: 0.9892
Val Loss: 0.3520, Val Accuracy: 0.9277
Epoch 8/10


Epoch 8: 100%|██████████| 358/358 [01:03<00:00,  5.61it/s, loss=0.00386]


Train Loss: 0.0491, Train Accuracy: 0.9869
Val Loss: 0.3173, Val Accuracy: 0.9296
Epoch 9/10


Epoch 9: 100%|██████████| 358/358 [01:03<00:00,  5.60it/s, loss=0.00381]


Train Loss: 0.0384, Train Accuracy: 0.9892
Val Loss: 0.3297, Val Accuracy: 0.9283
Epoch 10/10


Epoch 10: 100%|██████████| 358/358 [01:03<00:00,  5.62it/s, loss=2.58]   


Train Loss: 0.0410, Train Accuracy: 0.9905
Val Loss: 0.3564, Val Accuracy: 0.9296
Model saved to sentiment_classifier.pth
