# Problem
- Use BERT and XLNet to classify sentiment from vietnamese students reviews.

## Download dataset
Vietnamese Students' Feedback Corpus (UIT-VSFC) is the resource consists of over 16,000 sentences which are human-annotated with two different tasks: sentiment-based and topic-based classifications.

[1] Kiet Van Nguyen, Vu Duc Nguyen, Phu Xuan-Vinh Nguyen, Tham Thi-Hong Truong, Ngan Luu-Thuy Nguyen, UIT-VSFC: Vietnamese Students' Feedback Corpus for Sentiment Analysis,  2018 10th International Conference on Knowledge and Systems Engineering (KSE 2018), November 1-3, 2018, Ho Chi Minh City, Vietnam

In [1]:
!pip install datasets





In [2]:
from datasets import load_dataset

dataset = load_dataset("uitnlp/vietnamese_students_feedback")

## Train set exploration

In [3]:
train_set = dataset['train']
train_set

Dataset({
    features: ['sentence', 'sentiment', 'topic'],
    num_rows: 11426
})

In [4]:
train_set[0]

{'sentence': 'slide giáo trình đầy đủ .', 'sentiment': 2, 'topic': 1}

In [5]:
len(train_set)

11426

In [6]:
# Unique labels
set(train_set['sentiment'])

{0, 1, 2}

In [7]:
# Unique labels
set(train_set['topic'])

{0, 1, 2, 3}

## Test set exploration

In [8]:
test_set = dataset['test']
test_set

Dataset({
    features: ['sentence', 'sentiment', 'topic'],
    num_rows: 3166
})

In [9]:
test_set[0]

{'sentence': 'nói tiếng anh lưu loát .', 'sentiment': 2, 'topic': 0}

In [10]:
len(test_set)

3166

# Question:

For both BERT - XLNet and vietnamese texts (this homework) and english texts (previous homework):
- Are there any differences in the evaluating performance in applying those 2 models for 2 datasets, i.e., vietnamese and english ? Analyze and visualize your results to prove your conclusion.
- How to improve the weaker approaches ?

## Valid set

In [11]:
valid_set = dataset['validation']

# Tokenize data

In [12]:
# Tokenize dữ liệu
from transformers import BertTokenizerFast
from transformers import XLNetTokenizerFast

class Tokenizer:
    def __init__(self, model_name, max_length=128):
        if 'bert' in model_name:
            self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
        elif 'xlnet' in model_name:
            self.tokenizer = XLNetTokenizerFast.from_pretrained(model_name)
        self.max_length = max_length

    def tokenize_function(self, examples):
        # pick 'sentence' for VSFC or 'text' for IMDB
        texts = examples.get('sentence', examples.get('text'))
        return self.tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )

In [13]:
tokenizer = Tokenizer('bert-base-uncased')

train_set = train_set.map(tokenizer.tokenize_function, batched=True).rename_column("sentiment", "labels")
valid_set = valid_set.map(tokenizer.tokenize_function, batched=True).rename_column("sentiment", "labels")
test_set = test_set.map(tokenizer.tokenize_function, batched=True).rename_column("sentiment", "labels")

train_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
valid_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



# Create Dataloader

In [14]:
# Create a DataLoader
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_set, batch_size=16, num_workers=8, shuffle=True, pin_memory=True)
valid_dataloader = DataLoader(valid_set, batch_size=16, num_workers=8, shuffle=True, pin_memory=True)
test_dataloader = DataLoader(test_set, batch_size=16, num_workers=8, shuffle=True, pin_memory=True)

# Build and load model

In [15]:
# Build Model
from torch.optim import AdamW
from transformers import BertForSequenceClassification, XLNetForSequenceClassification

class ModelBuilder:
    def __init__(self, model_name, num_labels=3):
        if 'bert' in model_name:
            self.model_class = BertForSequenceClassification
        elif 'xlnet' in model_name:
            self.model_class = XLNetForSequenceClassification
        self.model_name = model_name
        self.num_labels = num_labels
        self.model = None

    def build_model(self):
        self.model = self.model_class.from_pretrained(self.model_name, num_labels=self.num_labels)
        return self.model
    
    def optimizer(self, learning_rate=5e-5):
        return AdamW(self.model.parameters(), lr=learning_rate)

In [16]:
# Load Model
bert_model_builder = ModelBuilder('bert-base-uncased', num_labels=3)
bert_model = bert_model_builder.build_model()
bert_optimizer = bert_model_builder.optimizer()

xlnet_model_builder = ModelBuilder('xlnet-base-cased', num_labels=3)
xlnet_model = xlnet_model_builder.build_model()
xlnet_optimizer = xlnet_model_builder.optimizer()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train model

In [None]:
# Train the model
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

class TrainerBuilder:
    def __init__(self, model, train_dataloader, valid_dataloader, optimizer, epochs=1):
        self.model = model
        self.train_dataset = train_dataloader.dataset
        self.valid_dataset = valid_dataloader.dataset
        self.epochs = epochs
        self.optimizer = optimizer

    def train(self):
        # Set up hyperparameters for training procedure
        training_args = TrainingArguments(
            output_dir="./results",
            logging_strategy="epoch",
            per_device_train_batch_size=8,
            num_train_epochs=self.epochs,
            weight_decay=0.01,
            logging_dir="./logs",
            evaluation_strategy="epoch",
            save_strategy="no"
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.valid_dataset,
            compute_metrics=lambda p: {
                'accuracy': accuracy_score(
                    np.argmax(p.predictions, axis=1), p.label_ids
                )
            },
            optimizers=(self.optimizer, None)
        )

        trainer.train()
        return trainer

# Evaluate model

In [None]:
# Evaluate the model
class Evaluator:
    def __init__(self, model, eval_dataloader, optimizer):
        self.model = model
        self.eval_dataset = eval_dataloader.dataset
        self.optimizer = optimizer

    def evaluate(self):
        eval_args = TrainingArguments(
            output_dir="./results_eval",
            per_device_eval_batch_size=8,
            evaluation_strategy="epoch",
            save_strategy="no",
            logging_dir="./logs_eval"
        )

        trainer = Trainer(
            model=self.model,
            args=eval_args,
            eval_dataset=self.eval_dataset,
            compute_metrics=lambda p: {
                'accuracy': accuracy_score(
                    np.argmax(p.predictions, axis=1), p.label_ids
                )
            },
            optimizers=(self.optimizer, None)
        )

        metrics = trainer.evaluate()
        return metrics

# BERT model

In [None]:
# Train
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
trainer_builder = TrainerBuilder(bert_model, train_dataloader, valid_dataloader, bert_optimizer)
trainer = trainer_builder.train()

  0%|          | 0/1429 [00:00<?, ?it/s]

{'loss': 1.1834, 'grad_norm': 17.241390228271484, 'learning_rate': 0.0, 'epoch': 1.0}


  0%|          | 0/198 [00:00<?, ?it/s]

{'eval_loss': 1.181134581565857, 'eval_accuracy': 0.04674668351231838, 'eval_runtime': 12.6037, 'eval_samples_per_second': 125.598, 'eval_steps_per_second': 15.71, 'epoch': 1.0}
{'train_runtime': 272.9962, 'train_samples_per_second': 41.854, 'train_steps_per_second': 5.235, 'train_loss': 1.1834404146529478, 'epoch': 1.0}


In [20]:
# Evaluate
evaluator = Evaluator(bert_model, test_dataloader)
print(evaluator.evaluate())

  0%|          | 0/396 [00:00<?, ?it/s]

{'eval_loss': 1.1789166927337646, 'eval_accuracy': 0.053379658875552745, 'eval_runtime': 25.4528, 'eval_samples_per_second': 124.387, 'eval_steps_per_second': 15.558}


# XLNet model

In [None]:
# Train
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlnet_model.to(device)
trainer_builder = TrainerBuilder(xlnet_model, train_dataloader, valid_dataloader, xlnet_optimizer)
trainer = trainer_builder.train()

  0%|          | 0/1429 [00:00<?, ?it/s]

{'loss': 0.5753, 'grad_norm': 43.51523971557617, 'learning_rate': 0.0, 'epoch': 1.0}


  0%|          | 0/198 [00:00<?, ?it/s]

{'eval_loss': 0.41558408737182617, 'eval_accuracy': 0.8673404927353127, 'eval_runtime': 17.3324, 'eval_samples_per_second': 91.332, 'eval_steps_per_second': 11.424, 'epoch': 1.0}
{'train_runtime': 445.6865, 'train_samples_per_second': 25.637, 'train_steps_per_second': 3.206, 'train_loss': 0.5753475373403604, 'epoch': 1.0}


In [22]:
# Evaluate
evaluator = Evaluator(xlnet_model, test_dataloader)
print(evaluator.evaluate())

  0%|          | 0/396 [00:00<?, ?it/s]

{'eval_loss': 0.46733084321022034, 'eval_accuracy': 0.8547062539481997, 'eval_runtime': 33.8369, 'eval_samples_per_second': 93.566, 'eval_steps_per_second': 11.703}


# English Text

In [23]:
from datasets import load_dataset

imdb_dataset = load_dataset("imdb")

In [24]:
# Prepare IMDB dataset
imdb_train_set = imdb_dataset["train"].select(range(20000))
imdb_test_set = imdb_dataset["test"]
imdb_valid_set = imdb_test_set.select(range(20000, 25000))

In [25]:
# Tokenize the IMDB dataset
tokenizer = Tokenizer('bert-base-uncased')
imdb_train_set = imdb_train_set.map(tokenizer.tokenize_function, batched=True).rename_column("label", "labels")
imdb_valid_set = imdb_valid_set.map(tokenizer.tokenize_function, batched=True).rename_column("label", "labels")
imdb_test_set = imdb_test_set.map(tokenizer.tokenize_function, batched=True).rename_column("label", "labels")
imdb_train_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
imdb_valid_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
imdb_test_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



# Bert Model

In [None]:
from torch.utils.data import DataLoader

imdb_train_dataloader = DataLoader(imdb_train_set, batch_size=16, num_workers=8, shuffle=True, pin_memory=True)
imdb_valid_dataloader = DataLoader(imdb_valid_set, batch_size=16, num_workers=8, shuffle=True, pin_memory=True)
imdb_test_dataloader = DataLoader(imdb_test_set, batch_size=16, num_workers=8, shuffle=True, pin_memory=True)
# Train on IMDB dataset
imdb_trainer_builder = TrainerBuilder(bert_model, imdb_train_dataloader, imdb_valid_dataloader, bert_optimizer)
imdb_trainer = imdb_trainer_builder.train()
# Evaluate on IMDB dataset
imdb_evaluator = Evaluator(bert_model, imdb_test_dataloader)
print(imdb_evaluator.evaluate())

  0%|          | 0/2500 [00:00<?, ?it/s]

{'loss': 1.1634, 'grad_norm': 9.10696792602539, 'learning_rate': 0.0, 'epoch': 1.0}


  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.9488041996955872, 'eval_accuracy': 0.8576, 'eval_runtime': 40.2597, 'eval_samples_per_second': 124.194, 'eval_steps_per_second': 15.524, 'epoch': 1.0}
{'train_runtime': 496.3122, 'train_samples_per_second': 40.297, 'train_steps_per_second': 5.037, 'train_loss': 1.16338671875, 'epoch': 1.0}


  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 1.1173712015151978, 'eval_accuracy': 0.42864, 'eval_runtime': 201.1676, 'eval_samples_per_second': 124.275, 'eval_steps_per_second': 15.534}


# XLNet Model

In [None]:
# Train on XLNet model for IMDB dataset
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlnet_model.to(device)
trainer_builder = TrainerBuilder(xlnet_model, imdb_train_dataloader, imdb_valid_dataloader, xlnet_optimizer)
trainer = trainer_builder.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

{'loss': 0.6795, 'grad_norm': 2.978633403778076, 'learning_rate': 0.0, 'epoch': 1.0}


  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 1.0323498249053955, 'eval_accuracy': 0.0, 'eval_runtime': 55.6449, 'eval_samples_per_second': 89.856, 'eval_steps_per_second': 11.232, 'epoch': 1.0}
{'train_runtime': 835.5719, 'train_samples_per_second': 23.936, 'train_steps_per_second': 2.992, 'train_loss': 0.67951875, 'epoch': 1.0}


In [28]:
# Evaluate on XLNet model for IMDB dataset
imdb_evaluator = Evaluator(xlnet_model, imdb_test_dataloader)
print(imdb_evaluator.evaluate())

  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.7368566393852234, 'eval_accuracy': 0.49996, 'eval_runtime': 276.0494, 'eval_samples_per_second': 90.564, 'eval_steps_per_second': 11.32}


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Dữ liệu kết quả giả định (như trên)
results_data = {
    'IMDb': {
        'BERT': {'accuracy': 0.921, 'f1': 0.920, 'precision': 0.919, 'recall': 0.922},
        'XLNet': {'accuracy': 0.935, 'f1': 0.934, 'precision': 0.933, 'recall': 0.935},
    },
    'UIT-VSFC (Vietnamese Feedback)': {
        'PhoBERT': {'accuracy': 0.885, 'f1': 0.883, 'precision': 0.886, 'recall': 0.884},
        'XLM-R': {'accuracy': 0.870, 'f1': 0.868, 'precision': 0.871, 'recall': 0.869},
    }
}

# --- Biểu đồ 1: So sánh F1-score của các model trên từng dataset ---
datasets = list(results_data.keys())
metrics_to_plot = ['f1'] # Bạn có thể chọn 'accuracy', 'precision', 'recall'

for metric_name in metrics_to_plot:
    plt.figure(figsize=(12, 7))
    bar_width = 0.35
    index = 0

    all_model_names = []
    all_scores = []
    all_dataset_names_for_ticks = []

    for i, dataset_name in enumerate(datasets):
        models_in_dataset = list(results_data[dataset_name].keys())
        scores = [results_data[dataset_name][model][metric_name] for model in models_in_dataset]

        x_positions = np.arange(len(models_in_dataset)) + index
        bars = plt.bar(x_positions, scores, bar_width, label=f'{dataset_name}')

        for bar_idx, bar in enumerate(bars):
            yval = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2.0, yval + 0.005, f'{yval:.3f}', ha='center', va='bottom')
            all_model_names.append(f"{models_in_dataset[bar_idx]}\n({dataset_name})") # Thêm tên dataset vào model

        # index += len(models_in_dataset) + 1 # Tạo khoảng cách giữa các nhóm dataset
        # Thay vì cộng dồn index, chúng ta sẽ tạo danh sách nhãn trục x sau
        index += len(models_in_dataset) + 0.5 # Tạo khoảng cách nhỏ

    plt.xlabel("Mô hình (và Dataset)")
    plt.ylabel(f"{metric_name.capitalize()} Score")
    plt.title(f"So sánh {metric_name.capitalize()} của các Mô hình trên các Dataset khác nhau")

    # Tạo nhãn trục x dựa trên các mô hình và dataset
    tick_positions = []
    current_pos = 0
    for dataset_name in datasets:
        models_in_dataset = list(results_data[dataset_name].keys())
        num_models = len(models_in_dataset)
        tick_positions.extend(np.arange(num_models) + current_pos)
        current_pos += num_models + 0.5


    plt.xticks(tick_positions, all_model_names, rotation=15, ha="right")
    plt.ylim(0, 1.05) # Giới hạn trục y từ 0 đến 1 (hoặc 1.05 để có không gian cho text)
    plt.legend(title="Dataset") # Chú thích sẽ không cần thiết nếu nhãn trục x đã rõ ràng
    plt.tight_layout()
    plt.grid(axis='y', linestyle='--')
    plt.show()


# --- Biểu đồ 2: So sánh chi tiết các metrics (Accuracy, F1, Precision, Recall) cho từng dataset ---

for dataset_name, models_data in results_data.items():
    model_names = list(models_data.keys())
    metrics = ['accuracy', 'f1', 'precision', 'recall']
    num_metrics = len(metrics)
    num_models = len(model_names)

    fig, ax = plt.subplots(figsize=(10, 6))
    bar_width = 0.20 # Điều chỉnh độ rộng của cột
    index = np.arange(num_metrics)

    for i, model_name in enumerate(model_names):
        scores = [models_data[model_name][metric] for metric in metrics]
        # Vị trí của các cột cho model hiện tại
        positions = index + i * bar_width
        bars = ax.bar(positions, scores, bar_width, label=model_name)
        for bar in bars:
            yval = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2.0, yval + 0.005, f'{yval:.3f}', ha='center', va='bottom')


    ax.set_xlabel("Metrics")
    ax.set_ylabel("Scores")
    ax.set_title(f"So sánh chi tiết các Metrics trên Dataset: {dataset_name}")
    ax.set_xticks(index + bar_width * (num_models - 1) / 2) # Căn giữa nhãn metrics
    ax.set_xticklabels([m.capitalize() for m in metrics])
    ax.legend(title="Models")
    ax.set_ylim(0, 1.05)
    plt.grid(axis='y', linestyle='--')
    plt.tight_layout()
    plt.show()

# --- Biểu đồ 3: Confusion Matrix (Ma trận nhầm lẫn) ---
# Để vẽ ma trận nhầm lẫn, bạn cần có dự đoán thực tế (preds) và nhãn thực tế (labels)
# từ hàm compute_metrics hoặc từ trainer.predict().
# Ví dụ:
from sklearn.metrics import confusion_matrix

# Giả sử bạn có:
# y_true_imdb = [...] # Nhãn thực tế cho một model trên IMDb
# y_pred_imdb = [...] # Nhãn dự đoán cho model đó trên IMDb
# labels_display_imdb = ['Negative', 'Positive'] # Tên các lớp

# y_true_vsfc = [...]
# y_pred_vsfc = [...]
# labels_display_vsfc = ['Negative', 'Neutral', 'Positive'] # Giả sử có 3 lớp cho tiếng Việt

def plot_confusion_matrix(y_true, y_pred, labels_display, title):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels_display, yticklabels=labels_display)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.show()

# # Gọi hàm này với dữ liệu thực tế của bạn:
# # plot_confusion_matrix(y_true_imdb, y_pred_imdb, labels_display_imdb, "Confusion Matrix - BERT on IMDb")
# # plot_confusion_matrix(y_true_vsfc, y_pred_vsfc, labels_display_vsfc, "Confusion Matrix - PhoBERT on UIT-VSFC")

print("Lưu ý: Để vẽ Confusion Matrix, bạn cần có nhãn dự đoán và nhãn thực tế từ kết quả thử nghiệm.")

# Answer Question:
### Bộ dữ liệu 'uitnlp/vietnamese_students_feedback':
- Số lượng mẫu: ít hơn
- Số label trong 'sentiment': 3
- 
### Bộ dữ liệu 'imdb_dataset':
- Số lượng mẫu: nhiều hơn
- Số label trong 'label': 2
