# Problem
- Use BERT and XLNet to classify the sentiment of Vietnamese students' feedback.

## Download dataset
The Vietnamese Students' Feedback Corpus (UIT-VSFC) is a resource consisting of over 16,000 sentences that are human-annotated for two different tasks: sentiment-based and topic-based classification.

[1] Kiet Van Nguyen, Vu Duc Nguyen, Phu Xuan-Vinh Nguyen, Tham Thi-Hong Truong, Ngan Luu-Thuy Nguyen, UIT-VSFC: Vietnamese Students' Feedback Corpus for Sentiment Analysis,  2018 10th International Conference on Knowledge and Systems Engineering (KSE 2018), November 1-3, 2018, Ho Chi Minh City, Vietnam

In [1]:
!pip install datasets





In [2]:
from datasets import load_dataset

# Load the UIT-VSFC corpus by its Hugging Face Hub id; later cells read its
# 'train', 'test' and 'validation' splits from this object.
dataset = load_dataset("uitnlp/vietnamese_students_feedback")

## Train set exploration

In [None]:
# Keep a handle on the training split; the bare name on the last line displays it.
train_set = dataset['train']
train_set

Dataset({
    features: ['sentence', 'sentiment', 'topic'],
    num_rows: 3500
})

In [4]:
# Inspect the first training example (sentence, sentiment and topic fields).
train_set[0]

{'sentence': 'slide giáo trình đầy đủ .', 'sentiment': 2, 'topic': 1}

In [5]:
# Number of examples in the training split.
len(train_set)

3500

In [6]:
# Unique sentiment labels present in the training split
set(train_set['sentiment'])

{0, 1, 2}

In [7]:
# Unique topic labels present in the training split
set(train_set['topic'])

{0, 1, 2, 3}

## Test set exploration

In [8]:
# Keep a handle on the test split; the bare name on the last line displays it.
test_set = dataset['test']
test_set

Dataset({
    features: ['sentence', 'sentiment', 'topic'],
    num_rows: 3166
})

In [9]:
# Inspect the first test example (sentence, sentiment and topic fields).
test_set[0]

{'sentence': 'nói tiếng anh lưu loát .', 'sentiment': 2, 'topic': 0}

In [10]:
# Number of examples in the test split.
len(test_set)

3166

# Question:

For both BERT and XLNet, on both the Vietnamese texts (this homework) and the English texts (previous homework):
- Are there any differences in evaluation performance when applying these two models to the two datasets, i.e., Vietnamese and English? Analyze and visualize your results to support your conclusion.
- How can the weaker approaches be improved?

## Valid set

In [11]:
# Keep a handle on the validation split.
valid_set = dataset['validation']

# Tokenize data

In [None]:
# Tokenize the data
from transformers import BertTokenizerFast
from transformers import XLNetTokenizerFast

class Tokenizer:
    """Wrap the Hugging Face fast tokenizer that matches a checkpoint name.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``. The
            tokenizer class is picked by substring: names containing ``'bert'``
            use ``BertTokenizerFast``; otherwise names containing ``'xlnet'``
            use ``XLNetTokenizerFast``.
        max_length: Length every sentence is truncated / padded to.

    Raises:
        ValueError: If ``model_name`` contains neither ``'bert'`` nor
            ``'xlnet'``. Without this check the instance would be created
            without a ``tokenizer`` attribute and only fail later, inside
            ``tokenize_function``.
    """

    def __init__(self, model_name, max_length=128):
        if 'bert' in model_name:
            tokenizer_class = BertTokenizerFast
        elif 'xlnet' in model_name:
            tokenizer_class = XLNetTokenizerFast
        else:
            raise ValueError(
                f"Unsupported model name {model_name!r}: "
                "expected a name containing 'bert' or 'xlnet'."
            )
        self.tokenizer = tokenizer_class.from_pretrained(model_name)
        self.max_length = max_length

    def tokenize_function(self, examples):
        """Tokenize the ``"sentence"`` field of a (batched) example dict.

        Args:
            examples: Mapping with a ``"sentence"`` entry, as supplied by
                ``Dataset.map``.

        Returns:
            Whatever the wrapped tokenizer returns for those sentences, with
            truncation enabled and padding to ``self.max_length``.
        """
        return self.tokenizer(
            examples["sentence"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )

# NOTE(review): 'bert-base-uncased' is an English, lower-casing checkpoint; its
# tokenizer presumably strips the diacritics that Vietnamese relies on. Consider
# a multilingual/cased checkpoint, but keep it identical to the checkpoint the
# model is built from.
tokenizer = Tokenizer('bert-base-uncased')


def _tokenize_split(split_name):
    """Return the tokenized, torch-formatted version of one raw dataset split.

    The split is always read from ``dataset`` rather than from an already
    processed variable, so re-running this cell does not fail on the
    ``sentiment`` -> ``labels`` rename having been applied already.
    """
    prepared = (
        dataset[split_name]
        .map(tokenizer.tokenize_function, batched=True)
        .rename_column("sentiment", "labels")
    )
    # Expose only the tensors the model consumes.
    prepared.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return prepared


train_set = _tokenize_split('train')
valid_set = _tokenize_split('validation')
test_set = _tokenize_split('test')



# Create Dataloader

In [13]:
# Create a DataLoader
from torch.utils.data import DataLoader

# Only the training data is shuffled; validation and test keep their original
# order so evaluation outputs line up with the dataset rows and are reproducible.
train_dataloader = DataLoader(train_set, batch_size=128, num_workers=8, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=128, num_workers=8, shuffle=False)
test_dataloader = DataLoader(test_set, batch_size=128, num_workers=8, shuffle=False)

# Build and load model

In [14]:
# Build or Load Model
from torch.optim import AdamW
from transformers import BertForSequenceClassification, XLNetForSequenceClassification

class ModelBuilder:
    """Build a sequence-classification model and an optimizer for it.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``. Names
            containing ``'bert'`` select ``BertForSequenceClassification``;
            otherwise names containing ``'xlnet'`` select
            ``XLNetForSequenceClassification``.
        num_labels: Number of target classes, forwarded to ``from_pretrained``.

    Raises:
        ValueError: If ``model_name`` contains neither ``'bert'`` nor ``'xlnet'``.
    """

    def __init__(self, model_name, num_labels):
        if 'bert' in model_name:
            self.model_class = BertForSequenceClassification
        elif 'xlnet' in model_name:
            self.model_class = XLNetForSequenceClassification
        else:
            raise ValueError(
                f"Unsupported model name {model_name!r}: "
                "expected a name containing 'bert' or 'xlnet'."
            )
        self.model_name = model_name
        self.num_labels = num_labels
        # Populated by build_model(); optimizer() needs it.
        self.model = None

    def build_model(self):
        """Load the pretrained checkpoint, store it on ``self.model`` and return it."""
        self.model = self.model_class.from_pretrained(self.model_name, num_labels=self.num_labels)
        return self.model

    def optimizer(self, learning_rate=0.0001):
        """Return an ``AdamW`` optimizer over the built model's parameters.

        Args:
            learning_rate: Learning rate passed to ``AdamW``.

        Raises:
            RuntimeError: If ``build_model()`` has not been called yet, so there
                are no parameters to optimize.
        """
        if self.model is None:
            raise RuntimeError(
                "No model to optimize: call build_model() before optimizer()."
            )
        return AdamW(self.model.parameters(), lr=learning_rate)
    
# Three target classes: the sentiment labels take the values 0, 1 and 2.
model_builder = ModelBuilder(model_name='bert-base-uncased', num_labels=3)
# Load the pretrained weights first; the optimizer is created over this
# model's parameters, so the order of these two calls matters.
bert_model = model_builder.build_model()
optimizer = model_builder.optimizer(learning_rate=0.0001)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train model

In [15]:
# Train the model
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

class TrainerBuilder:
    """Fine-tune a model with the Hugging Face ``Trainer``.

    Args:
        model: Model handed to ``Trainer``.
        train_dataloader: Only its ``.dataset`` attribute is used (as the
            training dataset); ``Trainer`` does its own batching.
        valid_dataloader: Only its ``.dataset`` attribute is used (as the
            evaluation dataset).
        epochs: Number of training epochs.
        optimizer: Optimizer to train with. It must have been created over
            ``model``'s parameters. When omitted, ``train`` falls back to a
            module-level variable named ``optimizer`` (the previous behaviour
            of this class) and, if there is none, lets ``Trainer`` create its
            own default optimizer.
    """

    def __init__(self, model, train_dataloader, valid_dataloader, epochs=3, optimizer=None):
        self.model = model
        self.train_dataset = train_dataloader.dataset
        self.valid_dataset = valid_dataloader.dataset
        self.epochs = epochs
        self.optimizer = optimizer

    @staticmethod
    def _compute_accuracy(eval_pred):
        """Return ``{'accuracy': ...}`` from a ``Trainer`` evaluation prediction."""
        logits = eval_pred.predictions
        # Defensive: if the model returned several outputs, assume the logits
        # come first -- TODO confirm for models other than BERT.
        if isinstance(logits, tuple):
            logits = logits[0]
        predicted = np.argmax(logits, axis=1)
        # scikit-learn's signature is accuracy_score(y_true, y_pred).
        return {'accuracy': accuracy_score(eval_pred.label_ids, predicted)}

    def train(self):
        """Run training and return the ``Trainer`` that was used."""
        # Set up hyperparameters for training procedure
        # NOTE(review): no evaluation strategy is set here, so the validation
        # set and the accuracy metric are presumably only used if evaluate() is
        # called on the returned trainer -- confirm against the installed
        # transformers defaults.
        training_args = TrainingArguments(
            output_dir="./results",
            logging_strategy="epoch",
            per_device_train_batch_size=8,
            num_train_epochs=self.epochs,
            weight_decay=0.01,
            logging_dir="./logs"
        )

        active_optimizer = self.optimizer
        if active_optimizer is None:
            # Backward compatibility: earlier versions of this class read a
            # module-level `optimizer`. Prefer passing one explicitly so it is
            # guaranteed to belong to `self.model`.
            active_optimizer = globals().get("optimizer")

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.valid_dataset,
            compute_metrics=self._compute_accuracy,
            # The scheduler slot is None so Trainer builds its own scheduler.
            optimizers=(active_optimizer, None)
        )

        trainer.train()
        return trainer

In [16]:
# Train with sentiment labels
import torch
# Prefer the GPU when torch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
bert_model.to(device)

# Fine-tune on the training split, with the validation split registered as the
# evaluation dataset.
trainer_builder = TrainerBuilder(
    model=bert_model,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
)
trainer = trainer_builder.train()

  0%|          | 0/1314 [00:00<?, ?it/s]

{'loss': 0.6885, 'grad_norm': 2.224764585494995, 'learning_rate': 6.666666666666667e-05, 'epoch': 1.0}
{'loss': 0.5858, 'grad_norm': 2.5780751705169678, 'learning_rate': 3.3333333333333335e-05, 'epoch': 2.0}
{'loss': 0.5333, 'grad_norm': 2.419379472732544, 'learning_rate': 0.0, 'epoch': 3.0}
{'train_runtime': 317.2232, 'train_samples_per_second': 33.1, 'train_steps_per_second': 4.142, 'train_loss': 0.6025503730483613, 'epoch': 3.0}


# Evaluate model

In [17]:
# Evaluate the model
class Evaluator:
    """Evaluate a model on a dataset with the Hugging Face ``Trainer``.

    Args:
        model: Model to evaluate.
        eval_dataloader: Only its ``.dataset`` attribute is used; ``Trainer``
            does its own batching.
        batch_size: Per-device evaluation batch size.
    """

    def __init__(self, model, eval_dataloader, batch_size=8):
        self.model = model
        self.eval_dataset = eval_dataloader.dataset
        self.batch_size = batch_size

    @staticmethod
    def _compute_accuracy(eval_pred):
        """Return ``{'accuracy': ...}`` from a ``Trainer`` evaluation prediction."""
        logits = eval_pred.predictions
        # Defensive: if the model returned several outputs, assume the logits
        # come first -- TODO confirm for models other than BERT.
        if isinstance(logits, tuple):
            logits = logits[0]
        predicted = np.argmax(logits, axis=1)
        # scikit-learn's signature is accuracy_score(y_true, y_pred).
        return {'accuracy': accuracy_score(eval_pred.label_ids, predicted)}

    def evaluate(self):
        """Run one evaluation pass and return the metrics from ``Trainer.evaluate``."""
        # Only evaluation-related arguments are set: this trainer never trains,
        # so it needs no optimizer and no evaluation/save schedule.
        eval_args = TrainingArguments(
            output_dir="./results_eval",
            per_device_eval_batch_size=self.batch_size,
            logging_dir="./logs_eval"
        )

        trainer = Trainer(
            model=self.model,
            args=eval_args,
            eval_dataset=self.eval_dataset,
            compute_metrics=self._compute_accuracy
        )

        return trainer.evaluate()

In [18]:
# Evaluate on sentiment labels
# Score the fine-tuned model on the held-out test split and show the metrics.
evaluater_sentiment = Evaluator(model=bert_model, eval_dataloader=test_dataloader)
test_metrics = evaluater_sentiment.evaluate()
print(test_metrics)

  0%|          | 0/396 [00:00<?, ?it/s]

{'eval_loss': 0.6099095940589905, 'eval_accuracy': 0.8190145293746052, 'eval_runtime': 25.5333, 'eval_samples_per_second': 123.995, 'eval_steps_per_second': 15.509}
