# Problem
- Use BERT and XLNet to classify sentiment from vietnamese students reviews.

## Download dataset
Vietnamese Students' Feedback Corpus (UIT-VSFC) is the resource consists of over 16,000 sentences which are human-annotated with two different tasks: sentiment-based and topic-based classifications.

[1] Kiet Van Nguyen, Vu Duc Nguyen, Phu Xuan-Vinh Nguyen, Tham Thi-Hong Truong, Ngan Luu-Thuy Nguyen, UIT-VSFC: Vietnamese Students' Feedback Corpus for Sentiment Analysis,  2018 10th International Conference on Knowledge and Systems Engineering (KSE 2018), November 1-3, 2018, Ho Chi Minh City, Vietnam

In [1]:
!pip install datasets





In [2]:
from datasets import load_dataset

dataset = load_dataset("uitnlp/vietnamese_students_feedback")

## Train set exploration

In [3]:
train_set = dataset['train']
train_set

Dataset({
    features: ['sentence', 'sentiment', 'topic'],
    num_rows: 11426
})

In [4]:
train_set[0]

{'sentence': 'slide giáo trình đầy đủ .', 'sentiment': 2, 'topic': 1}

In [5]:
len(train_set)

11426

In [6]:
# Unique labels
set(train_set['sentiment'])

{0, 1, 2}

In [7]:
# Unique labels
set(train_set['topic'])

{0, 1, 2, 3}

## Test set exploration

In [8]:
test_set = dataset['test']
test_set

Dataset({
    features: ['sentence', 'sentiment', 'topic'],
    num_rows: 3166
})

In [9]:
test_set[0]

{'sentence': 'nói tiếng anh lưu loát .', 'sentiment': 2, 'topic': 0}

In [10]:
len(test_set)

3166

# Question:

For both BERT - XLNet and vietnamese texts (this homework) and english texts (previous homework):
- Are there any differences in the evaluating performance in applying those 2 models for 2 datasets, i.e., vietnamese and english ? Analyze and visualize your results to prove your conclusion.
- How to improve the weaker approaches ?

## Valid set

In [11]:
valid_set = dataset['validation']

# Tokenize data

In [12]:
# Tokenize dữ liệu
from transformers import BertTokenizerFast
from transformers import XLNetTokenizerFast

class Tokenizer:
    def __init__(self, model_name, max_length=128):
        if 'bert' in model_name:
            self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
        elif 'xlnet' in model_name:
            self.tokenizer = XLNetTokenizerFast.from_pretrained(model_name)
        self.max_length = max_length

    def tokenize_function(self, examples):
        # pick 'sentence' for VSFC or 'text' for IMDB
        texts = examples.get('sentence', examples.get('text'))
        return self.tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )

In [20]:
tokenizer = Tokenizer('bert-base-uncased')

train_set = train_set.map(tokenizer.tokenize_function, batched=True).rename_column("sentiment", "labels")
valid_set = valid_set.map(tokenizer.tokenize_function, batched=True).rename_column("sentiment", "labels")
test_set = test_set.map(tokenizer.tokenize_function, batched=True).rename_column("sentiment", "labels")

train_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
valid_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



Map:   0%|          | 0/11426 [00:00<?, ? examples/s]

Map:   0%|          | 0/1583 [00:00<?, ? examples/s]

Map:   0%|          | 0/3166 [00:00<?, ? examples/s]

# Create Dataloader

In [23]:
# Create a DataLoader
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_set, batch_size=32, num_workers=8, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=32, num_workers=8, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=32, num_workers=8, shuffle=True)

# Build and load model

In [13]:
# Build Model
from torch.optim import AdamW
from transformers import BertForSequenceClassification, XLNetForSequenceClassification

class ModelBuilder:
    def __init__(self, model_name, num_labels=3):
        if 'bert' in model_name:
            self.model_class = BertForSequenceClassification
        elif 'xlnet' in model_name:
            self.model_class = XLNetForSequenceClassification
        self.model_name = model_name
        self.num_labels = num_labels
        self.model = None

    def build_model(self):
        self.model = self.model_class.from_pretrained(self.model_name, num_labels=self.num_labels)
        return self.model
    
    def optimizer(self, learning_rate=5e-5):
        return AdamW(self.model.parameters(), lr=learning_rate)

In [21]:
# Load Model
bert_model_builder = ModelBuilder('bert-base-uncased', num_labels=3)
bert_model = bert_model_builder.build_model()
bert_optimizer = bert_model_builder.optimizer()

xlnet_model_builder = ModelBuilder('xlnet-base-cased', num_labels=3)
xlnet_model = xlnet_model_builder.build_model()
xlnet_optimizer = xlnet_model_builder.optimizer()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train model

In [14]:
# Train the model
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score

class TrainerBuilder:
    def __init__(self, model, train_dataloader, valid_dataloader, epochs=1):
        self.model = model
        self.train_dataset = train_dataloader.dataset
        self.valid_dataset = valid_dataloader.dataset
        self.epochs = epochs

    def train(self):
        # Set up hyperparameters for training procedure
        training_args = TrainingArguments(
            output_dir="./results",
            logging_strategy="epoch",
            per_device_train_batch_size=8,
            num_train_epochs=self.epochs,
            weight_decay=0.01,
            logging_dir="./logs"
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.valid_dataset,
            compute_metrics=lambda p: {
                'accuracy': accuracy_score(
                    np.argmax(p.predictions, axis=1), p.label_ids
                )
            },
            optimizers=(bert_optimizer, None)  # Use the optimizer from ModelBuilder
        )

        trainer.train()
        return trainer

# Evaluate model

In [15]:
# Evaluate the model
class Evaluator:
    def __init__(self, model, eval_dataloader):
        self.model = model
        self.eval_dataset = eval_dataloader.dataset

    def evaluate(self):
        eval_args = TrainingArguments(
            output_dir="./results_eval",
            per_device_eval_batch_size=8,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_dir="./logs_eval"
        )

        trainer = Trainer(
            model=self.model,
            args=eval_args,
            eval_dataset=self.eval_dataset,
            compute_metrics=lambda p: {
                'accuracy': accuracy_score(
                    np.argmax(p.predictions, axis=1), p.label_ids
                )
            },
            optimizers=(bert_optimizer, None)
        )

        metrics = trainer.evaluate()
        return metrics

# BERT model

In [24]:
# Train
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)
trainer_builder = TrainerBuilder(bert_model, train_dataloader, valid_dataloader)
trainer = trainer_builder.train()

  0%|          | 0/1429 [00:00<?, ?it/s]

{'loss': 0.4636, 'grad_norm': 1.6403861045837402, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 657.8503, 'train_samples_per_second': 17.369, 'train_steps_per_second': 2.172, 'train_loss': 0.46359280692354793, 'epoch': 1.0}


In [25]:
# Evaluate
evaluator = Evaluator(bert_model, test_dataloader)
print(evaluator.evaluate())

  0%|          | 0/396 [00:00<?, ?it/s]

{'eval_loss': 0.4294278621673584, 'eval_accuracy': 0.8742893240682249, 'eval_runtime': 31.0389, 'eval_samples_per_second': 102.001, 'eval_steps_per_second': 12.758}


# XLNet model

In [27]:
# Train
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlnet_model.to(device)
trainer_builder = TrainerBuilder(xlnet_model, train_dataloader, valid_dataloader)
trainer = trainer_builder.train()

  0%|          | 0/1429 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Evaluate
evaluator = Evaluator(xlnet_model, test_dataloader)
print(evaluator.evaluate())

  0%|          | 0/396 [00:00<?, ?it/s]

{'eval_loss': 1.1079281568527222, 'eval_accuracy': 0.5028427037271005, 'eval_runtime': 28.3529, 'eval_samples_per_second': 111.664, 'eval_steps_per_second': 13.967}


# English Text

In [16]:
imdb_dataset = load_dataset("imdb")

In [17]:
imdb_train_set = imdb_dataset["train"].select(range(20000))
imdb_test_set = imdb_dataset["test"]
imdb_valid_set = imdb_test_set.select(range(20000, 25000))

In [18]:
tokenizer = Tokenizer('bert-base-uncased')
imdb_train_set = imdb_train_set.map(tokenizer.tokenize_function, batched=True).rename_column("label", "labels")
imdb_valid_set = imdb_valid_set.map(tokenizer.tokenize_function, batched=True).rename_column("label", "labels")
imdb_test_set = imdb_test_set.map(tokenizer.tokenize_function, batched=True).rename_column("label", "labels")
imdb_train_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
imdb_valid_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
imdb_test_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])




In [19]:
from torch.utils.data import DataLoader

imdb_train_dataloader = DataLoader(imdb_train_set, batch_size=32, num_workers=8, shuffle=True)
imdb_valid_dataloader = DataLoader(imdb_valid_set, batch_size=32, num_workers=8, shuffle=True)
imdb_test_dataloader = DataLoader(imdb_test_set, batch_size=32, num_workers=8, shuffle=True)
# Train on IMDB dataset
imdb_bert_model_builder = ModelBuilder('bert-base-uncased', num_labels=2)
imdb_bert_model = imdb_bert_model_builder.build_model()
import torch
device_xpu = torch.device("xpu:0" if torch.xpu.is_available() else "cpu") 
imdb_bert_model.to(device_xpu)
bert_optimizer = imdb_bert_model_builder.optimizer()
imdb_trainer_builder = TrainerBuilder(imdb_bert_model, imdb_train_dataloader, imdb_valid_dataloader)
imdb_trainer = imdb_trainer_builder.train()
# Evaluate on IMDB dataset
imdb_evaluator = Evaluator(imdb_bert_model, imdb_test_dataloader)
print(imdb_evaluator.evaluate())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2500 [00:00<?, ?it/s]

{'loss': 0.3632, 'grad_norm': 1.0832698345184326, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 654.2625, 'train_samples_per_second': 30.569, 'train_steps_per_second': 3.821, 'train_loss': 0.363159814453125, 'epoch': 1.0}


  0%|          | 0/3125 [00:00<?, ?it/s]

{'eval_loss': 0.34513843059539795, 'eval_accuracy': 0.87152, 'eval_runtime': 219.172, 'eval_samples_per_second': 114.066, 'eval_steps_per_second': 14.258}
