# Problem
- Use BERT and XLNet to classify sentiment from vietnamese students reviews.

## Download dataset
Vietnamese Students' Feedback Corpus (UIT-VSFC) is the resource consists of over 16,000 sentences which are human-annotated with two different tasks: sentiment-based and topic-based classifications.

[1] Kiet Van Nguyen, Vu Duc Nguyen, Phu Xuan-Vinh Nguyen, Tham Thi-Hong Truong, Ngan Luu-Thuy Nguyen, UIT-VSFC: Vietnamese Students' Feedback Corpus for Sentiment Analysis,  2018 10th International Conference on Knowledge and Systems Engineering (KSE 2018), November 1-3, 2018, Ho Chi Minh City, Vietnam

In [34]:
!pip install datasets





In [35]:
from datasets import load_dataset

dataset = load_dataset("uitnlp/vietnamese_students_feedback")

## Train set exploration

In [36]:
train_set = dataset['train']
train_set

Dataset({
    features: ['sentence', 'sentiment', 'topic'],
    num_rows: 11426
})

In [19]:
train_set[0]

{'sentence': 'slide giáo trình đầy đủ .', 'sentiment': 2, 'topic': 1}

In [20]:
len(train_set)

11426

In [21]:
# Unique labels
set(train_set['sentiment'])

{0, 1, 2}

In [22]:
# Unique labels
set(train_set['topic'])

{0, 1, 2, 3}

## Test set exploration

In [23]:
test_set = dataset['test']
test_set

Dataset({
    features: ['sentence', 'sentiment', 'topic'],
    num_rows: 3166
})

In [24]:
test_set[0]

{'sentence': 'nói tiếng anh lưu loát .', 'sentiment': 2, 'topic': 0}

In [10]:
len(test_set)

3166

# Question:

For both BERT - XLNet and vietnamese texts (this homework) and english texts (previous homework):
- Are there any differences in the evaluating performance in applying those 2 models for 2 datasets, i.e., vietnamese and english ? Analyze and visualize your results to prove your conclusion.
- How to improve the weaker approaches ?

# Tokenize data

In [11]:
# Tokenize dữ liệu
from transformers import BertTokenizerFast
from transformers import XLNetTokenizerFast

class Tokenizer:
    def __init__(self, model_name, max_length=128):
        if 'bert' in model_name:
            self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
        elif 'xlnet' in model_name:
            self.tokenizer = XLNetTokenizerFast.from_pretrained(model_name)
        self.max_length = max_length

    def tokenize_function(self, examples):
        return self.tokenizer(
            examples["sentence"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )

tokenizer = Tokenizer('bert-base-uncased')

train_set = train_set.map(tokenizer.tokenize_function, batched=True)
test_set = test_set.map(tokenizer.tokenize_function, batched=True)

# rename the “topic” column to “labels” so Trainer can compute loss
train_set = train_set.rename_column("topic", "labels")
test_set = test_set.rename_column("topic", "labels")

# now include labels in the torch tensors
train_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_set.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



# Create Dataloader

In [12]:
# Create a DataLoader
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

class DataLoaderCreator:
    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size
        self.data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
        self.dataloader = None

    def create_dataloader(self):
        self.dataloader = DataLoader(self.dataset, batch_size=self.batch_size, collate_fn=self.data_collator, num_workers=8, pin_memory=True)
        return self.dataloader

train_dataloader = DataLoaderCreator(train_set, batch_size=32).create_dataloader()
test_dataloader = DataLoaderCreator(test_set, batch_size=32).create_dataloader()

# Build and load model

In [13]:
# Build or Load Model
from torch.optim import AdamW
from transformers import BertForSequenceClassification, XLNetForSequenceClassification

class ModelBuilder:
    def __init__(self, model_name, num_labels=4):
        if 'bert' in model_name:
            self.model_class = BertForSequenceClassification
        elif 'xlnet' in model_name:
            self.model_class = XLNetForSequenceClassification
        self.model_name = model_name
        self.num_labels = num_labels
        self.model = None

    def build_model(self):
        self.model = self.model_class.from_pretrained(self.model_name, num_labels=self.num_labels)
        return self.model
    
    def optimizer(self, learning_rate=5e-5):
        return AdamW(self.model.parameters(), lr=learning_rate)
    
model_builder = ModelBuilder('bert-base-uncased')
bert_model = model_builder.build_model()
optimizer = model_builder.optimizer()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train model

In [14]:
# Train the model
from transformers import Trainer, TrainingArguments, EvalPrediction, TrainerCallback
import numpy as np
from sklearn.metrics import accuracy_score

class LossCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            print(f"[Epoch {state.epoch}] Training Loss: {logs['loss']}")

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        if "eval_loss" in metrics:
            print(f"[Epoch {state.epoch}] Eval Loss: {metrics['eval_loss']}")

class TrainerBuilder:
    def __init__(self, model, train_dataloader, epochs=3):
        self.model = model
        self.train_dataset = train_dataloader.dataset
        self.epochs = epochs

    def train(self):
        # Set up hyperparameters for training procedure
        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="no",
            save_strategy="no",
            logging_strategy="epoch",
            learning_rate=0.001,
            per_device_train_batch_size=16,
            num_train_epochs=self.epochs,
            weight_decay=0.01,
            load_best_model_at_end=False,
            logging_dir="./logs",
            report_to="none"
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_dataset,
            callbacks=[LossCallback()]
        )

        trainer.train()
        return trainer
    
# import torch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# bert_model.to(device)
# # Train the model using TrainerBuilder
# trainer_builder = TrainerBuilder(bert_model, train_dataloader)
# trainer = trainer_builder.train()
# # Save the model
# trainer.save_model("bert-imdb")

In [37]:
from datasets import DatasetDict

# Chia dữ liệu thành 2 bộ riêng
sentiment_dataset = train_set.map(lambda x: {'label': x['sentiment']}, remove_columns=['topic'])
topic_dataset = train_set.map(lambda x: {'label': x['topic']}, remove_columns=['sentiment'])


In [39]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(example):
    return tokenizer(example["sentence"], padding='max_length', truncation=True)

sentiment_dataset = sentiment_dataset.map(tokenize, batched=True)
topic_dataset     = topic_dataset.map(tokenize, batched=True)


In [42]:
from torch.utils.data import DataLoader

sentiment_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
topic_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

sentiment_dataloader = DataLoader(sentiment_dataset, batch_size=16, shuffle=True)
topic_dataloader = DataLoader(topic_dataset, batch_size=16, shuffle=True)

In [44]:
from transformers import BertForSequenceClassification
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentiment_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
sentiment_model.to(device)

sentiment_trainer = TrainerBuilder(sentiment_model, sentiment_dataloader, epochs=3).train()
sentiment_trainer.save_model("bert-sentiment")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2145 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.31 GiB is allocated by PyTorch, and 239.11 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
topic_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
topic_model.to(device)

topic_trainer = TrainerBuilder(topic_model, topic_dataloader, epochs=3).train()
topic_trainer.save_model("bert-topic")
