In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# transformer
!git clone https://github.com/huggingface/transformers
%cd transformers
!pip install transformers

# fast
!pip install fastBPE
!pip install fairseq

# download pretrained model - PhoBERT_base_transformers
!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
!tar -xzvf PhoBERT_base_transformers.tar.gz

Cloning into 'transformers'...
remote: Enumerating objects: 148195, done.[K
remote: Counting objects: 100% (2046/2046), done.[K
remote: Compressing objects: 100% (824/824), done.[K
remote: Total 148195 (delta 1231), reused 1734 (delta 1108), pack-reused 146149[K
Receiving objects: 100% (148195/148195), 153.62 MiB | 24.65 MiB/s, done.
Resolving deltas: 100% (109507/109507), done.
/content/transformers
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# from tqdm.notebook import tqdm
from tqdm.auto import tqdm

import pickle

In [None]:
!nvidia-smi

Sun Jul  2 11:38:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Hyperparameters

In [None]:
NUM_CLASSES = 7
MAX_LEN = 20

BATCH_SIZE = 64
EPOCHS = 3

device = 'cuda'

In [None]:
TRAIN_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/train_processed.csv'
VAL_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/val_processed.csv'
TEST_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/test_processed.csv'

MODELS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/models/baseline/'
RESULTS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/results/'

# Load data


In [None]:
def load_data(path):
    data = pd.read_csv(path)
    data.cleaned_sentence.fillna('', inplace=True)
    X = data.cleaned_sentence.values.tolist()
    y = data.emotion.values.tolist()
    return X, y

In [None]:
X_train, y_train = load_data(TRAIN_PATH)
X_val, y_val = load_data(VAL_PATH)
X_test, y_test = load_data(TEST_PATH)

# Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

# save
with open(MODELS_PATH + '/baseline_le.pkl', 'wb') as f:
    le = pickle.dump(le, f)

# Preparing data for training

In [None]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes',
    default="/content/transformers/PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("/content/transformers/PhoBERT_base_transformers/dict.txt")

## Text Encoding (ids & attention mask)

* Sử dụng `bpe.encode(text)` để encode 1 câu hay một đoạn văn bản thành một list các subword.
* Sử dụng `vocab` để ánh xạ từ subword về id của nó trong bộ từ vựng.



In [None]:
import tensorflow
from tensorflow.keras.utils import pad_sequences

In [None]:
def encode(X):
    re_ids, re_mask = [], []
    for sent in X:
        subwords = ' ' + bpe.encode(sent) + ' '
        encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
        mask = [int(token_id > 0) for token_id in encoded_sent]

        re_ids.append(encoded_sent)
        re_mask.append(mask)

    re_ids = pad_sequences(re_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
    re_mask = pad_sequences(re_mask, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

    return re_ids, re_mask

In [None]:
X_train_ids, X_train_mask = encode(X_train)
X_val_ids, X_val_mask = encode(X_val)
X_test_ids, X_test_mask = encode(X_test)

## Data Loader

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
def get_data_loader(ids, mask, label):
    # to tensor
    ids = torch.tensor(ids)
    mask = torch.tensor(mask)
    label = torch.tensor(label)

    # data loader
    data = TensorDataset(ids, mask, label)
    sampler = SequentialSampler(data)
    data_loader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)

    return data_loader

In [None]:
train_data_loader = get_data_loader(X_train_ids, X_train_mask, y_train)
val_data_loader = get_data_loader(X_val_ids, X_val_mask, y_val)
test_data_loader = get_data_loader(X_test_ids, X_test_mask, y_test)

# PhoBERT-base Transformer Model

In [None]:
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available. Make sure you have a CUDA-enabled GPU and the necessary drivers installed.")
else:
    print('CUDA is available')

CUDA is available


In [None]:
from transformers import (
    RobertaForSequenceClassification, BertForSequenceClassification,
    RobertaConfig,
    AdamW
)

In [None]:
config = RobertaConfig.from_pretrained(
    '/content/transformers/PhoBERT_base_transformers/config.json', from_tf=False, num_labels=NUM_CLASSES, output_hidden_states=False,
)
phobert_sa = BertForSequenceClassification.from_pretrained(
    "/content/transformers/PhoBERT_base_transformers/model.bin",
    config=config
)
phobert_sa.cuda()

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at /content/transformers/PhoBERT_base_transformers/model.bin were not used when initializing BertForSequenceClassification: ['roberta.encoder.layer.3.intermediate.dense.bias', 'roberta.encoder.layer.9.output.LayerNorm.weight', 'roberta.encoder.layer.7.attention.self.value.bias', 'roberta.encoder.layer.9.output.LayerNorm.bias', 'roberta.encoder.layer.7.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.dense.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.self.key.bias', 'roberta.embeddings.word_embeddings.weight', 'roberta.encoder.layer.5.intermediate.dense.weight', 'roberta.encoder.layer.1.attention.self.value.bias', 'roberta.encoder.layer.3.output.dense.weight', 'roberta.encoder.layer.10.attention.self.query.weight', 'roberta.encoder.lay

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=0)
      (position_embeddings): Embedding(258, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05,

# Training

In [None]:
param_optimizer = list(phobert_sa.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)



In [None]:
for epoch_i in range(0, 15):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    phobert_sa.train()
    total_loss = 0
    # train_accuracy = 0
    # nb_train_steps = 0
    # train_f1 = 0

    for step, batch in tqdm(enumerate(train_data_loader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        phobert_sa.zero_grad()
        outputs = phobert_sa(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        loss = outputs[0]
        total_loss += loss.item()

        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # tmp_train_accuracy, tmp_train_f1 = flat_accuracy(logits, label_ids)
        # train_accuracy += tmp_train_accuracy
        # train_f1 += tmp_train_f1
        # nb_train_steps += 1

        loss.backward()
        torch.nn.utils.clip_grad_norm_(phobert_sa.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_data_loader)
    # print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
    # print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))

    # print("Running validation...")
    # phobert_sa.eval()
    # eval_loss = 0
    # eval_accuracy = 0
    # nb_eval_steps = 0
    # nb_eval_examples = 0
    # eval_f1 = 0
    # min_eval_accuracy, min_eval_f1 = 0, 0
    # for batch in tqdm_notebook(train_data_loader):

    #     batch = tuple(t.to(device) for t in batch)

    #     b_input_ids, b_input_mask, b_labels = batch

    #     with torch.no_grad():
    #         outputs = phobert_sa(b_input_ids,
    #         token_type_ids=None,
    #         attention_mask=b_input_mask)
    #         logits = outputs[0]
    #         logits = logits.detach().cpu().numpy()
    #         label_ids = b_labels.to('cpu').numpy()

    #         tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

    #         eval_accuracy += tmp_eval_accuracy
    #         eval_f1 += tmp_eval_f1
    #         nb_eval_steps += 1

    # print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
    # print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))

print("Training complete!")

# save
phobert_sa.save_pretrained(MODELS_PATH + 'phobert_baseline')

Training...


0it [00:00, ?it/s]

 Average training loss: 1.1395
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1489
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1453
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1429
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1470
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1427
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1394
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1399
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1409
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1444
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1508
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1390
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1434
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1422
Training...


0it [00:00, ?it/s]

 Average training loss: 1.1424
Training complete!


# Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# load
phobert_sa = BertForSequenceClassification.from_pretrained(MODELS_PATH + 'phobert_baseline').to(device)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


In [None]:
def get_prediction(data_loader):
    predictions = []
    labels = []
    for step, batch in tqdm(enumerate(data_loader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = phobert_sa(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels
            )

        logits = torch.sigmoid(outputs[1]).detach().cpu().numpy()
        # logits = outputs[1].detach().cpu().numpy()
        b_labels = b_labels.detach().cpu().numpy()

        preds = [np.argmax(y) for y in logits]
        lbls = [np.argmax(y) for y in b_labels]

        predictions = predictions + preds
        labels = labels + lbls

    return predictions, labels

In [None]:
def evaluate(data):
    preds, y = get_prediction(data)

    acc = round(accuracy_score(y, preds), 2)
    pre = round(precision_score(y, preds, average='weighted'), 2)
    recall = round(recall_score(y, preds, average='weighted'), 2)
    f1 = round(f1_score(y, preds, average='weighted'), 2)

    return [acc, pre, recall, f1]

In [None]:
evaluate(val_data_loader)

0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


[0.05, 1.0, 0.05, 0.1]

In [None]:
def get_result(X, y):
    text_cnn_re = evaluate(text_cnn, X, y)
    lstm_re = evaluate(lstm, X, y)

    re = pd.DataFrame(
        [text_cnn_re] + [lstm_re],
        columns=['accuracy', 'precision', 'recall', 'f1'],
        index=['text_cnn', 'lstm']
    )

    return re

In [None]:
re_train = get_result(X_train, y_train)
re_train



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.9,0.9,0.9,0.9
lstm,0.93,0.93,0.93,0.93


In [None]:
re_val = get_result(X_val, y_val)
re_val.to_csv(RESULTS_PATH + 'baseline_val_dl.csv')
re_val



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.54,0.54,0.54,0.54
lstm,0.52,0.52,0.52,0.52


In [None]:
re_test = get_result(X_test, y_test)
re_test.to_csv(RESULTS_PATH + 'baseline_test_dl.csv')
re_test



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.52,0.53,0.52,0.52
lstm,0.53,0.53,0.53,0.52
