In [2]:
# !pip install torch==1.13.1+cu116 torchaudio==0.13.1+cu116 torchvision==0.14.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116
# !pip install transformers==4.35.2
# !pip install scikit-learn

In [3]:
import pandas as pd
df = pd.read_csv("../data/AERA02_AptitudeAssessment_Dataset_NLP_cleaned.csv")

In [4]:
import re
import string
def process_text(text):
    text = re.sub("(&#\d+;)", "", text)
    text = re.sub("([\/-])", " ", text)
    text = re.sub("(<.*?>)", "" ,text)
    text = re.sub("(^https?:\/\/\S+)", "", text)
    text = "".join([i for i in text if i not in string.punctuation + "…"])
    text = text.lower()
    return text

def process_corpus(corpus):
    _WORD_SPLIT = re.compile("([.,!?\"/':;)(])")
    def basic_tokenizer(sentence):
        words = []
        for space_separated_fragment in sentence.strip().split():
            words.extend(_WORD_SPLIT.split(space_separated_fragment))
        return [w.lower() for w in words if w != '' and w != ' ' and w not in string.punctuation]
    
    corpus = corpus.replace("\n", " ").split(" ")

In [5]:
vi_df = df[df["language"] == "vi"].copy()
vi_df["score"] = vi_df["score"].astype("int")
vi_df

Unnamed: 0,score,title,review,language
3,5,TRẢI NGHIỆM TỐT,Đầy đủ dịch vụ tiện nghi Ăn sáng buffee ngon H...,vi
8,5,Tuyệt vời,"Khách sạn mới, sạch sẽ, có bar và bể bơi ở tần...",vi
9,5,trải nghiệm tuyệt vời tại Brandi Gate,"Khách sạn mới 100% tọa lạc trước sông Tô Lịch,...",vi
16,5,"Good hotel, good room rates","During the last visit to Hanoi, in April 2019,...",vi
64,1,"Tồi , lừa đảo",Mình đặt 2 phòng ở 3 đêm từ 30/11-3/12 . Vì có...,vi
...,...,...,...,...
813623,5,Lần thứ 2 quay lại,Vừa rồi tham gia cuộc thi sắc đẹp cho doanh nh...,vi
813648,4,Giá rẻ nhân viên thân thiện,Gia đình chúng tôi gồm bố mẹ và 1 bé 4 tuổi đã...,vi
813651,5,"Giá rẻ, đồ ăn ngon","Thấy khách sạn lâu rồi mà không dám vào ở, sợ ...",vi
813654,5,Kỳ nghỉ tháng 10 năm 2017 tại Đà Nẵng,"Khách sạn với nội thất tuyệt vời , phòng rất r...",vi


## BERT training

In [6]:
import numpy as np
import os
import random
from pathlib import Path
import json

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification



class Config():
    seed_val = 17
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    epochs = 5 
    batch_size = 32
    seq_length = 512
    lr = 2e-5
    eps = 1e-8
    pretrained_model = 'bert-base-uncased'
    test_size=0.15
    random_state=42
    add_special_tokens=True 
    return_attention_mask=True 
    pad_to_max_length=True 
    do_lower_case=False
    return_tensors='pt'
    cache_dir="/space/hotel/phit/personal/aera02-aisia/notebooks/cache"

config = Config()

# params will be saved after training
params = {"seed_val": config.seed_val,
    "device":str(config.device),
    "epochs":config.epochs, 
    "batch_size":config.batch_size,
    "seq_length":config.seq_length,
    "lr":config.lr,
    "eps":config.eps,
    "pretrained_model": config.pretrained_model,
    "test_size":config.test_size,
    "random_state":config.random_state,
    "add_special_tokens":config.add_special_tokens,
    "return_attention_mask":config.return_attention_mask,
    "pad_to_max_length":config.pad_to_max_length,
    "do_lower_case":config.do_lower_case,
    "return_tensors":config.return_tensors,
         }

In [7]:
#split train test
from sklearn.model_selection import train_test_split

train_df_, test_df = train_test_split(vi_df, 
                                      test_size=0.10, 
                                      random_state=config.random_state, 
                                      stratify=vi_df.score.values)

In [8]:
def set_random_seed(seed_val):
    # set random seed and device
    import random

    device = config.device

    random.seed(config.seed_val)
    np.random.seed(config.seed_val)
    torch.manual_seed(config.seed_val)
    torch.cuda.manual_seed_all(config.seed_val)
    
set_random_seed(config.seed_val)

In [9]:
train_df, val_df = train_test_split(train_df_, 
                                    test_size=0.10, 
                                    random_state=42, 
                            stratify=train_df_.score.values)

In [10]:
# create tokenizer
tokenizer = BertTokenizer.from_pretrained(config.pretrained_model, 
                                          do_lower_case=config.do_lower_case)

In [11]:
train_df

Unnamed: 0,score,title,review,language
376018,5,Trải nghiệm thật tuyệt vời tại Hoàn Mỹ Resort ...,"Cam ơn Hoàn Mỹ resort, đã cho mình có kì nghỉ ...",vi
249432,5,Khách sạn thân thiện,Nhân dịp nghỉ dưỡng cuối năm ở Quảng Bình đã c...,vi
848,5,Một trải nghiệm tuyệt vời tại A25 Sahul Hotel ...,"Tôi đến Hà Nội công tác, nhận phòng hơi muộn. ...",vi
354875,5,Good,Gia đình chúng tôi rất vui- hạnh phúc- tuyệt v...,vi
397962,3,"Giá tốt, chấp nhận được","Phòng sạch sẽ, giá hợp lí, quảng cáo trên inte...",vi
...,...,...,...,...
591070,4,A nice destination,"Tất cả đều mới, đẹp và nên thơ. Phòng rộng, th...",vi
332938,4,Công ty,Thái độ nhân viên tốt. Chu đáo hỗ trợ khách nh...,vi
392883,5,Tuyệt vời,Ấn tượng đầu tiên là resort cảnh quan đẹp. Sân...,vi
113208,5,Bữa trưa tại tầng 62,Tôi đến dùng bữa trưa ở khách sạn. khách sạn c...,vi


In [12]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.review = dataframe.review.tolist()
        self.targets = dataframe.score.tolist()

    def __len__(self):
        return len(self.review)

    def __getitem__(self, index):
        # review = str(self.review[index])
        # review = " ".join(review.split())

        inputs = self.tokenizer(
            self.review[index], 
            add_special_tokens=config.add_special_tokens, 
            return_attention_mask=config.return_attention_mask, 
            pad_to_max_length=config.pad_to_max_length,
            max_length=config.seq_length, 
            # return_tensors=config.return_tensors
        )
        inputs["label"] = self.targets[index]
        inputs = {key: torch.tensor(value) for key, value in inputs.items()}
        
        return inputs

In [13]:
dataset_train = CustomDataset(train_df, tokenizer)
dataset_val = CustomDataset(val_df, tokenizer)

In [14]:
dataset_train[0]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': tensor([  101,   100,   100,   100,   100,  7001,  1010,   100, 16480,   100,
           100,   100,   100,  1017,  2078,  2475,  2094,   100,   100,   100,
           100,  1012,   100,   100,   100,   100,   100,   100,  1010,   100,
         27699,  2078,  1060,  2319,  2232,  1010,   100,   100,  1012,   100,
           100,   100,   100,   100,   100,   100,  1010,   100,   100,  1010,
          2202,  2729,   100,  1047,  4048,   100,   100,   100,  4638,  2041,
          1012,   100,   100,   100,   100,   100,   100, 17895,  2078,  1010,
          1102,  2050,   100,   100,   100,   100,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [15]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# def collate_fn(batch):
#     """ Instructs how the DataLoader should process the data into a batch"""
    
#     text = [item['text'] for item in batch]
#     labels = torch.stack([torch.tensor(item['label']) for item in batch])

#     return {'text': text, 'tabular': tabular, 'label': labels}

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=config.batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=config.batch_size)

In [16]:
model = BertForSequenceClassification.from_pretrained(config.pretrained_model,
                                                      num_labels=6,
                                                      output_attentions=False,
                                                      output_hidden_states=False,
                                                      cache_dir=config.cache_dir)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

class BertTrainer:
    """ A training and evaluation loop for PyTorch models with a BERT like architecture. """
    
    def __init__(
        self, 
        model,
        tokenizer,
        train_dataloader,
        eval_dataloader=None,
        epochs=1,
        lr=5e-04,
        output_dir='./',
        output_filename='model_state_dict.pt',
        save=False,
        tabular=False,
    ):
        """
        Args:
            model: torch.nn.Module: = A PyTorch model with a BERT like architecture,
            tokenizer: = A BERT tokenizer for tokenizing text input,
            train_dataloader: torch.utils.data.DataLoader = 
                A dataloader containing the training data with "text" and "label" keys (optionally a "tabular" key),
            eval_dataloader: torch.utils.data.DataLoader = 
                A dataloader containing the evaluation data with "text" and "label" keys (optionally a "tabular" key),
            epochs: int = An integer representing the number epochs to train,
            lr: float = A float representing the learning rate for the optimizer,
            output_dir: str = A string representing the directory path to save the model,
            output_filename: string = A string representing the name of the file to save in the output directory,
            save: bool = A boolean representing whether or not to save the model,
            tabular: bool = A boolean representing whether or not the BERT model is modified to accept tabular data,
        """
        
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.tokenizer = tokenizer
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
        self.loss_fn = nn.CrossEntropyLoss()
        self.output_dir = output_dir
        self.output_filename = output_filename
        self.save = save
        self.eval_loss = float('inf')  # tracks the lowest loss so as to only save the best model  
        self.epochs = epochs
        self.epoch_best_model = 0  # tracks which epoch the lowest loss is in so as to only save the best model
        
        
    def train(self, evaluate=False):
        """ Calls the batch iterator to train and optionally evaluate the model."""
        for epoch in range(self.epochs):
            self.iteration(epoch, self.train_dataloader)
            if evaluate and self.eval_dataloader is not None:
                self.iteration(epoch, self.eval_dataloader, train=False)
                
    def evaluate(self):
        """ Calls the batch iterator to evaluate the model."""
        epoch=0
        self.iteration(epoch, self.eval_dataloader, train=False)
    
    def iteration(self, epoch, data_loader, train=True):
        """ Iterates through one epoch of training or evaluation"""
        
        # initialize variables
        loss_accumulated = 0.
        correct_accumulated = 0
        samples_accumulated = 0
        preds_all = []
        labels_all = []
        
        self.model.train() if train else self.model.eval()
        
        # progress bar
        mode = "train" if train else "eval"
        batch_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc=f"EP ({mode}) {epoch}",
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )
        
        # iterate through batches of the dataset
        for i, batch in batch_iter:

            batch_t = {key: value.to(self.device) for key, value in batch_t.items()}
            batch_t["label"] = batch["label"].to(self.device)

            logits = self.model(
                input_ids=batch_t["input_ids"], 
                token_type_ids=batch_t["token_type_ids"], 
                attention_mask=batch_t["attention_mask"],
            )

            # calculate loss
            loss = self.loss_fn(logits, batch_t["label"])
    
            # compute gradient and and update weights
            if train:
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            
            # calculate the number of correct predictions
            preds = logits.argmax(dim=-1)
            correct = preds.eq(batch_t["label"]).sum().item()
            
            # accumulate batch metrics and outputs
            loss_accumulated += loss.item()
            correct_accumulated += correct
            samples_accumulated += len(batch_t["label"])
            preds_all.append(preds.detach())
            labels_all.append(batch_t['label'].detach())
        
        # concatenate all batch tensors into one tensor and move to cpu for compatibility with sklearn metrics
        preds_all = torch.cat(preds_all, dim=0).cpu()
        labels_all = torch.cat(labels_all, dim=0).cpu()
        
        # metrics
        accuracy = accuracy_score(labels_all, preds_all)
        precision = precision_score(labels_all, preds_all, average='macro')
        recall = recall_score(labels_all, preds_all, average='macro')
        f1 = f1_score(labels_all, preds_all, average='macro')
        avg_loss_epoch = loss_accumulated / len(data_loader)
        
        # print metrics to console
        print(
            f"samples={samples_accumulated}, \
            correct={correct_accumulated}, \
            acc={round(accuracy, 4)}, \
            recall={round(recall, 4)}, \
            prec={round(precision,4)}, \
            f1={round(f1, 4)}, \
            loss={round(avg_loss_epoch, 4)}"
        )    
        
        # save the model if the evaluation loss is lower than the previous best epoch 
        if self.save and not train and avg_loss_epoch < self.eval_loss:
            
            # create directory and filepaths
            dir_path = Path(self.output_dir)
            dir_path.mkdir(parents=True, exist_ok=True)
            file_path = dir_path / f"{self.output_filename}_epoch_{epoch}.pt"
            
            # delete previous best model from hard drive
            if epoch > 0:
                file_path_best_model = dir_path / f"{self.output_filename}_epoch_{self.epoch_best_model}.pt"
                !rm -f $file_path_best_model
            
            # save model
            torch.save({
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict()
            }, file_path)
            
            # update the new best loss and epoch
            self.eval_loss = avg_loss_epoch
            self.epoch_best_model = epoch

In [43]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels, label_dict):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [18]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = {key: value.to(config.device) for key, value in batch.items()}


        with torch.no_grad():        
            outputs = model(input_ids=batch["input_ids"],
                       attention_mask=batch["attention_mask"],
                       token_type_ids=batch["token_type_ids"],
                       labels=batch["label"])
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = batch['label'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
        
    # calculate avareage val loss
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals


In [19]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

In [20]:
from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification, AdamW, BertConfig

# AdamW is an optimizer which is a Adam Optimzier with weight-decay-fix
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 
                )



# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# Total number of training steps is number of batches * number of epochs.
total_steps = len(dataloader_train) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)



In [21]:
import torch.nn.functional as F

model.to(config.device)
    
for epoch in tqdm(range(1, config.epochs+1)):
    
    model.train()
    
    loss_train_total = 0
    # allows you to see the progress of the training 
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    
    for batch in progress_bar:

        model.zero_grad()
        
        batch = {key: value.to(config.device) for key, value in batch.items()}
        
        logits = model(input_ids=batch["input_ids"],
                       attention_mask=batch["attention_mask"],
                       token_type_ids=batch["token_type_ids"],
                       labels=batch["label"])
        # print(logits["logits"], batch["label"])

        loss = loss_fn(logits["logits"], batch["label"])
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    
    tqdm.write(f'F1 Score (Weighted): {val_f1}');
# save model params and other configs 
with Path('params.json').open("w") as f:
    json.dump(params, f, ensure_ascii=False, indent=4)


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1218 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.5543593577245381
Validation loss: 0.5118527162600967
F1 Score (Weighted): 0.7909205673951524


Epoch 2:   0%|          | 0/1218 [00:00<?, ?it/s]




Epoch 2
Training loss: 0.4774976786599175
Validation loss: 0.498394967549864
F1 Score (Weighted): 0.7956633935059885


Epoch 3:   0%|          | 0/1218 [00:00<?, ?it/s]




Epoch 3
Training loss: 0.4533945979743168
Validation loss: 0.498394967549864
F1 Score (Weighted): 0.7956633935059885


Epoch 4:   0%|          | 0/1218 [00:00<?, ?it/s]




Epoch 4
Training loss: 0.4542880081533407
Validation loss: 0.498394967549864
F1 Score (Weighted): 0.7956633935059885


Epoch 5:   0%|          | 0/1218 [00:00<?, ?it/s]




Epoch 5
Training loss: 0.45326254604925664
Validation loss: 0.498394967549864
F1 Score (Weighted): 0.7956633935059885


In [None]:
model = BertForSequenceClassification.from_pretrained(config.pretrained_model,
                                                      num_labels=6,
                                                      output_attentions=False,
                                                      output_hidden_states=False,
                                                      cache_dir=config.cache_dir)
model.load_state_dict(torch.load('_BERT_epoch_5.model'))

In [23]:
model.to(config.device)
data_iter = iter(dataloader_validation)

In [71]:
batch = next(data_iter)
batch = {key: value.to(config.device) for key, value in batch.items()}
with torch.no_grad():        
    outputs = model(input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
                labels=batch["label"])
    logits = outputs[1].detach().cpu().numpy()
print(np.array([logits.argmax(axis=1), batch["label"].detach().cpu().numpy()]))
print(accuracy_per_class(logits, batch["label"].detach().cpu().numpy(), {"A1": 0, "A2": 1, "B1": 2, "B2": 3, "C1": 4, "C2": 5}))

[[5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 1 5 1 5]
 [5 5 4 3 5 5 4 5 2 5 5 5 5 5 5 5 5 5 5 5 3 5 5 5 5 5 5 5 1 5 1 5]]
Class: A2
Accuracy: 2/2

Class: B1
Accuracy: 0/1

Class: B2
Accuracy: 0/2

Class: C1
Accuracy: 0/2

Class: C2
Accuracy: 24/25

None
