<a href="https://colab.research.google.com/github/AnhVietPham/Text-Mining/blob/main/VihealthBert_WSD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive. The data and
# model paths used further down in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch.nn as nn
import torch
import torch.nn as nn
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaPreTrainedModel

In [None]:
from torch.utils.data import Dataset
import torch

import random
import os
import json
import copy
import re

import logging
import argparse
import numpy as np
from torch.utils.data import RandomSampler, SequentialSampler, DataLoader
from collections import defaultdict
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.auto import tqdm, trange


In [None]:
class Classifier(nn.Module):
    """Small feed-forward head that turns a feature vector into a probability.

    The head is: dropout -> linear(2 * hidden_size -> 128) -> ReLU ->
    dropout -> linear(128 -> 1) -> sigmoid.

    Args:
        config: Object exposing ``hidden_size``; the expected input width is
            twice that value.
        dropout_rate: Dropout probability of the second dropout layer. The
            first dropout layer uses twice this probability.
    """

    def __init__(self, config, dropout_rate=0.1):
        super().__init__()

        # Attribute names and creation order are kept as-is so that state-dict
        # keys and seeded weight initialisation stay the same.
        self.dropout_1 = nn.Dropout(2 * dropout_rate)
        self.dense_1 = nn.Linear(2 * config.hidden_size, 128)
        self.relu = nn.ReLU()
        self.dropout_2 = nn.Dropout(dropout_rate)
        self.dense_2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, feature):
        """Return one sigmoid score per input row as a flat 1-D tensor."""
        hidden = self.dense_1(self.dropout_1(feature))
        hidden = self.dropout_2(self.relu(hidden))
        # The final linear layer has a single output unit; flatten it away.
        scores = self.dense_2(hidden).view(-1)
        return self.sigmoid(scores)

In [None]:
class ViHnBERT(RobertaPreTrainedModel):
    """RoBERTa encoder followed by a binary scoring head.

    For every sequence the hidden state at position 0 is concatenated with
    the mean hidden state of the span ``start_token_idx..end_token_idx``
    (inclusive) and passed to ``Classifier``.

    Args:
        config: RoBERTa configuration used to build the encoder and the head.
        args: Run arguments; ``args.dropout_rate`` is forwarded to the head.
    """

    def __init__(self, config, args):
        super(ViHnBERT, self).__init__(config)
        self.args = args
        self.config = config
        # Encoder backbone.
        self.roberta = RobertaModel(config)
        # Scoring head applied to the concatenated features.
        self.classifier = Classifier(config, args.dropout_rate)

    def forward(self,
                input_ids=None,
                token_type_ids=None,
                attention_mask=None,
                start_token_idx=None,
                end_token_idx=None,
                labels=None):
        """Score each sequence and, when labels are given, compute BCE loss.

        ``token_type_ids`` is accepted but not passed to the encoder.

        Returns:
            Tuple ``(loss, scores, *extra)`` where ``loss`` is the float
            ``0.0`` when ``labels`` is None and ``extra`` is whatever the
            encoder returned from index 2 onwards.

        Raises:
            Exception: If ``start_token_idx`` or ``end_token_idx`` is None.
        """
        encoder_outputs = self.roberta(input_ids=input_ids,
                                       attention_mask=attention_mask)
        hidden_states = encoder_outputs[0]

        # Hidden state of the first position, kept as a length-1 sequence axis.
        cls_features = hidden_states[:, 0, :].unsqueeze(1)

        if start_token_idx is None or end_token_idx is None:
            raise Exception('Require start_token_idx and end_token_idx')

        # Spans differ in length per row, so each one is mean-pooled separately.
        pooled_spans = [
            torch.mean(
                hidden_states[row, start_token_idx[row]:end_token_idx[row]+1, :].unsqueeze(0),
                1,
                True,
            )
            for row in range(hidden_states.size()[0])
        ]
        acronym_features = torch.cat(pooled_spans, dim=0)

        # Concatenate along the feature axis and score.
        logits = self.classifier(torch.cat([cls_features, acronym_features], dim=2))

        if labels is not None:
            total_loss = nn.BCELoss()(logits, labels)
        else:
            total_loss = 0.0

        return (total_loss, logits) + encoder_outputs[2:]

In [None]:
# Module-level logger used by the feature-conversion, caching and training code.
logger = logging.getLogger(__name__)

class InputExample(object):
    """One raw sample: a text, a candidate expansion and the acronym position.

    The acronym is located both by character offset (``start_char_idx`` and
    ``length_acronym``) and by whitespace-token index (``start_token_idx``
    and ``end_token_idx``). ``label`` is the target for this candidate.
    """

    def __init__(self, guid, id, text, text_tokens, expansion, start_char_idx, length_acronym, start_token_idx, end_token_idx, label) -> None:
        super().__init__()
        self.guid, self.id = guid, id
        self.text, self.text_tokens = text, text_tokens
        self.expansion = expansion
        self.start_char_idx, self.length_acronym = start_char_idx, length_acronym
        self.start_token_idx, self.end_token_idx = start_token_idx, end_token_idx
        self.label = label

    def __repr__(self):
        return self.to_json_string()

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON plus a newline."""
        payload = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return payload + "\n"

class InputFeatures(object):
    """Model-ready encoding of a single example.

    Holds the id sequences fed to the model, the acronym span indices, the
    label, the candidate expansion string and the id of the source sample.
    """

    def __init__(self, id, input_ids, attention_mask, token_type_ids, start_token_idx, end_token_idx, label, expansion):
        self.input_ids, self.attention_mask = input_ids, attention_mask
        self.token_type_ids = token_type_ids
        self.start_token_idx, self.end_token_idx = start_token_idx, end_token_idx
        self.label, self.expansion = label, expansion

        self.id = id

    def __repr__(self):
        return self.to_json_string()

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON plus a newline."""
        payload = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return payload + "\n"

class Processor(object):
    """Processor for the ArcBERT data set.

    Reads the JSON sample file of a split together with the acronym
    dictionary, builds positive and pseudo-negative samples and converts
    them into ``InputExample`` objects.
    """

    def __init__(self, args) -> None:
        super().__init__()
        self.args = args

    @classmethod
    def _read_file(cls, input_file):
        """Return the parsed content of the UTF-8 JSON file ``input_file``."""
        with open(input_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def is_whitespace(self, c):
        """Return True for space, tab, newline and U+202F."""
        return c in (" ", "\t", "\n") or ord(c) == 0x202F

    def clean_syn(self, text):
        """Return ``text`` with the characters ``? , . ! : ; ( )`` removed."""
        # Raw string: the previous plain literal relied on invalid escape
        # sequences such as '\?' which newer Python versions warn about.
        return re.sub(r'[\?,\.\!:;\(\)]', '', text)

    def _create_examples(self, data, mode):
        """Create ``InputExample`` objects from a list of sample dictionaries.

        Each sample must provide ``id``, ``text``, ``expansion``,
        ``start_char_idx``, ``length_acronym`` and ``label``. The text is
        split on whitespace and the character span of the acronym is mapped
        to the indices of the whitespace tokens that contain it.

        Args:
            data: List of sample dictionaries.
            mode: Split name, used only to build each example's ``guid``.

        Returns:
            List of ``InputExample``, one per sample, in input order.
        """
        examples = []
        for i, example in enumerate(data):
            guid = "%s-%s" % (mode, i)
            sample_id = example['id']

            # 1. Whitespace-tokenise the text while recording, for every
            #    character, the index of the token it belongs to.
            text = example['text']
            text_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in text:
                if self.is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        text_tokens.append(c)
                    else:
                        text_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(text_tokens) - 1)

            # 2. Candidate expansion of the acronym.
            expansion = example['expansion']

            # 3. Acronym position: characters -> whitespace-token indices.
            start_char_idx = example['start_char_idx']
            length_acronym = example['length_acronym']
            start_token_idx = char_to_word_offset[start_char_idx]
            end_token_idx = char_to_word_offset[start_char_idx + length_acronym - 1]

            # 4. Label.
            label = example['label']

            examples.append(InputExample(
                guid=guid,
                id=sample_id,
                text=text,
                text_tokens=text_tokens,
                expansion=expansion,
                start_char_idx=start_char_idx,
                length_acronym=length_acronym,
                start_token_idx=start_token_idx,
                end_token_idx=end_token_idx,
                label=label
            ))
        return examples

    def get_examples(self, args, mode):
        """Load one split and return its positive and negative examples.

        Args:
            args: Provides ``data_dir``, ``data_file_name`` and
                ``dict_file_name``.
            mode: train, dev, test

        Returns:
            List of ``InputExample``: labelled positives first, followed by
            the negatives generated from the dictionary.

        Raises:
            Exception: If the dictionary file is missing from ``data_dir``.
        """
        data_path = os.path.join(args.data_dir, mode)
        data = self._read_file(os.path.join(data_path, args.data_file_name))

        PATH_DICTIONARY = os.path.join(args.data_dir, args.dict_file_name)
        if not os.path.isfile(PATH_DICTIONARY):
            raise Exception(f"Folder {args.data_dir} doesn't contain canonical dictionary")
        dictionary = self._read_file(PATH_DICTIONARY)

        examples = []

        # add_label_positive_sample updates the loaded samples in place.
        pos_data = add_label_positive_sample(data)
        examples.extend(pos_data)

        neg_data = negative_data(pos_data, dictionary, mode)
        examples.extend(neg_data)

        return self._create_examples(examples, mode)



def negative_data(positive_data:list, diction:dict, mode) -> list:
    """Create pseudo-negative samples from positive ones.

    For every positive sample the acronym is cut out of its text, the
    dictionary is queried for all expansions of that acronym, and each
    expansion other than the correct one yields a copy of the sample with
    ``label`` set to 0.

    Args:
        positive_data: Samples of the form::

                {
                    'acronym': ...,          (optional)
                    'expansion': ...,
                    'text': ...,
                    'start_char_idx': ...,
                    'length_acronym': ...,
                    'label': 1               (positive sample)
                }

        diction: Maps an acronym to the list of its possible expansions.
        mode: For ``'train'``, when more than one wrong expansion exists,
            one or two of them are drawn at random. For any other value all
            wrong expansions are used.

    Returns:
        List of negative samples (shallow copies of the positive ones).
        Samples whose acronym or expansion cannot be looked up are printed
        and skipped.
    """
    neg_data = []
    for sample in positive_data:
        try:
            start = sample["start_char_idx"]
            acronym = sample["text"][start:start + sample['length_acronym']]
            # Work on a copy so the dictionary itself is never modified.
            wrong_expansions = diction[acronym].copy()
            wrong_expansions.remove(sample["expansion"])
        except Exception:
            # Best effort: report the malformed/unknown sample and move on.
            # Unlike a bare ``except`` this lets KeyboardInterrupt through.
            print(sample)
            continue

        if mode == 'train' and len(wrong_expansions) > 1:
            wrong_expansions = random.sample(wrong_expansions, random.randint(1, 2))

        for expansion in wrong_expansions:
            negative = sample.copy()
            negative["expansion"] = expansion
            negative["label"] = 0  # pseudo negative sample
            neg_data.append(negative)

    return neg_data

def add_label_positive_sample(data: list):
    """Lower-case each sample's text and tag it as positive, in place.

    Every dictionary in ``data`` gets ``label`` set to 1 and ``id`` set to
    its position in the list. The same list object is returned.
    """
    for position, record in enumerate(data):
        record.update(text=record['text'].lower(), label=1, id=position)
    return data



def convert_examples_to_features(examples,
                                max_seq_len,
                                tokenizer):
    """Convert examples into padded ``InputFeatures``.

    Every example is laid out as ``<cls> text <sep> expansion <sep>`` and
    right-padded to ``max_seq_len``. Examples longer than ``max_seq_len``
    are reported and skipped.

    The stored ``start_token_idx`` / ``end_token_idx`` are inclusive
    positions of the acronym's sub-tokens in the *final* sequence, i.e.
    they already account for the leading ``<cls>`` token. Earlier versions
    stored positions relative to the text sub-tokens only, which made the
    model pool the span one position too far to the left. Feature caches
    written by such a version need to be regenerated.

    Args:
        examples: Iterable of ``InputExample``.
        max_seq_len: Fixed length of every produced sequence.
        tokenizer: Tokenizer exposing ``cls_token``, ``sep_token``,
            ``pad_token_id``, ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        List of ``InputFeatures`` for the examples that fit.
    """
    # Setting based on the current model type
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    # Number of special tokens placed in front of the text sub-tokens.
    text_offset = 1

    features = []

    for (ex_index, example) in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        # Sub-tokenise word by word, remembering where each word starts.
        orig_to_tok_index = []
        all_doc_tokens = []
        for token in example.text_tokens:
            orig_to_tok_index.append(len(all_doc_tokens))
            all_doc_tokens.extend(tokenizer.tokenize(token))

        start_token_idx = orig_to_tok_index[example.start_token_idx]
        if len(orig_to_tok_index) == (example.end_token_idx + 1):
            # Acronym ends with the last word: the span runs to the last
            # sub-token of the text, not to the first sub-token of that word.
            end_token_idx = len(all_doc_tokens) - 1
        else:
            end_token_idx = orig_to_tok_index[example.end_token_idx + 1] - 1
        # A word that yields no sub-tokens would otherwise give an empty span.
        end_token_idx = max(end_token_idx, start_token_idx)

        # Shift from text-relative to sequence-relative positions.
        start_token_idx += text_offset
        end_token_idx += text_offset

        # First segment: <cls> text <sep>
        tokens = [cls_token] + all_doc_tokens + [sep_token]
        token_type_ids = [0]*len(tokens)

        # Second segment: expansion <sep>
        expansion = example.expansion
        expansion_tokens = tokenizer.tokenize(expansion)
        tokens += expansion_tokens
        tokens += [sep_token]
        token_type_ids += [1]*(len(expansion_tokens) + 1)

        attention_mask = [1]*len(tokens)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        padding = max_seq_len - len(input_ids)

        if padding < 0:
            print(f'Ignore sample has length > {max_seq_len} tokens')
            continue

        input_ids = input_ids + ([pad_token_id] * padding)
        attention_mask = attention_mask + [0]*padding
        token_type_ids = token_type_ids + [0]*padding
        assert len(input_ids) == len(attention_mask) == len(token_type_ids), "Error with input length {} vs attention mask length {}, token type length {}".format(len(input_ids), len(attention_mask), len(token_type_ids))
        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(
            len(attention_mask), max_seq_len
        )
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(
            len(token_type_ids), max_seq_len
        )
        sample_id = example.id
        label = example.label

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("tokens: %s" % " ".join([str(x) for x in all_doc_tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("labels: %s" % " ".join([str(x) for x in [label]]))
            logger.info("expansion: %s" % " ".join([str(x) for x in [expansion]]))

        features.append(
            InputFeatures(
                id=sample_id,
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                start_token_idx=start_token_idx,
                end_token_idx=end_token_idx,
                label=label,
                expansion=expansion
            )
        )
    return features

def load_and_cache_examples(args, tokenizer, mode=None):
    """Return the tensors of one data split, using an on-disk feature cache.

    The cache file lives in ``args.data_dir`` and is named after the split,
    the last path component of ``args.model_name_or_path`` and
    ``args.max_seq_len``. When it is missing the features are built from
    the raw data and then saved.

    Returns:
        None when ``mode`` is falsy, otherwise the tuple ``(input_ids,
        token_type_ids, attention_mask, start_token_idx, end_token_idx,
        label, id, expansion)`` where the first seven items are tensors and
        ``expansion`` is a list of strings.

    Raises:
        Exception: If features must be built and ``mode`` is not one of
            train, dev, test.
    """
    if not mode:
        return None
    processor = Processor(args)

    # Load data features from cache or dataset file
    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}'.format(mode, model_tag, args.max_seq_len)
    )

    if not os.path.exists(cached_features_file):
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if mode not in ("train", "dev", "test"):
            raise Exception("For mode, Only train, dev, test is available")
        examples = processor.get_examples(args, mode)

        features = convert_examples_to_features(
            examples, args.max_seq_len, tokenizer
        )

        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
    else:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE(review): torch.load unpickles Python objects; only point this
        # at cache files produced by this code.
        features = torch.load(cached_features_file)

    def column(attribute, dtype):
        """Stack one attribute of all features into a tensor."""
        return torch.tensor([getattr(f, attribute) for f in features], dtype=dtype)

    # Convert to Tensors and build dataset
    return (
        column("input_ids", torch.int64),
        column("token_type_ids", torch.int64),
        column("attention_mask", torch.float),
        column("start_token_idx", torch.int64),
        column("end_token_idx", torch.int64),
        column("label", torch.float),
        column("id", torch.long),
        [f.expansion for f in features],
    )


class AcrDataset(Dataset):
    """Dataset over the eight parallel columns of one data split.

    The columns are produced by ``load_and_cache_examples``; item ``i`` of
    the dataset is the tuple formed by element ``i`` of every column.
    """

    def __init__(self,
                args,
                tokenizer,
                mode) -> None:
        super().__init__()
        self.mode = mode

        self.dataset = load_and_cache_examples(args, tokenizer, mode)

    def __len__(self) -> int:
        first_column = self.dataset[0]
        return len(first_column)

    def __getitem__(self, index: int):
        return tuple(self.dataset[column][index] for column in range(8))


In [None]:
class EarlyStopping:
    """Stop training when the tracked score stops improving.

    A higher score is treated as better: a call with a score below the best
    one seen so far increases the patience counter, any other call saves a
    checkpoint and resets the counter.
    """

    def __init__(self, patience=7, verbose=False):
        """
        Args:
            patience (int): Number of consecutive non-improving calls after
                            which ``early_stop`` is set.
                            Default: 7
            verbose (bool): If True, prints a message each time a checkpoint
                            is saved.
                            Default: False
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than np.Inf: the capitalised alias was removed in
        # NumPy 2.0.
        self.val_score_min = np.inf

    def __call__(self, score, model, args):
        """Record ``score``; checkpoint on improvement, else count patience."""
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(score, model, args)
        elif score < self.best_score:
            self.counter += 1
            print(f"EarlyStopping counter: {self.counter} out of {self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(score, model, args)
            self.counter = 0

    def save_checkpoint(self, score, model, args):
        """Save the model and the run arguments into ``args.model_dir``."""
        if self.verbose:
            print(f'{args.tuning_metric} imporoved ({self.val_score_min:.6f} ----> {score:.6f}). Saving model ....')
        model.save_pretrained(args.model_dir)
        torch.save(args, os.path.join(args.model_dir, "training_args.bin"))
        # Remembered only for the verbose message of the next checkpoint.
        self.val_score_min = score

In [None]:
class Trainer(object):
    """Builds the model for ``args.model_type`` and runs training/evaluation.

    Args:
        args: Run configuration (model paths, batch sizes, optimiser
            settings, device, ...).
        train_dataset: Dataset iterated by ``train``.
        dev_dataset: Dataset used by ``evaluate('dev')``.
        test_dataset: Dataset used by ``evaluate('test')``.
    """

    def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None) -> None:
        super().__init__()
        self.args = args
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset

        # NOTE(review): hard-coded Drive path; the attribute is not read
        # anywhere else in this class.
        self.gold = read_json('/content/drive/MyDrive/Luận Văn Thạc Sĩ/data-vihealbert/acrDrAid/data/gold.json')

        self.config_class, self.model_class, self.tokenizer_class = MODEL_CLASSES[args.model_type]

        # The configuration always comes from model_name_or_path; only the
        # location of the weights depends on args.pretrained.
        self.config = self.config_class.from_pretrained(args.model_name_or_path, finetuning_task=args.token_level)
        weights_source = args.pretrained_path if args.pretrained else args.model_name_or_path
        self.model = self.model_class.from_pretrained(
            weights_source,
            config=self.config,
            args=self.args
        )

        # GPU or CPU: selecting a CUDA device is only possible (and only
        # needed) when CUDA is actually available.
        if torch.cuda.is_available():
            torch.cuda.set_device(self.args.gpu_id)
            print('GPU ID :',self.args.gpu_id)
            print('Cuda device:',torch.cuda.current_device())
        self.device = args.device

        self.model.to(self.device)

    def train(self):
        """Run the training loop with periodic evaluation and early stopping.

        Returns:
            Tuple ``(global_step, average_loss)`` where ``global_step`` is
            the number of optimiser updates performed. The average is 0.0
            when no update was performed.
        """
        train_sampler = RandomSampler(self.train_dataset)
        train_loader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size)

        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            self.args.num_train_epochs = (
                self.args.max_steps // (len(train_loader) // self.args.gradient_accumulation_steps) + 1
            )
        else:
            t_total = len(train_loader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

        param_optimizer = list(self.model.named_parameters())
        # 'LayerNorm.weight' is how the layer-norm scale is named in the
        # Hugging Face RoBERTa modules; 'gamma'/'beta' are kept for models
        # that use the older naming.
        no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
        # The option must be called 'weight_decay': torch optimisers ignore
        # unknown group keys, so the former 'weight_decay_rate' never applied
        # any decay.
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]

        optimizer = torch.optim.Adam(lr=self.args.learning_rate, betas=(0.9, 0.98), eps=self.args.adam_epsilon, params=optimizer_grouped_parameters)

        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total
        )

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(self.train_dataset))
        logger.info("  Num Epochs = %d", self.args.num_train_epochs)
        logger.info("  Total train batch size = %d", self.args.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)
        logger.info("  Logging steps = %d", self.args.logging_steps)
        logger.info("  Save steps = %d", self.args.save_steps)

        global_step = 0
        tr_loss = 0.0
        # Gradients are cleared once here and then only after each optimiser
        # update. Clearing them before every batch would throw away what has
        # been accumulated when gradient_accumulation_steps > 1.
        self.model.zero_grad()

        train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
        early_stopping = EarlyStopping(patience=self.args.early_stopping, verbose=True)

        for it in train_iterator:
            print(f'epoch: {it}')
            epoch_iterator = tqdm(train_loader, desc="Iteration", position=0, leave=True)
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                # Only the first six batch items are tensors for the model;
                # items 6 and 7 (ids, expansion strings) are unused here.
                inputs = {
                    "input_ids": batch[0].to(self.device),
                    "token_type_ids": batch[1].to(self.device),
                    "attention_mask": batch[2].to(self.device),
                    "start_token_idx": batch[3].to(self.device),
                    "end_token_idx": batch[4].to(self.device),
                    "labels": batch[5].to(self.device)
                }

                outputs = self.model(**inputs)
                loss, _ = outputs[:2]

                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()

                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

                    if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                        print("\nTuning metrics:", self.args.tuning_metric)
                        # NOTE(review): model selection is driven by the test
                        # split here; confirm this is intended rather than 'dev'.
                        eval_loss, ids, pred_expansions, pred_scores = self.evaluate('test')
                        results = compute_metrics(self.args, ids, pred_expansions, pred_scores)
                        results['loss'] = eval_loss
                        logger.info("***** Eval results *****")
                        for key in sorted(results.keys()):
                            logger.info("  %s = %s", key, str(results[key]))

                        early_stopping(results[self.args.tuning_metric], self.model, self.args)
                        if early_stopping.early_stop:
                            print('Early Stopping')
                            break

                        print(f'Training Loss {tr_loss / global_step}')

                if 0 < self.args.max_steps < global_step:
                    epoch_iterator.close()
                    break
            if 0 < self.args.max_steps < global_step or early_stopping.early_stop:
                train_iterator.close()
                break

        average_loss = tr_loss / global_step if global_step else 0.0
        return global_step, average_loss

    def write_evaluation_result(self, out_file, results):
        """Write ``results`` as sorted ``key = value`` lines.

        The file is created at ``args.model_dir + "/" + out_file``.
        """
        out_file = self.args.model_dir + "/" + out_file
        # The context manager closes the file even if a write fails.
        with open(out_file, "w", encoding="utf-8") as w:
            w.write("***** Eval results *****\n")
            for key in sorted(results.keys()):
                to_write = " {key} = {value}".format(key=key, value=str(results[key]))
                w.write(to_write)
                w.write("\n")

    def evaluate(self, mode):
        """Score every sample of the dev or test dataset.

        Args:
            mode: ``'dev'`` or ``'test'``.

        Returns:
            Tuple ``(eval_loss, ids, pred_expansions, pred_scores)``: the
            mean batch loss and three parallel lists holding, per sample,
            its id, its candidate expansion and the model score.

        Raises:
            Exception: If ``mode`` is neither ``'dev'`` nor ``'test'``.
        """
        if mode == 'test':
            dataset = self.test_dataset
        elif mode == 'dev':
            dataset = self.dev_dataset
        else:
            raise Exception("Only dev and test dataset available")

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

        # Eval
        logger.info("***** Running evaluation on %s dataset *****", mode)
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args.eval_batch_size)

        ids = []
        pred_expansions = []
        pred_scores = []

        eval_loss = 0.0
        nb_eval_steps = 0
        correct = 0

        self.model.eval()

        for batch in eval_dataloader:
            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0].to(self.device),
                    "token_type_ids": batch[1].to(self.device),
                    "attention_mask": batch[2].to(self.device),
                    "start_token_idx": batch[3].to(self.device),
                    "end_token_idx": batch[4].to(self.device),
                    "labels": batch[5].to(self.device)
                }
                outputs = self.model(**inputs)
                loss, logits = outputs[:2]
                eval_loss += loss.item()

                labels = inputs['labels']
                preds = (logits > self.args.threshold).type(torch.int16)
                correct += (preds == labels).sum().item()

                ids.extend(batch[6].tolist())
                pred_expansions.extend(list(batch[7]))

                pred_scores.extend(logits.tolist())

            nb_eval_steps += 1

        # Guard against an empty dataset, which yields no batches at all.
        eval_loss = eval_loss / nb_eval_steps if nb_eval_steps else 0.0
        acc = correct / len(dataset) if len(dataset) else 0.0
        print(f'Classification Accuracy: {acc}')

        return eval_loss, ids, pred_expansions, pred_scores

    def load_model(self):
        """Reload the model weights from ``args.model_dir``.

        Raises:
            Exception: If the directory does not exist or loading fails; in
                the latter case the original error is attached as the cause.
        """
        # Check whether model exists
        if not os.path.exists(self.args.model_dir):
            raise Exception("Model doesn't exists! Train first!")

        try:
            self.model = self.model_class.from_pretrained(
                self.args.model_dir,
                config=self.config,
                args=self.args
            )
            self.model.to(self.device)
            logger.info("***** Model Loaded *****")
        except Exception as err:
            raise Exception("Some model files might be missing...") from err

In [None]:
from transformers import (
    AutoTokenizer,
    RobertaConfig
)

# Maps a model type name to its (config class, model class, tokenizer class)
# triple. Both entries currently use the same three classes.
MODEL_CLASSES = {
    "vihnbert": (RobertaConfig, ViHnBERT, AutoTokenizer),
    "phobert": (RobertaConfig, ViHnBERT, AutoTokenizer)
    }

# Maps a model type name to a checkpoint name (presumably Hugging Face Hub
# identifiers -- confirm; not referenced elsewhere in the visible code).
MODEL_PATH_MAP = {
    "vihnbert": "demdecuong/vihealthbert-base-word",
    "phobert": "vinai/phobert-base"
    }

def init_logger():
    """Configure root logging: INFO level with a timestamped line format."""
    logging_options = {
        "format": "%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        "datefmt": "%m/%d/%Y %H:%M:%S",
        "level": logging.INFO,
    }
    logging.basicConfig(**logging_options)

def set_seed(args):
    """Seed Python, NumPy and PyTorch random generators with ``args.seed``.

    The CUDA generators are seeded as well unless ``args.no_cuda`` is set or
    CUDA is unavailable.
    """
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if args.no_cuda or not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

def load_tokenizer(args):
    """Instantiate the tokenizer registered for ``args.model_type``.

    The tokenizer class is the third entry of the ``MODEL_CLASSES`` triple
    and is loaded from ``args.model_name_or_path``.
    """
    model_entry = MODEL_CLASSES[args.model_type]
    tokenizer_class = model_entry[2]
    return tokenizer_class.from_pretrained(args.model_name_or_path)

def score_expansion(gold, prediction):
    """Compute macro precision, recall, F1 and accuracy of expansions.

    Precision and recall are computed per gold expansion and then averaged.
    An expansion that was never predicted gets precision 1. The macro F1 is
    the harmonic mean of the macro precision and the macro recall.

    Args:
        gold: List of reference expansions.
        prediction: List of predicted expansions, parallel to ``gold``.

    Returns:
        Tuple ``(macro_prec, macro_recall, macro_f1, acc)``. All four are
        0.0 when the inputs are empty.

    Raises:
        ValueError: If the two lists differ in length.
    """
    if len(gold) != len(prediction):
        raise ValueError(
            "gold and prediction must have the same length, got {} and {}".format(
                len(gold), len(prediction)
            )
        )
    if not gold:
        # Nothing to score; avoids dividing by zero below.
        return 0.0, 0.0, 0.0, 0.0

    total_per_expansion = defaultdict(int)
    pred_per_expansion = defaultdict(int)
    correct_per_expansion = defaultdict(int)
    for gold_exp, pred_exp in zip(gold, prediction):
        total_per_expansion[gold_exp] += 1
        pred_per_expansion[pred_exp] += 1
        if gold_exp == pred_exp:
            correct_per_expansion[gold_exp] += 1

    # With equal-length inputs micro precision, recall and F1 all equal the
    # accuracy, so they are not computed separately.
    acc = sum(correct_per_expansion.values()) / len(prediction)

    precs = {}
    recalls = {}
    # Iterating the dict (first-seen order) keeps the float sums below
    # reproducible from run to run.
    for exp, total in total_per_expansion.items():
        hits = correct_per_expansion.get(exp, 0)
        precs[exp] = hits / pred_per_expansion[exp] if exp in pred_per_expansion else 1
        recalls[exp] = hits / total

    # Official evaluation metrics are the macro-averaged precision, recall
    # and F1 for correct expansion predictions.
    macro_prec = sum(precs.values()) / len(precs)
    macro_recall = sum(recalls.values()) / len(recalls)
    if macro_prec + macro_recall != 0:
        macro_f1 = 2 * macro_prec * macro_recall / (macro_prec + macro_recall)
    else:
        macro_f1 = 0

    return macro_prec, macro_recall, macro_f1, acc

def compute_metrics(args, ids, pred_expansions, pred_scores):
    """Pick the best-scoring expansion per sample and score it against gold.

    For every sample id the candidate with the highest score above
    ``args.threshold`` is kept; samples without such a candidate are
    predicted as the empty string. The gold file is read from
    ``args.data_dir`` / ``args.gold_file_name``; its keys are converted with
    ``int`` to match the sample ids.

    Returns:
        Dict with ``macro_prec``, ``macro_recall``, ``macro_f1`` and
        ``accuracy``.
    """
    threshold = args.threshold
    gold_by_id = read_json(os.path.join(args.data_dir, args.gold_file_name))

    # sample id -> [best score so far, expansion with that score]
    best = {}
    for sample_id, expansion, score in zip(ids, pred_expansions, pred_scores):
        if not score > threshold:
            continue
        if sample_id not in best or score > best[sample_id][0]:
            best[sample_id] = [score, expansion]

    predictions = []
    for key in gold_by_id:
        entry = best.get(int(key))
        predictions.append(entry[1] if entry is not None else '')
    references = list(gold_by_id.values())
    assert len(references) == len(predictions)

    macro_prec, macro_recall, macro_f1, acc = score_expansion(references, predictions)

    return {
        'macro_prec': macro_prec,
        'macro_recall': macro_recall,
        'macro_f1': macro_f1,
        'accuracy': acc,
    }
    
def read_json(path):
    """Return the parsed content of the UTF-8 JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.loads(handle.read())

In [None]:
def main(args):
    """Build the datasets and trainer, then evaluate / train as requested.

    Steps, in order:
      1. Initialise logging, seed and tokenizer; build the train, dev and
         test ``AcrDataset`` objects and the ``Trainer``.
      2. Evaluate once on the test split and log the metrics (this happens
         before any training or ``trainer.load_model()`` call).
      3. Print the ``do_train`` / ``do_eval`` flags.
      4. If ``args.do_train`` is set, run ``trainer.train()``.
      5. If ``args.do_eval`` is set, call ``trainer.load_model()`` and log
         the metrics for the dev split and then the test split.
    """
    init_logger()
    set_seed(args)
    tokenizer = load_tokenizer(args)

    splits = {name: AcrDataset(args, tokenizer, name) for name in ('train', 'dev', 'test')}
    trainer = Trainer(args, splits['train'], splits['dev'], splits['test'])

    def report(split, banner):
        # Evaluate one split, attach the loss and log every metric sorted by name.
        loss, sample_ids, expansions, scores = trainer.evaluate(split)
        metrics = compute_metrics(args, sample_ids, expansions, scores)
        metrics['loss'] = loss
        logger.info(banner)
        for name in sorted(metrics):
            logger.info("  %s = %s", name, str(metrics[name]))

    report('test', "***** Eval results *****")

    print(f"Anh Viet Pham do_train: {args.do_train}")

    print(f"Anh Viet Pham do_eval: {args.do_eval}")

    if args.do_train:
        print("Anh Viet Pham do_train")
        trainer.train()

    if not args.do_eval:
        return

    trainer.load_model()
    report('dev', "***** Dev results *****")
    report('test', "***** Test results *****")

In [None]:

# Command-line configuration for the run, followed by the call to main().
parser = argparse.ArgumentParser()

# --- Paths and file names -------------------------------------------------
# parser.add_argument("--task", default=None, required=True, type=str, help="The name of the task to train")
parser.add_argument("--model_dir", default="/content/drive/MyDrive/Luận Văn Thạc Sĩ/data-vihealbert/model-save", type=str, help="Path to save, load model")
parser.add_argument("--data_dir", default="/content/drive/MyDrive/Luận Văn Thạc Sĩ/data-vihealbert/acrDrAid/data", type=str, help="The input data dir")
parser.add_argument("--data_file_name", default="data.json", type=str, help="The input data name")
parser.add_argument("--gold_file_name", default="gold.json", type=str, help="The gold file name")
parser.add_argument("--dict_file_name", default="dictionary.json", type=str, help="The dictionary file name")

# --- Model and optimisation hyper-parameters ------------------------------
parser.add_argument(
    "--model_type",
    default="phobert",
    type=str,
    help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
)
parser.add_argument("--tuning_metric", default="macro_f1", type=str, help="Metrics to tune when training")
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument("--train_batch_size", default=32, type=int, help="Batch size for training.")
parser.add_argument("--eval_batch_size", default=64, type=int, help="Batch size for evaluation.")
parser.add_argument(
    "--max_seq_len", default=256, type=int, help="The maximum total input sequence length after tokenization."
)
parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
parser.add_argument(
    "--num_train_epochs", default=100.0, type=float, help="Total number of training epochs to perform."
)
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument(
    "--gradient_accumulation_steps",
    type=int,
    default=1,
    help="Number of updates steps to accumulate before performing a backward/update pass.",
)
parser.add_argument("--adam_epsilon", default=1e-9, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
    "--max_steps",
    default=-1,
    type=int,
    help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--dropout_rate", default=0.1, type=float, help="Dropout for fully-connected layers")

parser.add_argument("--logging_steps", type=int, default=200, help="Log every X updates steps.")
parser.add_argument("--save_steps", type=int, default=200, help="Save checkpoint every X updates steps.")

# --- Run modes --------------------------------------------------------------
parser.add_argument("--do_train", action="store_true", default=False, help="Whether to run training.")
# Evaluation is on by default. A store_true flag whose default is already True
# can never be switched off, so --no_eval (same dest) is the way to disable it.
parser.add_argument(
    "--do_eval",
    action="store_true",
    default=True,
    help="Whether to run eval on the dev and test sets (enabled by default; use --no_eval to disable).",
)
parser.add_argument(
    "--no_eval",
    dest="do_eval",
    action="store_false",
    help="Skip the evaluation on the dev and test sets.",
)

parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")

parser.add_argument(
    "--token_level",
    type=str,
    default="word-level",
    help="Tokens are at syllable level or word level (Vietnamese) [word-level, syllable-level]",
)
parser.add_argument(
    "--early_stopping",
    type=int,
    default=25,
    help="Number of unincreased validation step to wait for early stopping",
)
parser.add_argument("--gpu_id", type=int, default=0, help="Select gpu id")

# init pretrained
parser.add_argument("--pretrained", action="store_true", help="Whether to init model from pretrained base model")
parser.add_argument("--pretrained_path", default="./workspace/vinbrain/vutran/Transfer_Learning/Domain_Adaptive/Finetune/WSD/src/XLMr_ADvn/1e-5/42/", type=str, help="The pretrained model path")

parser.add_argument(
    "--threshold", default=0.5, type=float, help="Threshold"
)

# NOTE(review): presumably absorbs the `-f <connection file>` argument that a
# Jupyter/Colab kernel puts in sys.argv, so parse_args() does not reject it.
parser.add_argument('-f')

args = parser.parse_args()

# Derived settings that are not exposed as command-line options.
args.model_name_or_path = MODEL_PATH_MAP[args.model_type]
args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
main(args)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing ViHnBERT: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing ViHnBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViHnBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViHnBERT were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense_1.bias', 'classifier.

GPU ID : 0
Cuda device: 0
Classification Accuracy: 0.4544130498106612
Anh Viet Pham do_train: False
Anh Viet Pham do_eval: True
Classification Accuracy: 0.921968787515006
Classification Accuracy: 0.8598893096417128
