In [None]:
!pip install transformers
import pandas as pd 
import numpy as np
import re
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sn
from matplotlib.pyplot import figure
from collections import Counter
from transformers import PhobertTokenizer
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
from transformers import AutoModel, AutoTokenizer


In [None]:
def process(file_path: str) : 
    with open(file_path) as f:
        text = f.readlines()

        data=[]
        for sent in text:
            sent = sent.replace('\n','')
            sent = sent.replace(',','')
            data.append(sent)

        for i in range(len(data)):
            data[i] = data[i].split(' ')

        return data

def load_label(file_path:str):
    with open(file_path) as f:
        text = f.readlines()

        labels = []
        for label in text:
            label = label.replace('\n','')
            labels.append(label)

        return labels

In [None]:
data_train_path = '/content/drive/MyDrive/PhoATIS/syllable-level/train/seq.in'
data_dev_path = '/content/drive/MyDrive/PhoATIS/syllable-level/dev/seq.in'
data_test_path = '/content/drive/MyDrive/PhoATIS/syllable-level/test/seq.in'

label_train_path = '/content/drive/MyDrive/PhoATIS/syllable-level/train/label'
label_test_path = '/content/drive/MyDrive/PhoATIS/syllable-level/test/label'
label_dev_path = '/content/drive/MyDrive/PhoATIS/syllable-level/dev/label'

In [None]:
data_train = process(data_train_path)
data_dev = process(data_dev_path)
data_test = process(data_test_path)

labels_train = load_label(label_train_path)
labels_test = load_label(label_test_path)
labels_dev = load_label(label_dev_path)

In [None]:
label_list = sorted(list(set(labels_train + labels_test + labels_dev)))
label_dict = {v: k for k,v in enumerate(label_list)}

In [None]:
y_train = [label_dict[v] for v in labels_train]
y_test = [label_dict[v] for v in labels_test]
y_dev = [label_dict[v] for v in labels_dev]

y_train = np.array(y_train)
y_dev = np.array(y_dev)
y_test = np.array(y_test)
# y_train = to_categorical(y_train)
# y_dev = to_categorical(y_dev)

In [None]:
from transformers import BertTokenizer,BertModel


In [None]:
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def tokenize(sentence):
    batch = tokenizer(sentence, is_split_into_words=True,
                      padding=True,
                      truncation=True,
                      return_tensors='pt')
    return batch

In [None]:
token_train = tokenize(data_train)
token_test = tokenize(data_test)
token_dev = tokenize(data_dev)

In [None]:
y_train = torch.tensor(y_train)
y_test = torch.tensor(y_test)
y_dev = torch.tensor(y_dev)

In [None]:
train_dataset = TensorDataset(token_train['input_ids'],token_train['attention_mask'],y_train)
test_dataset = TensorDataset(token_test['input_ids'], token_test['attention_mask'], y_test)
dev_dataset = TensorDataset(token_dev['input_ids'], token_dev['attention_mask'], y_dev)

In [None]:
train_batch_size = 32
val_batch_size = 64

In [None]:
train_dataloader = DataLoader(train_dataset,
                              batch_size = train_batch_size)

test_dataloader = DataLoader(test_dataset,
                             batch_size = val_batch_size)

val_dataloader = DataLoader(dev_dataset,
                            batch_size = val_batch_size)

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
model = BertForSequenceClassification.from_pretrained('vinai/phobert-base', num_labels=len(label_list))

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing BertForSequenceClassification: ['roberta.encoder.layer.11.output.LayerNorm.bias', 'roberta.encoder.layer.11.attention.self.value.bias', 'roberta.encoder.layer.6.attention.self.query.bias', 'roberta.encoder.layer.5.attention.output.dense.weight', 'roberta.encoder.layer.2.attention.output.dense.weight', 'roberta.encoder.layer.9.attention.output.dense.bias', 'roberta.encoder.layer.10.attention.output.dense.bias', 'roberta.encoder.layer.9.output.LayerNorm.weight', 'roberta.encoder.layer.7.attention.self.key.bias', 'roberta.encoder.layer.9.output.dense.weight', 'roberta.encoder.layer.4.attention.self.key.bias', 'roberta.encoder.layer.10.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer

In [None]:
model.to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from transformers import AdamW

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    F1_score = f1_score(pred_flat, labels_flat, average='weighted')
    
    return accuracy_score(pred_flat, labels_flat), F1_score

In [None]:
import random
from tqdm import tqdm_notebook
device = 'cuda'
epochs = 10

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)


for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_loss = 0
    model.train()
    train_accuracy = 0
    nb_train_steps = 0
    train_f1 = 0
    
    for step, batch in tqdm_notebook(enumerate(train_dataloader)):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask, 
            labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_train_accuracy, tmp_train_f1 = flat_accuracy(logits, label_ids)
        train_accuracy += tmp_train_accuracy
        train_f1 += tmp_train_f1
        nb_train_steps += 1
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
    print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
    print(" Average training loss: {0:.4f}".format(avg_train_loss))

    print("Running Validation...")
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_f1 = 0
    for batch in tqdm_notebook(val_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_eval_accuracy, tmp_eval_f1 = flat_accuracy(logits, label_ids)

            eval_accuracy += tmp_eval_accuracy
            eval_f1 += tmp_eval_f1
            nb_eval_steps += 1
    print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
    print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))
print("Training complete!")

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.7677
 F1 score: 0.8263
 Average training loss: 0.9597
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.7939
 F1 score: 0.8301
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.8810
 F1 score: 0.9060
 Average training loss: 0.4879
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.8733
 F1 score: 0.8930
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9154
 F1 score: 0.9312
 Average training loss: 0.3384
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.8903
 F1 score: 0.9082
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9335
 F1 score: 0.9468
 Average training loss: 0.2552
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.9118
 F1 score: 0.9215
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9504
 F1 score: 0.9581
 Average training loss: 0.1917
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.9099
 F1 score: 0.9113
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9569
 F1 score: 0.9629
 Average training loss: 0.1633
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.9352
 F1 score: 0.9364
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9665
 F1 score: 0.9695
 Average training loss: 0.1329
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.9600
 F1 score: 0.9670
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9752
 F1 score: 0.9774
 Average training loss: 0.1019
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.9663
 F1 score: 0.9727
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9783
 F1 score: 0.9798
 Average training loss: 0.0899
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.9600
 F1 score: 0.9664
Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

 Accuracy: 0.9804
 F1 score: 0.9825
 Average training loss: 0.0779
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/8 [00:00<?, ?it/s]

 Accuracy: 0.9639
 F1 score: 0.9694
Training complete!


In [None]:
    model.eval()
    print('Testing..............................................')
    test_loss, test_accuracy = 0, 0
    nb_test_steps, nb_test_examples = 0, 0
    test_f1 = 0
    for batch in tqdm_notebook(test_dataloader):

        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask)
            logits = outputs[0]
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            tmp_test_accuracy, tmp_test_f1 = flat_accuracy(logits, label_ids)

            test_accuracy += tmp_test_accuracy
            test_f1 += tmp_test_f1
            nb_test_steps += 1
    print(" Accuracy: {0:.4f}".format(test_accuracy/nb_test_steps))
    print(" F1 score: {0:.4f}".format(test_f1/nb_test_steps))

Testing..............................................


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/14 [00:00<?, ?it/s]

 Accuracy: 0.9520
 F1 score: 0.9594


In [None]:
output_dir = '/content/drive/MyDrive/PhoATIS/Model/Bert_model'

In [None]:
model.save_pretrained(output_dir)