In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ner-covid-19/dev_word.conll
/kaggle/input/ner-covid-19/train_word.conll
/kaggle/input/ner-covid-19/test_word.conll
/kaggle/input/covid19-ner/test_word.json
/kaggle/input/covid19-ner/dev_word.json
/kaggle/input/covid19-ner/train_word.json


In [2]:
!pip install torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


In [3]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn

from sklearn import preprocessing
from sklearn import model_selection
from torcheval.metrics.functional import multiclass_f1_score
from transformers import get_linear_schedule_with_warmup

from tqdm import tqdm
from transformers import AdamW

In [4]:
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 16
    VALID_BATCH_SIZE = 16
    EPOCH = 20
    SEED = 42
    BASE_MODEL = "vinai/phobert-base-v2"
    TRAIN = "/kaggle/input/ner-covid-19/train_word.conll"

    TOKENIZE = transformers.AutoTokenizer.from_pretrained(
        BASE_MODEL,
        do_lower_case = True
    )
torch.manual_seed(config.SEED)
torch.cuda.manual_seed(config.SEED)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [5]:
def read_conll(file_path):
    sentences = []
    sentence = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == "":  
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                sentence.append(line.split()) 
    if sentence:
        sentences.append(sentence)

    return sentences

file_path = '/kaggle/input/ner-covid-19/train_word.conll'
sentences = read_conll(file_path)

print(sentences[:2])


[[['Đồng_thời', 'O'], [',', 'O'], ['bệnh_viện', 'O'], ['tiếp_tục', 'O'], ['thực_hiện', 'O'], ['các', 'O'], ['biện_pháp', 'O'], ['phòng_chống', 'O'], ['dịch_bệnh', 'O'], ['COVID', 'O'], ['-', 'O'], ['19', 'O'], ['theo', 'O'], ['hướng_dẫn', 'O'], ['của', 'O'], ['Bộ', 'B-ORGANIZATION'], ['Y_tế', 'I-ORGANIZATION'], ['.', 'O']], [['"', 'O'], ['Số', 'O'], ['bệnh_viện', 'O'], ['có_thể', 'O'], ['tiếp_nhận', 'O'], ['bệnh_nhân', 'O'], ['bị', 'O'], ['sốt', 'B-SYMPTOM_AND_DISEASE'], ['cao', 'I-SYMPTOM_AND_DISEASE'], ['và', 'O'], ['khó', 'B-SYMPTOM_AND_DISEASE'], ['thở', 'I-SYMPTOM_AND_DISEASE'], ['đang', 'O'], ['giảm', 'O'], ['dần', 'O'], ['"', 'O'], [',', 'O'], ['thông_cáo', 'O'], ['có', 'O'], ['đoạn', 'O'], [',', 'O'], ['cảnh_báo', 'O'], ['những', 'O'], ['bệnh_nhân', 'O'], ['này', 'O'], ['thay', 'O'], ['vào', 'O'], ['đó', 'O'], ['được', 'O'], ['chuyển', 'O'], ['tới', 'O'], ['các', 'O'], ['phòng_khám', 'O'], ['khẩn_cấp', 'O'], [',', 'O'], ['khiến', 'O'], ['những', 'O'], ['bệnh_nhân', 'O'], ['mắc'

In [6]:
import pandas as pd

def conll_to_dataframe(sentences, columns):
    data = []
    for sentence in sentences:
        for token in sentence:
            data.append(dict(zip(columns, token)))

    return pd.DataFrame(data)

columns = ["Word", "NER"]

df = conll_to_dataframe(sentences, columns)

df.head(20)


Unnamed: 0,Word,NER
0,Đồng_thời,O
1,",",O
2,bệnh_viện,O
3,tiếp_tục,O
4,thực_hiện,O
5,các,O
6,biện_pháp,O
7,phòng_chống,O
8,dịch_bệnh,O
9,COVID,O


In [7]:
df["NER"].value_counts()

NER
O                        104750
B-LOCATION                 5398
I-LOCATION                 5242
B-PATIENT_ID               3240
B-DATE                     2549
I-ORGANIZATION             2545
I-DATE                     2500
I-SYMPTOM_AND_DISEASE      1552
B-SYMPTOM_AND_DISEASE      1439
B-ORGANIZATION             1137
B-AGE                       682
B-GENDER                    542
B-NAME                      349
B-TRANSPORTATION            226
B-JOB                       205
I-TRANSPORTATION             67
I-JOB                        62
I-NAME                       13
I-PATIENT_ID                 11
I-AGE                         2
Name: count, dtype: int64

In [8]:
print(len(df))

132511


In [9]:
df["NER"] = df["NER"].replace(np.nan,"nan")
df = df[df["NER"].str.isupper()]

uppercase_rows = df["Word"].notna()&df["Word"].str.isupper()
df = df[~uppercase_rows]
print(len(df))
df.head(15)

129760


Unnamed: 0,Word,NER
0,Đồng_thời,O
1,",",O
2,bệnh_viện,O
3,tiếp_tục,O
4,thực_hiện,O
5,các,O
6,biện_pháp,O
7,phòng_chống,O
8,dịch_bệnh,O
10,-,O


In [10]:
    sentence_number = 1
    sentence_column = []
    
    for index,row in df.iterrows():
        if row["Word"] in {".","!","?"}:
            sentence_number += 1
        sentence_column.append(f"Sentence: {sentence_number}")
    df["Sentence #"] = sentence_column
    df = df.reset_index(drop=True)

In [11]:
df

Unnamed: 0,Word,NER,Sentence #
0,Đồng_thời,O,Sentence: 1
1,",",O,Sentence: 1
2,bệnh_viện,O,Sentence: 1
3,tiếp_tục,O,Sentence: 1
4,thực_hiện,O,Sentence: 1
...,...,...,...
129755,",",O,Sentence: 4978
129756,kết_quả,O,Sentence: 4978
129757,nghi,O,Sentence: 4978
129758,nhiễm,O,Sentence: 4978


In [12]:
class COVIDDataset:
    def __init__(self,word,ner):
        self.word = word
        self.ner = ner
    def __len__(self):
        return len(self.word)
    def __getitem__(self,item):
        word = self.word[item]
        ner = self.ner[item]

        ids = []
        target_ner = []
    
        for idx,str in enumerate(word):
            inputs = config.TOKENIZE.encode(str,add_special_tokens = False)
            input_len = len(inputs)
            ids.extend(inputs)
            target_ner.extend([ner[idx]] * input_len)
    
        ids = ids[:(config.MAX_LEN - 2)]
        target_ner = target_ner[:(config.MAX_LEN - 2)]
    
        ids = [0] + ids + [2]
        target_ner = [1] + target_ner +[1]
    
        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)
    
        padding_len = config.MAX_LEN - len(ids)
    
        ids = ids + [1]*padding_len
        mask = mask +[0]*padding_len
        token_type_ids = token_type_ids + [0]*padding_len
        target_ner = target_ner + [1]*padding_len
    
        return {
            "ids":torch.tensor(ids, dtype = torch.long),
            "mask":torch.tensor(mask, dtype = torch.long),
            "token_type_ids":torch.tensor(token_type_ids, dtype = torch.long),
            "target_ner":torch.tensor(target_ner, dtype = torch.long)
        }

In [13]:
def process_data(df):
    enc_ner = preprocessing.LabelEncoder()
    # Các attribute của LabelEncoder():
    # fit(): Tạo ra một list liệt kê các NER
    # transform(): Biến đổi list các NER thành các số nguyên
    # inverse_transform(): Biến đổi ngược các số nguyên thành list các NER
    # fit_transform(): là bước kết hợp fit và transform
    df.loc[:,"NER"] = enc_ner.fit_transform(df["NER"])

    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    ner = df.groupby("Sentence #")["NER"].apply(list).values

    return sentences, ner, enc_ner

In [14]:
def train_fn(data_loader,model,optimizer,device,scheduler):
    model.train()
    final_loss = 0
    for data in tqdm(data_loader):
        ids = data["ids"].to(device)
        mask = data["mask"].to(device)
        token_type_ids = data["token_type_ids"].to(device)
        target_ner = data["target_ner"].to(device)
        optimizer.zero_grad()
        _,loss,acc,_ = model(ids, mask, token_type_ids, target_ner)
        loss.backward()
        optimizer.step()
        scheduler.step()
        final_loss += loss.item()
    return final_loss/len(data_loader), acc

def eval_fn(data_loader,model,device):
    model.eval()
    final_loss = 0
    for data in tqdm(data_loader):
        ids = data["ids"].to(device)
        mask = data["mask"].to(device)
        token_type_ids = data["token_type_ids"].to(device)
        target_ner = data["target_ner"].to(device)
        _,loss,acc,f1 = model(ids, mask, token_type_ids, target_ner)
        final_loss +=loss.item()
    return final_loss/len(data_loader),acc,f1

In [15]:
    # output: Tensor đầu ra từ mô hình, kích thước là [batch_size,max_len,num_labels]
    #         Mỗi token có một vector logit đại diện cho xác suất thuộc về từng nhãn 
    # target: Tensor chứa nhãn thực tế, kích thước [batch_size, max_len]
    # mask : Tensor đánh dấu giá trị có thực hay không, 0 là các padding còn 1 là các token thật
    #         kích thước là [batch_size,max_len]
    # num_labels: Số lượng nhãn mà mô hình có thể dự đoán
    
def loss_fn(output, target, mask, num_labels):
        lfn = nn.CrossEntropyLoss(ignore_index=1)
        #Tất cả nhãn dán bằng 1 sẽ bị bỏ qua khi tính loss, 
        #trong bài toán NER thì [1] thường là các padđing trong ids
        active_loss = mask.view(-1) == 1
        # Chuyển ma trận kích thước batch_size, max_len thành ma trận có kích thước batch_size x max_len 
        # Khi so sánh với == 1 sẽ trả về một ma trận boolean với True là token hợp lệ và ngược lại
        active_logits = output.view(-1, num_labels)  #Logit của các token hợp lệ
        # Output đầu ra có kích thước batch_size,max_len,num_labels, qua biến đổi sẽ có kích thước là 
        # batch_size x max_len, num_labels
        active_labels = torch.where(
            active_loss,
            target.view(-1),#Lấy nhãn nếu token hợp lệ
            torch.tensor(1).type_as(target) #Gãn nhãn là padding nếu token không hợp lệ
        )
        # tôi muốn hỏi là như vậy thì hàm CrossEntropy khi làm với các bài toán NER sẽ được tính bằng active logits và active labels à ?
        # Active loss là một tensor có dạng [True, True, False, True]
        # Biến đổi target thành ma trận 1 chiều
        # Tạo ra một tensor có kích thước bằng target nhưng toàn là số một 
        # Dùng torch.where để lấy ra kết quả theo tensor của active_loss nếu là True thì lấy giá trị của
        # target còn nếu là False sẽ lấy kết quả của torch.tensor(1) tức là 1
        loss = lfn(active_logits, active_labels)
        return loss

def acc_fn(output,target,mask,num_labels):
        _,predicted = torch.max(output,2)
        #Chọn nhãn có xác suất cao nhất từ output theo chiều thứ 2 Ví dụ có 
        # tensor [
    #     [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3]],
    #     [[0.6, 0.2, 0.2], [0.1, 0.7, 0.2]]
    # ] 
        # sẽ trả về kết quả là [[1,1],[0,1]]
        correct = (predicted == target) & (target != 1) & (mask == 1)
        # So sánh nhãn dự đoán với nhãn thực tế, trả về True nếu trùng
        # Chỉ xét các token là hợp lệ là các token khác [1] giống với các padding trong ids của bài toán NER
        # Chỉ xét các token là các token hợp lệ trong mask và thực hiện toán tử là AND để lấy các toán tử đúng
        # Ví dụ có tensor: 
        # predicted = torch.tensor([[0, 1], [1, 0]])
        # target = torch.tensor([[0, 1], [1, 1]])
        # mask = torch.tensor([[1, 1], [1, 0]])
        # Kết quả sẽ là [[True,False],[False,False]]
        num_correct = correct.sum().item()
        # đếm tổng các số lượng True trong correct 
        # Kết quả là tổng số token được dự đoán đúng trong batch
        num_samples = target[(target != 1)& (mask == 1)].size()[0]
        # Lọc ra các token hợp lệ trong thực tế (loại bỏ các token là [1](các token là padding) và lấy các token hợp lệ của mask)
        # size()[0] dùng để tính tổng các token hợp lệ
        accuracy = num_correct/num_samples if num_samples > 0 else 0.0
        return accuracy
    
def f1_score_fn(output,target,mask,num_labels):
        active_loss = mask.view(-1) == 1
        active_logits = output.view(-1,num_labels)
        active_labels = torch.where(
            active_loss,
            target.view(-1),
            torch.tensor(1).type_as(target)
        )
        f1 = multiclass_f1_score(active_logits,active_labels,num_classes = num_labels,average = "macro")
        # Công thức hàm f1 score gồm input, target, num_class và average
        return f1

In [16]:
sentences,ner,enc_ner = process_data(df)

In [17]:
for label,encode_value in zip(enc_ner.classes_,enc_ner.transform(enc_ner.classes_)):
    print(f"{label} -> {encode_value}")

B-AGE -> 0
B-DATE -> 1
B-GENDER -> 2
B-JOB -> 3
B-LOCATION -> 4
B-NAME -> 5
B-ORGANIZATION -> 6
B-PATIENT_ID -> 7
B-SYMPTOM_AND_DISEASE -> 8
B-TRANSPORTATION -> 9
I-AGE -> 10
I-DATE -> 11
I-JOB -> 12
I-LOCATION -> 13
I-NAME -> 14
I-ORGANIZATION -> 15
I-PATIENT_ID -> 16
I-SYMPTOM_AND_DISEASE -> 17
I-TRANSPORTATION -> 18
O -> 19


In [18]:
num_ner = len(enc_ner.classes_)
print(num_ner)

20


In [19]:
(train_sentences,valid_sentences,train_ner,valid_ner) = model_selection.train_test_split(sentences,ner,random_state = 42, train_size=0.8)

In [20]:
train_dataset = COVIDDataset(train_sentences,train_ner)
train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size = config.TRAIN_BATCH_SIZE,shuffle=True)
valid_dataset = COVIDDataset(valid_sentences,valid_ner)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset,batch_size = config.VALID_BATCH_SIZE,shuffle=True)

In [21]:
class COVIDModel(nn.Module):
    def __init__(self,num_ner):
        super(COVIDModel,self).__init__()
        self.num_ner = num_ner
        self.bert = transformers.AutoModel.from_pretrained(config.BASE_MODEL)
        self.dropout = nn.Dropout(0.1)
        self.out_ner = nn.Linear(768,self.num_ner)
    def forward(self,ids,mask,token_type_ids,target_ner):
        outputs = self.bert(
            ids,
            attention_mask = mask,
            token_type_ids = token_type_ids
        )
        o1 = outputs["last_hidden_state"]
        bo_ner = self.dropout(o1)
        
        ner = self.out_ner(bo_ner)
        
        loss_ner = loss_fn(ner, target_ner.to(ner.device), mask.to(ner.device), self.num_ner)

        acc_ner = acc_fn(ner,target_ner,mask,self.num_ner)

        f1 = f1_score_fn(ner,target_ner,mask,self.num_ner)

        return ner,loss_ner,acc_ner,f1

In [22]:
model = COVIDModel(num_ner)
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_steps = int(
    len(train_sentences) / config.TRAIN_BATCH_SIZE * config.EPOCH
)
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0, 
    num_training_steps=num_train_steps
)

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
train_losses = []
valid_losses = []
best_loss = np.inf
for epoch in range(config.EPOCH):
    train_loss, train_acc = train_fn(train_dataloader,model,optimizer,device,scheduler)
    valid_loss, valid_acc, f1_score = eval_fn(valid_dataloader,model,device)
    print(f"Epoch: {epoch+1} - Train Loss: {train_loss} - Train Accuracy: {train_acc}")
    print(f"Epoch: {epoch+1} - Valid Loss: {valid_loss} - Valid Accuracy: {valid_acc} - F1 score: {f1_score}")
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    if valid_loss<best_loss:
        best_loss = valid_loss

  4%|▎         | 9/249 [00:02<00:57,  4.19it/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

100%|██████████| 249/249 [00:55<00:00,  4.50it/s]
100%|██████████| 63/63 [00:04<00:00, 13.90it/s]


Epoch: 1 - Train Loss: 0.8829921184773426 - Train Accuracy: 0.9436936936936937
Epoch: 1 - Valid Loss: 0.4260216137719533 - Valid Accuracy: 0.9333333333333333 - F1 score: 0.4358093738555908


100%|██████████| 249/249 [00:53<00:00,  4.63it/s]
100%|██████████| 63/63 [00:04<00:00, 14.11it/s]


Epoch: 2 - Train Loss: 0.34014162858087854 - Train Accuracy: 0.9727272727272728
Epoch: 2 - Valid Loss: 0.2291064943586077 - Valid Accuracy: 0.984 - F1 score: 0.6908043026924133


100%|██████████| 249/249 [00:53<00:00,  4.62it/s]
100%|██████████| 63/63 [00:04<00:00, 13.77it/s]


Epoch: 3 - Train Loss: 0.21007658134263205 - Train Accuracy: 0.976303317535545
Epoch: 3 - Valid Loss: 0.16005880693121563 - Valid Accuracy: 0.9827586206896551 - F1 score: 0.5819923877716064


100%|██████████| 249/249 [00:54<00:00,  4.53it/s]
100%|██████████| 63/63 [00:04<00:00, 13.38it/s]


Epoch: 4 - Train Loss: 0.15606335771969523 - Train Accuracy: 0.9682151589242054
Epoch: 4 - Valid Loss: 0.1298980818144859 - Valid Accuracy: 0.9591836734693877 - F1 score: 0.4420969486236572


100%|██████████| 249/249 [00:54<00:00,  4.54it/s]
100%|██████████| 63/63 [00:04<00:00, 13.96it/s]


Epoch: 5 - Train Loss: 0.12224419845874052 - Train Accuracy: 0.9895833333333334
Epoch: 5 - Valid Loss: 0.1176900038170436 - Valid Accuracy: 1.0 - F1 score: 0.799248993396759


100%|██████████| 249/249 [00:54<00:00,  4.59it/s]
100%|██████████| 63/63 [00:04<00:00, 13.78it/s]


Epoch: 6 - Train Loss: 0.10175339074199458 - Train Accuracy: 0.9957716701902748
Epoch: 6 - Valid Loss: 0.09763523386347861 - Valid Accuracy: 0.9847328244274809 - F1 score: 0.7044234871864319


100%|██████████| 249/249 [00:54<00:00,  4.58it/s]
100%|██████████| 63/63 [00:04<00:00, 13.89it/s]


Epoch: 7 - Train Loss: 0.08520340151995061 - Train Accuracy: 0.9797101449275363
Epoch: 7 - Valid Loss: 0.08823035142961003 - Valid Accuracy: 0.9854014598540146 - F1 score: 0.6106010675430298


100%|██████████| 249/249 [00:54<00:00,  4.58it/s]
100%|██████████| 63/63 [00:04<00:00, 14.00it/s]


Epoch: 8 - Train Loss: 0.07314040251525052 - Train Accuracy: 0.9965277777777778
Epoch: 8 - Valid Loss: 0.08035702305653739 - Valid Accuracy: 0.9733333333333334 - F1 score: 0.5231986045837402


100%|██████████| 249/249 [00:54<00:00,  4.58it/s]
100%|██████████| 63/63 [00:04<00:00, 13.96it/s]


Epoch: 9 - Train Loss: 0.0646099025181618 - Train Accuracy: 0.9966555183946488
Epoch: 9 - Valid Loss: 0.07675454999128031 - Valid Accuracy: 0.9852941176470589 - F1 score: 0.6169127821922302


100%|██████████| 249/249 [00:54<00:00,  4.59it/s]
100%|██████████| 63/63 [00:04<00:00, 14.03it/s]


Epoch: 10 - Train Loss: 0.05716112335343437 - Train Accuracy: 1.0
Epoch: 10 - Valid Loss: 0.0782344691928417 - Valid Accuracy: 0.9923664122137404 - F1 score: 0.6492846012115479


100%|██████████| 249/249 [00:54<00:00,  4.58it/s]
100%|██████████| 63/63 [00:04<00:00, 13.67it/s]


Epoch: 11 - Train Loss: 0.051991825332363927 - Train Accuracy: 0.9978118161925602
Epoch: 11 - Valid Loss: 0.0701470049245963 - Valid Accuracy: 1.0 - F1 score: 0.7562559843063354


100%|██████████| 249/249 [00:54<00:00,  4.60it/s]
100%|██████████| 63/63 [00:04<00:00, 14.09it/s]


Epoch: 12 - Train Loss: 0.046826894967311836 - Train Accuracy: 0.9969604863221885
Epoch: 12 - Valid Loss: 0.0749885563753427 - Valid Accuracy: 0.9871794871794872 - F1 score: 0.6531928181648254


100%|██████████| 249/249 [00:54<00:00,  4.59it/s]
100%|██████████| 63/63 [00:04<00:00, 13.83it/s]


Epoch: 13 - Train Loss: 0.04356471446892583 - Train Accuracy: 1.0
Epoch: 13 - Valid Loss: 0.06882507048015084 - Valid Accuracy: 1.0 - F1 score: 0.7567568421363831


100%|██████████| 249/249 [00:54<00:00,  4.57it/s]
100%|██████████| 63/63 [00:04<00:00, 13.84it/s]


Epoch: 14 - Train Loss: 0.0400531128737103 - Train Accuracy: 1.0
Epoch: 14 - Valid Loss: 0.06779371403039448 - Valid Accuracy: 1.0 - F1 score: 0.8037409782409668


100%|██████████| 249/249 [00:54<00:00,  4.55it/s]
100%|██████████| 63/63 [00:04<00:00, 13.86it/s]


Epoch: 15 - Train Loss: 0.03771661606852908 - Train Accuracy: 0.994475138121547
Epoch: 15 - Valid Loss: 0.06786170406710534 - Valid Accuracy: 1.0 - F1 score: 0.8358869552612305


100%|██████████| 249/249 [00:54<00:00,  4.58it/s]
100%|██████████| 63/63 [00:04<00:00, 14.01it/s]


Epoch: 16 - Train Loss: 0.03582265368666515 - Train Accuracy: 0.9946236559139785
Epoch: 16 - Valid Loss: 0.066983746468193 - Valid Accuracy: 0.9716981132075472 - F1 score: 0.643653929233551


100%|██████████| 249/249 [00:54<00:00,  4.59it/s]
100%|██████████| 63/63 [00:04<00:00, 13.91it/s]


Epoch: 17 - Train Loss: 0.03449504709162985 - Train Accuracy: 1.0
Epoch: 17 - Valid Loss: 0.06677097005266992 - Valid Accuracy: 0.9791666666666666 - F1 score: 0.7072843313217163


100%|██████████| 249/249 [00:54<00:00,  4.59it/s]
100%|██████████| 63/63 [00:04<00:00, 13.95it/s]


Epoch: 18 - Train Loss: 0.0332948810132931 - Train Accuracy: 0.9975429975429976
Epoch: 18 - Valid Loss: 0.06626613410041919 - Valid Accuracy: 1.0 - F1 score: 0.7600763440132141


100%|██████████| 249/249 [00:54<00:00,  4.60it/s]
100%|██████████| 63/63 [00:04<00:00, 14.09it/s]


Epoch: 19 - Train Loss: 0.031683393982220365 - Train Accuracy: 0.9908814589665653
Epoch: 19 - Valid Loss: 0.06678225233086518 - Valid Accuracy: 0.978021978021978 - F1 score: 0.7071689963340759


100%|██████████| 249/249 [00:54<00:00,  4.59it/s]
100%|██████████| 63/63 [00:04<00:00, 13.94it/s]

Epoch: 20 - Train Loss: 0.03148315157591698 - Train Accuracy: 0.992
Epoch: 20 - Valid Loss: 0.0661979949929648 - Valid Accuracy: 0.9927536231884058 - F1 score: 0.48771199584007263





In [24]:
# Lưu lại trọng số của mô hình
torch.save(model.state_dict(), "model_weights.pth")

In [25]:
# Load lại mô hình
model.load_state_dict(torch.load("model_weights.pth"))

  model.load_state_dict(torch.load("model_weights.pth"))


<All keys matched successfully>

In [26]:
model

COVIDModel(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [27]:
file_path_test = "/kaggle/input/ner-covid-19/test_word.conll"
df_test = conll_to_dataframe(read_conll(file_path_test),columns)
df_test

Unnamed: 0,Word,NER
0,Từ,O
1,24,B-DATE
2,-,I-DATE
3,7,I-DATE
4,đến,O
...,...,...
85673,nhiệt_đới,I-LOCATION
85674,trung_ương,I-LOCATION
85675,cơ_sở,I-LOCATION
85676,Đông_Anh,I-LOCATION


In [28]:
df_test["NER"] = df_test["NER"].replace(np.nan,'nan')
df_test = df_test[df_test["NER"].str.isupper()]

df_test = df_test[~uppercase_rows].reset_index(drop=True)
df_test

  df_test = df_test[~uppercase_rows].reset_index(drop=True)


Unnamed: 0,Word,NER
0,Từ,O
1,24,B-DATE
2,-,I-DATE
3,7,I-DATE
4,đến,O
...,...,...
83923,nhiệt_đới,I-LOCATION
83924,trung_ương,I-LOCATION
83925,cơ_sở,I-LOCATION
83926,Đông_Anh,I-LOCATION


In [29]:
sentence_number_test = 1
sentence_column_test = []

for index,row in df_test.iterrows():
    if row["Word"] in {".","!","?"}:
        sentence_number_test += 1
    sentence_column_test.append(f"Sentence: {sentence_number_test}")
df_test["Sentence #"] = sentence_column_test
df_test = df_test.reset_index(drop=True)
df_test

Unnamed: 0,Word,NER,Sentence #
0,Từ,O,Sentence: 1
1,24,B-DATE,Sentence: 1
2,-,I-DATE,Sentence: 1
3,7,I-DATE,Sentence: 1
4,đến,O,Sentence: 1
...,...,...,...
83923,nhiệt_đới,I-LOCATION,Sentence: 2901
83924,trung_ương,I-LOCATION,Sentence: 2901
83925,cơ_sở,I-LOCATION,Sentence: 2901
83926,Đông_Anh,I-LOCATION,Sentence: 2901


In [30]:
df_test["NER"].value_counts()

NER
O                        62271
I-LOCATION                4809
B-LOCATION                4349
I-ORGANIZATION            1979
B-PATIENT_ID              1963
I-DATE                    1716
B-DATE                    1622
I-SYMPTOM_AND_DISEASE     1438
B-SYMPTOM_AND_DISEASE     1115
B-ORGANIZATION             758
B-AGE                      565
B-GENDER                   452
B-NAME                     310
B-TRANSPORTATION           189
B-JOB                      167
I-JOB                      112
I-TRANSPORTATION            68
I-PATIENT_ID                26
I-NAME                      13
I-AGE                        6
Name: count, dtype: int64

In [31]:
def processing_data(df):
    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    ner = df.groupby("Sentence #")["NER"].apply(list).values
    return sentences, ner

In [32]:
# Xử lí các ner không tồn tại trong ner train nếu không sẽ bị gặp lỗi unseenlabel
df_test_copy = df_test.copy()

enc_ner_unknown = preprocessing.LabelEncoder()
enc_ner_unknown.classes_ = np.append(enc_ner.classes_,"unknown")

unseen_ner = set(df_test_copy["NER"]) -  set(enc_ner.classes_)
df_test_copy.loc[:,"NER"] = df_test_copy["NER"].replace(unseen_ner,"unknown")

df_test_copy.loc[:,"NER"] = enc_ner_unknown.fit_transform(df_test_copy["NER"])

sentences_test, ner_test = processing_data(df_test_copy)
test_dataset = COVIDDataset(sentences_test,ner_test)

In [33]:
# model = COVIDModel(num_ner)
# model.load_state_dict(torch.load("model_weights.pth"))
# model.to(device)

with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    ner_test, _, acc, f1_score = model(**data)
    predict_ner = enc_ner.inverse_transform(
        np.clip(ner_test.argmax(2).cpu().numpy().reshape(-1), 0, len(enc_ner.classes_) - 1))[:data['ids'].size(1)]
    print(print(f"Test Accuracy: {acc:.4f}, Test F1-score: {f1_score:.4f}"))

Test Accuracy: 0.9400, Test F1-score: 0.5614
None


In [37]:
df_test_new = df_test.iloc[:len(predict_ner)].copy()
df_test_new["predict_ner"] = predict_ner
df_test_new[:20]

Unnamed: 0,Word,NER,Sentence #,predict_ner
0,Từ,O,Sentence: 1,O
1,24,B-DATE,Sentence: 1,O
2,-,I-DATE,Sentence: 1,I-DATE
3,7,I-DATE,Sentence: 1,I-DATE
4,đến,O,Sentence: 1,I-DATE
5,31,B-DATE,Sentence: 1,O
6,-,I-DATE,Sentence: 1,I-DATE
7,7,I-DATE,Sentence: 1,I-DATE
8,",",O,Sentence: 1,I-DATE
9,được,O,Sentence: 1,O


In [35]:
#Test a sentence
# !pip install py_vncorenlp

  pid, fd = os.forkpty()


Collecting py_vncorenlp
  Downloading py_vncorenlp-0.1.4.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pyjnius (from py_vncorenlp)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hBuilding wheels for collected packages: py_vncorenlp
  Building wheel for py_vncorenlp (setup.py) ... [?25ldone
[?25h  Created wheel for py_vncorenlp: filename=py_vncorenlp-0.1.4-py3-none-any.whl size=4305 sha256=5ccbf4e9aa0b6a0b9ee383a5a4a8efb73e7e26249aa9a52417165fb4d5dd2c7b
  Stored in directory: /root/.cache/pip/wheels/d5/d9/bf/62632cdb007c702a0664091e92a0bb1f18a2fcecbe962d9827
Successfully built py_vncorenlp
Installing collected packages: pyjnius, py_vncorenlp
Successfully installed py_vncoren

In [46]:
# !apt-get update -qq
# !apt-get install -y openjdk-11-jdk-headless -qq > /dev/null

W: https://packages.cloud.google.com/apt/dists/gcsfuse-focal/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.
W: https://packages.cloud.google.com/apt/dists/google-fast-socket/InRelease: Key is stored in legacy trusted.gpg keyring (/etc/apt/trusted.gpg), see the DEPRECATION section in apt-key(8) for details.


In [63]:
# import py_vncorenlp

# Automatically download VnCoreNLP components from the original repository
# and save them in some local machine folder
# save_dir = "/kaggle/working/"
# py_vncorenlp.download_model(save_dir=save_dir)

# Load the word and sentence segmentation component
# rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"],save_dir=save_dir)

# output_text = rdrsegmenter.word_segment(text)

text = """Ông Nguyễn_Văn_Công đã bắt_đầu làm việc ở Bệnh_viện nhiệt_đới trung_ương."""
print(text)

tokenize_sentence = config.TOKENIZE.encode(text)
print(tokenize_sentence)
sentence_text = text.split()

text_dataset = COVIDDataset(
    word = [sentence_text],
    ner = [[1]*len(sentence_text)]
)
with torch.no_grad():
    data = text_dataset[0]
    for k, v in data.items():
        data[k] = v.to(device).unsqueeze(0)
    ner, _, _, _ = model(**data)
predicted_ner = enc_ner.inverse_transform(
                        ner.argmax(2).cpu().numpy().reshape(-1))[:len(tokenize_sentence)]
print(predicted_ner)

Ông Nguyễn_Văn_Công đã bắt_đầu làm việc ở Bệnh_viện nhiệt_đới trung_ương.
[0, 168, 34449, 14, 403, 47, 49, 25, 1089, 5167, 12965, 16820, 10838, 2]
['O' 'O' 'B-NAME' 'O' 'O' 'O' 'O' 'O' 'B-LOCATION' 'I-LOCATION'
 'I-LOCATION' 'I-LOCATION' 'I-LOCATION' 'O']
