In [1]:
import os
from functools import partial
import paddle
import paddle.nn.functional as F
from paddlenlp.metrics import ChunkEvaluator
from paddlenlp.datasets import load_dataset
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import SkepTokenizer, SkepModel, LinearDecayWithWarmup
import json
import numpy as np
import random

# Input files: the label vocabulary and the train / dev / test splits.
label_path = "./data/opener_en_extraction/label.dict"
train_path = "./data/opener_en_extraction/train.json"
dev_path = "./data/opener_en_extraction/dev.json"
test_path = "./data/opener_en_extraction/test.json"

# helpers for reading the dataset files and the label dictionary

def read_json(data_path):
    """Yield one ``{"text", "label"}`` example per record of a JSON-lines file.

    Every non-blank line must be a JSON object with the parallel sequences
    ``text``, ``sources``, ``targets`` and ``expressions``. The three tag
    sequences are merged into a single label per position: the source tag if
    it is not ``"O"``, otherwise the target tag if it is not ``"O"``,
    otherwise the expression tag.

    Args:
        data_path: Path of the UTF-8 encoded file to read.

    Yields:
        dict: ``{"text": <sample["text"]>, "label": <merged tags>}``.

    Raises:
        AssertionError: If ``text`` and the merged labels differ in length.
        IndexError: If ``targets`` or ``expressions`` is shorter than ``sources``.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (for example a stray empty line at the end of the
            # file) are not records; json.loads would raise on them.
            if not line.strip():
                continue
            sample = json.loads(line)
            sources = sample["sources"]
            targets = sample["targets"]
            expressions = sample["expressions"]

            label = []
            for i, source in enumerate(sources):
                if source != "O":
                    label.append(source)
                elif targets[i] != "O":
                    label.append(targets[i])
                else:
                    label.append(expressions[i])

            text = sample["text"]
            assert len(text) == len(label), f"{text},  {label}"
            yield {"text": text, "label": label}

def load_dict(dict_path):
    """Read a one-entry-per-line vocabulary file into forward and reverse maps.

    Each line is stripped of surrounding whitespace and numbered by its
    0-based line position.

    Args:
        dict_path: Path of the UTF-8 encoded vocabulary file.

    Returns:
        tuple: ``(word2id, id2word)`` where ``word2id`` maps each entry to its
        line index and ``id2word`` is the inverse of ``word2id``.
    """
    with open(dict_path, "r", encoding="utf-8") as f:
        entries = [line.strip() for line in f]

    word2id = {entry: index for index, entry in enumerate(entries)}
    id2word = {index: entry for entry, index in word2id.items()}
    return word2id, id2word

# Build the label lookups, then load the three splits (lazy=False).
label2id, id2label = load_dict(label_path)
train_ds, dev_ds, test_ds = [
    load_dataset(read_json, data_path=split_path, lazy=False)
    for split_path in (train_path, dev_path, test_path)
]

# Peek at the first few training examples.
for example in train_ds[:5]:
    print(example)


{'text': ['Experienced', 'staff', 'and', 'had', 'a', 'memorable', 'stay'], 'label': ['O', 'O', 'O', 'O', 'O', 'O', 'O']}
{'text': ['India', 'as', 'a', 'country', 'has', 'always', 'fascinated', 'me', 'and', 'all', 'of', 'my', 'friends', 'who', 'have', 'been', 'there', 'always', 'have', 'wonderful', 'things', 'to', 'say', 'about', 'it', '.'], 'label': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}
{'text': ['One', 'of', 'my', 'friends', 'who', 'had', 'been', 'there', 'before', 'was', 'planning', 'a', 'weeklong', 'trip', 'to', 'Rajasthan', 'in', 'India', 'and', 'I', 'decided', 'to', 'join', 'him', 'this', 'time', '.'], 'label': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}
{'text': ['Yes', ',', 'it', 'really', 'was', 'a', 'great', 'experience', 'and', 'we', 'visited', 'various', 'places', 'but', 'the', 'most', 'wond

In [2]:
def convert_example_to_feature(example, tokenizer, label2id, max_seq_len=512, is_test=False):
    """Encode one example into model inputs and, unless ``is_test``, label ids.

    Args:
        example: Mapping with a ``"text"`` entry and, when ``is_test`` is
            false, a ``"label"`` entry of tag strings.
        tokenizer: Callable invoked with ``is_split_into_words=True``,
            ``max_seq_len`` and ``return_length=True``; its result must expose
            ``"input_ids"``, ``"token_type_ids"`` and ``"seq_len"``.
        label2id: Mapping from tag string to integer id; must contain ``"O"``.
        max_seq_len: Passed to the tokenizer; labels are cut to
            ``max_seq_len - 2`` entries before the two ``"O"`` ids are added.
        is_test: When true, no labels are produced.

    Returns:
        tuple: ``(input_ids, token_type_ids, seq_len)`` when ``is_test``,
        otherwise ``(input_ids, token_type_ids, seq_len, label)``.

    Raises:
        AssertionError: If ``input_ids`` and the label list differ in length.
        KeyError: If a tag is missing from ``label2id``.
    """
    # NOTE(review): the recorded notebook output shows id 100 for every
    # capitalised word ("Experienced", "India", "Yes") -- presumably [UNK]
    # from an uncased vocab. Confirm whether the words should be lower-cased
    # before encoding.
    encoded_inputs = tokenizer(
        example["text"],
        is_split_into_words=True,
        max_seq_len=max_seq_len,
        return_length=True,
    )
    features = (encoded_inputs["input_ids"], encoded_inputs["token_type_ids"], encoded_inputs["seq_len"])
    if is_test:
        return features

    # One "O" id is placed on each side of the tag ids, presumably for the
    # special tokens the tokenizer adds around the sequence.
    outside_id = label2id["O"]
    tag_ids = [label2id[tag] for tag in example["label"]][:(max_seq_len - 2)]
    label = [outside_id, *tag_ids, outside_id]

    assert len(features[0]) == len(
        label), f"input_ids: {len(encoded_inputs['input_ids'])}, label: {len(label)}"
    return (*features, label)

# Pretrained checkpoint name and batching limits.
model_name = "skep_ernie_2.0_large_en"
batch_size = 8
max_seq_len = 512

tokenizer = SkepTokenizer.from_pretrained(model_name)
trans_func = partial(convert_example_to_feature, tokenizer=tokenizer, label2id=label2id, max_seq_len=max_seq_len)

# Encode every split; each example becomes
# (input_ids, token_type_ids, seq_len, label).
train_ds, dev_ds, test_ds = [
    split.map(trans_func, lazy=False) for split in (train_ds, dev_ds, test_ds)
]

# print examples
field_prefixes = ("input_ids: ", "token_type_ids: ", "seq_len: ", "label: ")
for example in train_ds[:5]:
    for prefix, value in zip(field_prefixes, example):
        print(prefix, value)
    print()


[32m[2023-04-24 14:41:57,143] [    INFO][0m - Found /home/christophe/.paddlenlp/models/skep_ernie_2.0_large_en/skep_ernie_2.0_large_en.vocab.txt[0m


input_ids:  [101, 100, 3095, 1998, 2018, 1037, 13432, 2994, 102]
token_type_ids:  [0, 0, 0, 0, 0, 0, 0, 0, 0]
seq_len:  9
label:  [0, 0, 0, 0, 0, 0, 0, 0, 0]

input_ids:  [101, 100, 2004, 1037, 2406, 2038, 2467, 15677, 2033, 1998, 2035, 1997, 2026, 2814, 2040, 2031, 2042, 2045, 2467, 2031, 6919, 2477, 2000, 2360, 2055, 2009, 1012, 102]
token_type_ids:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
seq_len:  28
label:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

input_ids:  [101, 100, 1997, 2026, 2814, 2040, 2018, 2042, 2045, 2077, 2001, 4041, 1037, 100, 4440, 2000, 100, 1999, 100, 1998, 100, 2787, 2000, 3693, 2032, 2023, 2051, 1012, 102]
token_type_ids:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
seq_len:  29
label:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

input_ids:  [101, 100, 1010, 2009, 2428, 2001, 1037, 2307,

In [3]:
# Per-field collation, in the order the encoded examples carry their fields.
# Labels are padded with -1, the same value the training loss passes as
# ignore_index.
_collate = Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
    Stack(dtype="int64"),                              # seq_len
    Pad(axis=0, pad_val= -1),                          # label
)


def batchify_fn(samples, fn=_collate):
    """Collate a list of encoded samples into one batch using ``fn``."""
    return fn(samples)


# Only the training split is shuffled.
train_batch_sampler, dev_batch_sampler, test_batch_sampler = [
    paddle.io.BatchSampler(split, batch_size=batch_size, shuffle=shuffle)
    for split, shuffle in ((train_ds, True), (dev_ds, False), (test_ds, False))
]

train_loader, dev_loader, test_loader = [
    paddle.io.DataLoader(split, batch_sampler=sampler, collate_fn=batchify_fn)
    for split, sampler in (
        (train_ds, train_batch_sampler),
        (dev_ds, dev_batch_sampler),
        (test_ds, test_batch_sampler),
    )
]

class SkepForTokenClassification(paddle.nn.Layer):
    """Per-token classifier: the given ``skep`` encoder, dropout, then a linear head.

    Args:
        skep: Encoder layer; its ``config`` must provide ``"hidden_size"`` and,
            when ``dropout`` is ``None``, ``"hidden_dropout_prob"``.
        num_classes: Output size of the linear head.
        dropout: Dropout probability; ``None`` falls back to the encoder's
            ``"hidden_dropout_prob"``.
    """

    def __init__(self, skep, num_classes=2, dropout=None):
        super().__init__()
        self.num_classes = num_classes
        self.skep = skep

        config = self.skep.config
        dropout_prob = config["hidden_dropout_prob"] if dropout is None else dropout
        self.dropout = paddle.nn.Dropout(dropout_prob)
        self.classifier = paddle.nn.Linear(config["hidden_size"], num_classes)

    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
        """Run the encoder and return the classifier logits for its first output."""
        encoder_outputs = self.skep(
            input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
        )
        # The encoder is expected to return exactly two values; only the first
        # (presumably the per-token hidden states -- TODO confirm) is used.
        sequence_output, _ = encoder_outputs
        return self.classifier(self.dropout(sequence_output))

def set_seed(seed):
    """Seed Paddle, Python's ``random`` module and NumPy's global RNG with ``seed``."""
    for seed_fn in (paddle.seed, random.seed, np.random.seed):
        seed_fn(seed)


# Model hyperparameters and run settings.
num_epoch = 3
learning_rate = 3e-5
weight_decay = 0.01
warmup_proportion = 0.1
max_grad_norm = 1.0
log_step = 20
eval_step = 100
seed = 1000
checkpoint = "./checkpoint/"

set_seed(seed)
use_gpu = paddle.get_device().startswith("gpu")
print("use_gpu: ", use_gpu)
if use_gpu:
    paddle.set_device("gpu:0")
# makedirs(exist_ok=True) creates missing parent directories too and does not
# fail if the directory appears between an existence check and the creation.
os.makedirs(checkpoint, exist_ok=True)

skep = SkepModel.from_pretrained(model_name)
model = SkepForTokenClassification(skep, num_classes=len(label2id))

# One scheduler step per training batch over all epochs.
num_training_steps = len(train_loader) * num_epoch
lr_scheduler = LinearDecayWithWarmup(learning_rate=learning_rate, total_steps=num_training_steps, warmup=warmup_proportion)

# Weight decay applies to every parameter whose structured name contains
# neither "bias" nor "norm".
decay_params = [p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])]
# The optimizer calls apply_decay_param_fun with a parameter name; a frozenset
# makes that membership test constant-time instead of a list scan.
_decay_param_names = frozenset(decay_params)
grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm)
optimizer = paddle.optimizer.AdamW(learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=weight_decay, apply_decay_param_fun=lambda x: x in _decay_param_names, grad_clip=grad_clip)

metric = ChunkEvaluator(label2id.keys())

[32m[2023-04-24 14:41:57,186] [    INFO][0m - Already cached /home/christophe/.paddlenlp/models/skep_ernie_2.0_large_en/skep_ernie_2.0_large_en.pdparams[0m


use_gpu:  True


W0424 14:41:57.187945 24113 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 8.6, Driver API Version: 12.1, Runtime API Version: 11.7
W0424 14:41:57.188593 24113 gpu_resources.cc:91] device: 0, cuDNN Version: 8.8.


In [4]:
def evaluate(model, data_loader, metric):
    """Score ``model`` on ``data_loader`` and return ``(precision, recall, f1)``.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` afterwards. ``metric`` is reset before
    the first batch, so it can be reused across calls.

    Args:
        model: Callable taking ``input_ids`` and ``token_type_ids`` and
            returning logits whose axis 2 indexes the tags.
        data_loader: Iterable of ``(input_ids, token_type_ids, seq_lens,
            labels)`` batches.
        metric: Object providing ``reset``, ``compute``, ``update`` and
            ``accumulate``.

    Returns:
        tuple: ``(precision, recall, f1)`` as returned by ``metric.accumulate()``.
    """
    model.eval()
    metric.reset()
    # Inference only: disable autograd so no graph is recorded for each batch.
    with paddle.no_grad():
        for input_ids, token_type_ids, seq_lens, labels in data_loader:
            logits = model(input_ids, token_type_ids=token_type_ids)

            # count metric
            predictions = logits.argmax(axis=2)
            num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(seq_lens, predictions, labels)
            metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())

    precision, recall, f1 = metric.accumulate()
    return precision, recall, f1

def train():
    """Fine-tune the module-level ``model`` on ``train_loader``.

    Every ``log_step`` steps the current loss is printed. Every ``eval_step``
    steps, and on the final step, the model is scored on ``dev_loader``; the
    weights are written to ``best_ext.pdparams`` whenever the dev F1 improves.
    After the last epoch the final weights are written to ``final_ext.pdparams``.
    """
    global_step, best_f1 = 0, 0.
    num_labels = len(label2id)
    model.train()
    for epoch in range(1, num_epoch + 1):
        for input_ids, token_type_ids, _, labels in train_loader():
            global_step += 1

            # logits: batch_size, seq_len, num_tags; both tensors are flattened
            # to one row per position, and positions labelled -1 are ignored.
            logits = model(input_ids, token_type_ids=token_type_ids)
            loss = F.cross_entropy(logits.reshape([-1, num_labels]), labels.reshape([-1]), ignore_index=-1)

            loss.backward()
            lr_scheduler.step()
            optimizer.step()
            optimizer.clear_grad()

            if global_step % log_step == 0:
                print(f"epoch: {epoch} - global_step: {global_step}/{num_training_steps} - loss:{loss.numpy().item():.6f}")

            is_eval_step = global_step % eval_step == 0
            is_last_step = global_step == num_training_steps
            if not (is_eval_step or is_last_step):
                continue

            precision, recall, f1 = evaluate(model, dev_loader, metric)
            # evaluate() leaves the model in eval mode; switch back before the next batch.
            model.train()
            if f1 > best_f1:
                print(f"best F1 performence has been updated: {best_f1:.5f} --> {f1:.5f}")
                best_f1 = f1
                paddle.save(model.state_dict(), f"{checkpoint}/best_ext.pdparams")
            print(f'evalution result: precision: {precision:.5f}, recall: {recall:.5f},  F1: {f1:.5f}')

    paddle.save(model.state_dict(), f"{checkpoint}/final_ext.pdparams")

# Run fine-tuning; loss and dev-set metrics are printed as training proceeds.
train()

epoch: 1 - global_step: 20/654 - loss:0.704549
epoch: 1 - global_step: 40/654 - loss:0.529646
epoch: 1 - global_step: 60/654 - loss:0.542912
epoch: 1 - global_step: 80/654 - loss:0.646399
epoch: 1 - global_step: 100/654 - loss:0.335785
best F1 performence has been updated: 0.00000 --> 0.45697
evalution result: precision: 0.42463, recall: 0.49464,  F1: 0.45697
epoch: 1 - global_step: 120/654 - loss:0.375430
epoch: 1 - global_step: 140/654 - loss:0.322531
epoch: 1 - global_step: 160/654 - loss:0.336831
epoch: 1 - global_step: 180/654 - loss:0.596456
epoch: 1 - global_step: 200/654 - loss:0.372007
best F1 performence has been updated: 0.45697 --> 0.55368
evalution result: precision: 0.55782, recall: 0.54960,  F1: 0.55368
epoch: 2 - global_step: 220/654 - loss:0.593220
epoch: 2 - global_step: 240/654 - loss:0.203132
epoch: 2 - global_step: 260/654 - loss:0.145418
epoch: 2 - global_step: 280/654 - loss:0.129898
epoch: 2 - global_step: 300/654 - loss:0.318746
best F1 performence has been upd

In [5]:
# Restore the weights that train() saved when the dev F1 last improved.
model_path = "./checkpoint/best_ext.pdparams"
loaded_state_dict = paddle.load(model_path)

# Rebuild the classifier from scratch and load the checkpoint into it.
skep = SkepModel.from_pretrained(model_name)
model = SkepForTokenClassification(skep, num_classes=len(label2id))
model.load_dict(loaded_state_dict)

# Score the restored model on the test split.
test_scores = evaluate(model, test_loader, metric)
precision, recall, f1 = test_scores
print(f'evalution result: precision: {precision:.5f}, recall: {recall:.5f},  F1: {f1:.5f}')


[32m[2023-04-24 14:42:49,249] [    INFO][0m - Already cached /home/christophe/.paddlenlp/models/skep_ernie_2.0_large_en/skep_ernie_2.0_large_en.pdparams[0m


evalution result: precision: 0.60681, recall: 0.63636,  F1: 0.62124
