In [1]:
import os
from functools import partial
import paddle
import paddle.nn.functional as F
from paddlenlp.metrics import AccuracyAndF1
from paddlenlp.datasets import load_dataset
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import SkepTokenizer, SkepModel, LinearDecayWithWarmup
import numpy as np
import random
import json

def read_json(data_path):
    """Yield one classification example per opinion stored in a JSON file.

    The file is expected to hold a list of samples, each with a "text" entry
    and an "opinions" list. For every opinion, the first element of its
    'Polar_expression' entry is joined with single spaces to form the target
    text (presumably the text fragments of the expression -- confirm against
    the dataset), and its "Polarity" is mapped to an integer id through the
    module-level ``label2id`` mapping, which must exist before iteration.

    Args:
        data_path: Path of the UTF-8 encoded JSON file to read.

    Yields:
        dict: ``{"label": int, "target_text": str, "text": str}``.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        # The file stays open while the generator is being consumed.
        for sample in json.load(f):
            sentence = sample["text"]
            for opinion in sample['opinions']:
                target = " ".join(opinion['Polar_expression'][0])
                polarity_id = label2id[opinion["Polarity"]]
                yield {
                    "label": int(polarity_id),
                    "target_text": target,
                    "text": sentence,
                }

def load_dict(dict_path):
    """Load a one-entry-per-line vocabulary file into forward and reverse maps.

    Each non-blank line (after stripping surrounding whitespace) becomes one
    entry whose id is its position among the kept lines. Blank or
    whitespace-only lines are skipped so that a stray empty line in the file
    does not create a bogus '' entry and inflate the number of labels.

    Args:
        dict_path: Path of the UTF-8 encoded dictionary file.

    Returns:
        tuple: ``(word2id, id2word)`` where ``word2id`` maps each entry to its
        integer id and ``id2word`` is the inverse mapping.
    """
    with open(dict_path, "r", encoding="utf-8") as f:
        # Iterate the file object directly instead of materialising readlines().
        stripped_lines = (line.strip() for line in f)
        words = [word for word in stripped_lines if word]
    word2id = dict(zip(words, range(len(words))))
    id2word = dict((v, k) for k, v in word2id.items())
    print(f"word2id: {word2id}; id2word: {id2word}")
    return word2id, id2word

# Locations of the three data splits and of the label dictionary.
train_path, dev_path, test_path = (
    "./data/opener_en_raw/train.json",
    "./data/opener_en_raw/dev.json",
    "./data/opener_en_raw/test.json",
)
label_path = "./data/opener_en_relations/label.dict"

# The label mapping must exist before read_json runs, because read_json
# looks up `label2id` at module level.
label2id, id2label = load_dict(label_path)

# Build the splits in train / dev / test order with lazy=False.
train_ds, dev_ds, test_ds = (
    load_dataset(read_json, data_path=split_path, lazy=False)
    for split_path in (train_path, dev_path, test_path)
)

# Show the first few training examples as a sanity check.
for example in train_ds[:6]:
    print(example)

word2id: {'Negative': 0, 'Positive': 1}; id2word: {0: 'Negative', 1: 'Positive'}
{'label': 1, 'target_text': 'the most wonderful part of the trip', 'text': 'Yes , it really was a great experience and we visited various places but the most wonderful part of the trip was our stay at the Oberoi Udaivilas Luxury Hotel .'}
{'label': 1, 'target_text': 'how grand looks', 'text': 'I can ’t explain in words how grand this place looks .'}
{'label': 1, 'target_text': 'unique blend of the old world royal charm and the modern luxuries', 'text': 'It is a unique blend of the old world royal charm and the modern luxuries .'}
{'label': 1, 'target_text': 'definitely going again', 'text': 'I ’m definitely going there again whenever I get a chance .'}
{'label': 0, 'target_text': 'Bit pricey', 'text': 'Bit pricey and but away from center'}
{'label': 0, 'target_text': 'away from center', 'text': 'Bit pricey and but away from center'}


In [3]:
def convert_example_to_feature(example, tokenizer, label2id, max_seq_len=512, is_test=False):
    """Tokenize one example as a (target_text, text) pair and return its features.

    Args:
        example: Mapping with "target_text" and "text" entries, plus "label"
            when ``is_test`` is false.
        tokenizer: Callable invoked with the target text, ``text_pair``,
            ``max_seq_len`` and ``return_length=True``; its result must expose
            "input_ids", "token_type_ids" and "seq_len".
        label2id: Accepted for interface compatibility; not used here because
            the example already carries an integer label.
        max_seq_len: Forwarded unchanged to the tokenizer.
        is_test: When true, the label is not read and not returned.

    Returns:
        tuple: ``(input_ids, token_type_ids, seq_len)`` for test examples,
        otherwise the same three values followed by the label.
    """
    encoding = tokenizer(
        example["target_text"],
        text_pair=example["text"],
        max_seq_len=max_seq_len,
        return_length=True,
    )

    if is_test:
        return encoding["input_ids"], encoding["token_type_ids"], encoding["seq_len"]

    gold_label = example["label"]
    return encoding["input_ids"], encoding["token_type_ids"], encoding["seq_len"], gold_label


# Pretrained checkpoint name plus batching / sequence-length settings.
model_name = "skep_ernie_2.0_large_en"
batch_size, max_seq_len = 8, 512

tokenizer = SkepTokenizer.from_pretrained(model_name)

# Bind everything except the example itself so the function can be mapped.
trans_func = partial(
    convert_example_to_feature,
    tokenizer=tokenizer,
    label2id=label2id,
    max_seq_len=max_seq_len,
)

# Convert the splits in train / dev / test order; the dataset names are
# rebound to the mapped results.
train_ds, dev_ds, test_ds = (
    split.map(trans_func, lazy=False) for split in (train_ds, dev_ds, test_ds)
)

# print examples
for example in train_ds[:5]:
    print("input_ids: ", example[0])
    print("token_type_ids: ", example[1])
    print("seq_len: ", example[2])
    print("label: ", example[3])
    print()

[32m[2023-04-24 19:39:56,460] [    INFO][0m - Found /home/christophe/.paddlenlp/models/skep_ernie_2.0_large_en/skep_ernie_2.0_large_en.vocab.txt[0m


input_ids:  [101, 1996, 2087, 6919, 2112, 1997, 1996, 4440, 102, 2748, 1010, 2009, 2428, 2001, 1037, 2307, 3325, 1998, 2057, 4716, 2536, 3182, 2021, 1996, 2087, 6919, 2112, 1997, 1996, 4440, 2001, 2256, 2994, 2012, 1996, 15578, 26692, 20904, 4886, 14762, 3022, 9542, 3309, 1012, 102]
token_type_ids:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
seq_len:  45
label:  1

input_ids:  [101, 2129, 2882, 3504, 102, 1045, 2064, 1521, 1056, 4863, 1999, 2616, 2129, 2882, 2023, 2173, 3504, 1012, 102]
token_type_ids:  [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
seq_len:  19
label:  1

input_ids:  [101, 4310, 12586, 1997, 1996, 2214, 2088, 2548, 11084, 1998, 1996, 2715, 28359, 9496, 2229, 102, 2009, 2003, 1037, 4310, 12586, 1997, 1996, 2214, 2088, 2548, 11084, 1998, 1996, 2715, 28359, 9496, 2229, 1012, 102]
token_type_ids:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 

In [4]:
def batchify_fn(samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    Stack(dtype="int64"),
    Stack(dtype="int64"),
)):
    """Collate a list of samples into batch fields.

    The default ``fn`` is built once, at definition time. It applies one
    collator per field position: Pad for fields 0 and 1 (input_ids and
    token_type_ids in this notebook), Stack as int64 for fields 2 and 3
    (seq_len and label).
    """
    return fn(samples)


# One batch sampler and one loader per split; only the training split is shuffled.
train_batch_sampler = paddle.io.BatchSampler(train_ds, batch_size=batch_size, shuffle=True)
train_loader = paddle.io.DataLoader(train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn)

dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=batch_size, shuffle=False)
dev_loader = paddle.io.DataLoader(dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn)

test_batch_sampler = paddle.io.BatchSampler(test_ds, batch_size=batch_size, shuffle=False)
test_loader = paddle.io.DataLoader(test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn)


class SkepForSequenceClassification(paddle.nn.Layer):
    """Sequence classifier: a SKEP encoder, dropout, then a linear head.

    Args:
        skep: Encoder layer. Its ``config`` is indexed for "hidden_size" and,
            when ``dropout`` is None, for "hidden_dropout_prob". Calling it
            must return a pair whose second item is the pooled output.
        num_classes: Output width of the linear head.
        dropout: Dropout probability; None falls back to the encoder's
            "hidden_dropout_prob" setting.
    """

    def __init__(self, skep, num_classes=2, dropout=None):
        super().__init__()
        self.num_classes = num_classes
        self.skep = skep
        if dropout is None:
            dropout = self.skep.config["hidden_dropout_prob"]
        self.dropout = paddle.nn.Dropout(dropout)
        self.classifier = paddle.nn.Linear(self.skep.config["hidden_size"], num_classes)

    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
        """Return the classification logits computed from the pooled encoder output."""
        # Only the second encoder output (the pooled one) feeds the head.
        _, pooled_output = self.skep(
            input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
        )
        return self.classifier(self.dropout(pooled_output))

def set_seed(seed):
    """Seed Paddle, Python's ``random`` and NumPy (in that order) with ``seed``."""
    for seed_rng in (paddle.seed, random.seed, np.random.seed):
        seed_rng(seed)

# model hyperparameter setting
num_epoch = 3
learning_rate = 3e-5
weight_decay = 0.01
warmup_proportion = 0.1
max_grad_norm = 1.0
log_step = 20      # print the training loss every `log_step` steps
eval_step = 100    # evaluate on the dev split every `eval_step` steps
seed = 1000
checkpoint = "./checkpoint/"

set_seed(seed)
# startswith() already returns a bool, so no conditional expression is needed.
use_gpu = paddle.get_device().startswith("gpu")
print('use_gpu: ', use_gpu)
if use_gpu:
    paddle.set_device("gpu:0")
# makedirs(exist_ok=True) avoids the exists()/mkdir() race and also creates
# missing parent directories if `checkpoint` is ever changed to a nested path.
os.makedirs(checkpoint, exist_ok=True)

skep = SkepModel.from_pretrained(model_name)
model = SkepForSequenceClassification(skep, num_classes=len(label2id))

# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
num_training_steps = len(train_loader) * num_epoch
lr_scheduler = LinearDecayWithWarmup(learning_rate=learning_rate, total_steps=num_training_steps, warmup=warmup_proportion)
# Weight decay applies to every parameter whose attribute name contains neither
# "bias" nor "norm". A set keeps the per-parameter membership test O(1).
decay_params = {p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"])}
grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm)
optimizer = paddle.optimizer.AdamW(learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=weight_decay, apply_decay_param_fun=lambda x: x in decay_params, grad_clip=grad_clip)

metric = AccuracyAndF1()

[32m[2023-04-24 19:40:01,532] [    INFO][0m - Already cached /home/christophe/.paddlenlp/models/skep_ernie_2.0_large_en/skep_ernie_2.0_large_en.pdparams[0m


use_gpu:  True


W0424 19:40:01.534180 41826 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 8.6, Driver API Version: 12.1, Runtime API Version: 11.7
W0424 19:40:01.536700 41826 gpu_resources.cc:91] device: 0, cuDNN Version: 8.8.


In [5]:
def evaluate(model, data_loader, metric):
    """Score ``model`` on every batch of ``data_loader`` using ``metric``.

    The model is switched to eval mode and is left in that mode on return, so
    callers that continue training must call ``model.train()`` again. The
    metric is reset before the first batch.

    Args:
        model: Callable taking ``input_ids`` and a ``token_type_ids`` keyword
            and returning logits.
        data_loader: Iterable of 4-item batches
            ``(input_ids, token_type_ids, <ignored>, labels)``.
        metric: Object providing ``reset``, ``compute``, ``update`` and
            ``accumulate``; ``accumulate`` must return five values, of which
            the last is discarded.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``.
    """
    model.eval()
    metric.reset()
    # Evaluation never back-propagates, so turn gradient tracking off for the
    # forward passes instead of recording them needlessly.
    with paddle.no_grad():
        for batch_data in data_loader:
            input_ids, token_type_ids, _, labels = batch_data
            logits = model(input_ids, token_type_ids=token_type_ids)
            correct = metric.compute(logits, labels)
            metric.update(correct)

    accuracy, precision, recall, f1, _ = metric.accumulate()

    return accuracy, precision, recall, f1

def train():
    """Fine-tune the notebook-level ``model`` and checkpoint the best dev F1.

    Takes no arguments: the model, data loaders, optimizer, scheduler, metric
    and hyperparameters (``num_epoch``, ``num_training_steps``, ``log_step``,
    ``eval_step``, ``checkpoint``) are all read from module-level names.

    Every ``log_step`` steps the current loss is printed. Every ``eval_step``
    steps, and on the final step, the model is evaluated on ``dev_loader``;
    whenever F1 improves on the best value so far, the weights are saved as
    ``best_cls.pdparams`` inside ``checkpoint``. After the last epoch the
    final weights are saved as ``final_cls.pdparams`` in the same directory.
    """
    # global_step is 1-based, so the modulo checks below need no "> 0" guard.
    global_step, best_f1 = 1, 0.
    model.train()
    for epoch in range(1, num_epoch+1):
        for batch_data in train_loader():
            input_ids, token_type_ids, _, labels = batch_data
            # logits: (batch_size, num_classes) -- one score vector per example
            logits = model(input_ids, token_type_ids=token_type_ids)
            loss = F.cross_entropy(logits, labels)

            loss.backward()
            lr_scheduler.step()
            optimizer.step()
            optimizer.clear_grad()

            if global_step % log_step == 0:
                print(f"epoch: {epoch} - global_step: {global_step}/{num_training_steps} - loss:{loss.numpy().item():.6f}")
            if global_step % eval_step == 0 or global_step == num_training_steps:
                accuracy, precision, recall, f1  = evaluate(model, dev_loader,  metric)
                # evaluate() leaves the model in eval mode; switch back before the next batch.
                model.train()
                if f1 > best_f1:
                    print(f"best F1 performence has been updated: {best_f1:.5f} --> {f1:.5f}")
                    best_f1 = f1
                    # os.path.join avoids the doubled separator that plain string
                    # formatting produces when `checkpoint` already ends in "/".
                    paddle.save(model.state_dict(), os.path.join(checkpoint, "best_cls.pdparams"))
                print(f'evalution result: accuracy:{accuracy:.5f} precision: {precision:.5f}, recall: {recall:.5f},  F1: {f1:.5f}')

            global_step += 1

    paddle.save(model.state_dict(), os.path.join(checkpoint, "final_cls.pdparams"))

# Run fine-tuning; loss logging, dev-set evaluation and checkpoint saving all happen inside train().
train()

epoch: 1 - global_step: 20/1083 - loss:0.818317
epoch: 1 - global_step: 40/1083 - loss:0.390278
epoch: 1 - global_step: 60/1083 - loss:0.355876
epoch: 1 - global_step: 80/1083 - loss:0.119800
epoch: 1 - global_step: 100/1083 - loss:0.001870
best F1 performence has been updated: 0.00000 --> 0.95470
evalution result: accuracy:0.93500 precision: 0.94483, recall: 0.96479,  F1: 0.95470
epoch: 1 - global_step: 120/1083 - loss:0.017386
epoch: 1 - global_step: 140/1083 - loss:0.477388
epoch: 1 - global_step: 160/1083 - loss:0.800743
epoch: 1 - global_step: 180/1083 - loss:0.004216
epoch: 1 - global_step: 200/1083 - loss:0.612047
evalution result: accuracy:0.93000 precision: 0.93243, recall: 0.97183,  F1: 0.95172
epoch: 1 - global_step: 220/1083 - loss:0.001192
epoch: 1 - global_step: 240/1083 - loss:0.035013
epoch: 1 - global_step: 260/1083 - loss:0.126476
epoch: 1 - global_step: 280/1083 - loss:0.006750
epoch: 1 - global_step: 300/1083 - loss:0.000322
evalution result: accuracy:0.93250 precis

In [6]:
# Restore the weights stored under the "best" checkpoint name into a freshly
# built classifier.
model_path = "./checkpoint/best_cls.pdparams"

loaded_state_dict = paddle.load(model_path)
skep = SkepModel.from_pretrained(model_name)
model = SkepForSequenceClassification(skep, num_classes=len(label2id))
model.load_dict(loaded_state_dict)

# Score the restored model on the test split and report the four metrics.
test_scores = evaluate(model, test_loader, metric)
accuracy, precision, recall, f1 = test_scores
print(f'evalution result: accuracy:{accuracy:.5f} precision: {precision:.5f}, recall: {recall:.5f},  F1: {f1:.5f}')

[32m[2023-04-24 19:42:13,882] [    INFO][0m - Already cached /home/christophe/.paddlenlp/models/skep_ernie_2.0_large_en/skep_ernie_2.0_large_en.pdparams[0m


evalution result: accuracy:0.96647 precision: 0.98296, recall: 0.96812,  F1: 0.97549
