In [1]:
import os

import mindspore
from mindspore.dataset import text, GeneratorDataset, transforms
from mindspore import nn, context

from mindnlp.transforms import PadTransform
from mindnlp.transforms.tokenizers import BertTokenizer

from mindnlp.engine import Trainer, Evaluator
from mindnlp.engine.callbacks import CheckpointCallback, BestModelCallback
from mindnlp.metrics import Accuracy

In [2]:
# prepare dataset
class SentimentDataset:
    """Sentiment Dataset"""

    def __init__(self, path):
        self.path = path
        self._labels, self._text_a = [], []
        self._load()

    def _load(self):
        with open(self.path, "r", encoding="utf-8") as f:
            dataset = f.read()
        lines = dataset.split("\n")
        for line in lines[1:-1]:
            
            try:
                label, text_a = line.split("\t")
                #print(label)
                #print(text_a)
                self._labels.append(int(label))
                self._text_a.append(text_a)
            except:
                pass

    def __getitem__(self, index):
        return self._labels[index], self._text_a[index]

    def __len__(self):
        return len(self._labels)

In [None]:
# download dataset
!wget https://baidu-nlp.bj.bcebos.com/emotion_detection-dataset-1.0.0.tar.gz -O emotion_detection.tar.gz
!tar xvf emotion_detection.tar.gz

In [3]:
def process_dataset(source, tokenizer, pad_value, max_seq_len=64, batch_size=32, shuffle=True):
    column_names = ["label", "text_a"]
    rename_columns = ["label", "input_ids"]
    
    dataset = GeneratorDataset(source, column_names=column_names, shuffle=shuffle)
    # transforms
    pad_op = PadTransform(max_seq_len, pad_value=pad_value)
    type_cast_op = transforms.TypeCast(mindspore.int32)
    
    # map dataset
    dataset = dataset.map(operations=[tokenizer, pad_op], input_columns="text_a")
    dataset = dataset.map(operations=[type_cast_op], input_columns="label")
    # rename dataset
    dataset = dataset.rename(input_columns=column_names, output_columns=rename_columns)
    # batch dataset
    dataset = dataset.batch(batch_size)

    return dataset

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
pad_value = tokenizer.token_to_id('[PAD]')
print(type(pad_value))

<class 'int'>


In [5]:
dataset_train = process_dataset(SentimentDataset("data/new_new_train_data.tsv"), tokenizer, pad_value)
dataset_val = process_dataset(SentimentDataset("data/new_new_valid_data.tsv"), tokenizer, pad_value)
dataset_test = process_dataset(SentimentDataset("data/new_new_test_data.tsv"), tokenizer, pad_value, shuffle=False)

In [7]:
from mindnlp.models import BertForSequenceClassification
from mindnlp._legacy.amp import auto_mixed_precision

# set bert config and define parameters for training
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=17)
model = auto_mixed_precision(model, 'O1')

loss = nn.CrossEntropyLoss()
optimizer = nn.Adam(model.trainable_params(), learning_rate=2e-5)

metric = Accuracy()

# define callbacks to save checkpoints
ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='bert_emotect', epochs=1, keep_checkpoint_max=2)
best_model_cb = BestModelCallback(save_path='checkpoint', ckpt_name='bert_emotect_best', auto_load=True)

trainer = Trainer(network=model, train_dataset=dataset_train,
                  eval_dataset=dataset_val, metrics=metric,
                  epochs=5, loss_fn=loss, optimizer=optimizer, callbacks=[ckpoint_cb, best_model_cb],
                  jit=True)

  0%|          | 0.00/454M [00:00<?, ?B/s]

['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.layer_norm.gamma', 'cls.predictions.transform.layer_norm.beta', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']


In [8]:
# start training
trainer.run('label')

The train will start from the checkpoint saved in 'checkpoint'.


  0%|          | 0/313 [00:00<?, ?it/s]

Checkpoint: 'bert_emotect_epoch_0.ckpt' has been saved in epoch: 0.


  0%|          | 0/32 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.8088088088088088}
---------------Best Model: 'bert_emotect_best.ckpt' has been saved in epoch: 0.---------------


  0%|          | 0/313 [00:00<?, ?it/s]

Checkpoint: 'bert_emotect_epoch_1.ckpt' has been saved in epoch: 1.


  0%|          | 0/32 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.8408408408408409}
---------------Best Model: 'bert_emotect_best.ckpt' has been saved in epoch: 1.---------------


  0%|          | 0/313 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'bert_emotect_epoch_2.ckpt' has been saved in epoch: 2.


  0%|          | 0/32 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.8418418418418419}
---------------Best Model: 'bert_emotect_best.ckpt' has been saved in epoch: 2.---------------


  0%|          | 0/313 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'bert_emotect_epoch_3.ckpt' has been saved in epoch: 3.


  0%|          | 0/32 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.8418418418418419}


  0%|          | 0/313 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'bert_emotect_epoch_4.ckpt' has been saved in epoch: 4.


  0%|          | 0/32 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.8358358358358359}
Loading best model from 'checkpoint' with '['Accuracy']': [0.8418418418418419]...
---------------The model is already load the best model from 'bert_emotect_best.ckpt'.---------------


In [6]:
from mindnlp.models import BertForSequenceClassification
from mindnlp._legacy.amp import auto_mixed_precision

# set bert config and define parameters for training
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=17)
model = auto_mixed_precision(model, 'O1')


param_dict = mindspore.load_checkpoint("./checkpoint/bert_emotect_best.ckpt")
param_not_load = mindspore.load_param_into_net(model, param_dict)
print(param_not_load)

['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.layer_norm.gamma', 'cls.predictions.transform.layer_norm.beta', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']


[]


In [9]:
evaluator = Evaluator(network=model, eval_dataset=dataset_test, metrics=metric)
evaluator.run(tgt_columns="label")

  0%|          | 0/32 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.8388388388388388}


In [7]:
dataset_infer = SentimentDataset("data/new_new_infer100_data.tsv")

In [15]:
def predict(text, label=None):
    label_map = {   
                    0: "news_story",
                    1: "news_culture",
                    2: "news_entertainment",
                    3: "news_entertainment",
                    4: "news_sports",
                    6: "news_house",
                    7: "news_car",
                    8: "news_edu",
                    9: "news_tech",
                    10: "news_military",
                    12: "news_travel",
                    13: "news_world",
                    14: "stock",
                    15: "news_agriculture",
                    16: "news_game"
                }

    max_seq_len=64
    if len(text) < max_seq_len:
        text = text.ljust(max_seq_len, '0')
    else:
        text = text[:max_seq_len]
    
    text_tokenized = Tensor([tokenizer.encode(text).ids])
    logits = model(text_tokenized)
    predict_label = logits[0].asnumpy().argmax()
    info = f"inputs: '{text}', predict: '{label_map[predict_label]}'"
    if label is not None:
        info += f" , label: '{label_map[label]}'"
    print(info)

In [16]:
from mindspore import Tensor

for label, text in dataset_infer:
    predict(text, label)

inputs: 'Aibee获亿级行业投资，拓展精准零售新业态000000000000000000000000000000000000000000', predict: 'news_sports' , label: 'news_tech'
inputs: '名图换代车型lafesta 是否能超越当年的伊兰特000000000000000000000000000000000000000', predict: 'news_car' , label: 'news_car'
inputs: '尚雯婕出席活动，网友：每次你的打扮都让人眼前一亮，个性范十足0000000000000000000000000000000000', predict: 'news_entertainment' , label: 'news_entertainment'
inputs: '习近平主席讲话在解放军武警部队反响热烈000000000000000000000000000000000000000000000', predict: 'news_military' , label: 'news_military'
inputs: '一颗原子弹能够摧毁一支航母舰队吗？专家说出实话，你可能都不信0000000000000000000000000000000000', predict: 'news_military' , label: 'news_military'
inputs: '小龙虾火了，汽车品牌都不淡定了0000000000000000000000000000000000000000000000000', predict: 'news_car' , label: 'news_car'
inputs: '紧急应对618大促，中小型卖家如何快速弯道超车，精耕细作打爆款！00000000000000000000000000000000', predict: 'news_tech' , label: 'news_tech'
inputs: '椰视频｜椰岛自由行——三亚潜水梦之旅0000000000000000000000000000000000000000000000', predict: 'news_travel' , label: 'news_travel'
inputs: '男生读护校有前途吗？0

In [None]:
predict("家人们咱就是说一整个无语住了 绝绝子叠buff")