In [1]:
import paddle
import paddlenlp as ppnlp
from paddlenlp.data import Stack,Pad,Tuple
import paddle.nn.functional as F
import numpy as np
from functools import partial

In [2]:
# Load the train / dev / test splits of the ChnSentiCorp dataset via PaddleNLP.
train_ds,dev_ds,test_ds=ppnlp.datasets.ChnSentiCorp.get_datasets(['train','dev','test'])

In [3]:
# Label names as reported by the training split; a label's position is later used as its class id.
label_list=train_ds.get_labels()

In [4]:
# Display the label names.
label_list

['0', '1']

In [5]:
# Peek at the first ten dev examples; each one is a [text, label] pair.
dev_ds[:10]

[['這間酒店環境和服務態度亦算不錯,但房間空間太小~~不宣容納太大件行李~~且房間格調還可以~~ 中餐廳的廣東點心不太好吃~~要改善之~~~~但算價錢平宜~~可接受~~ 西餐廳格調都很好~~但吃的味道一般且令人等得太耐了~~要改善之~~',
  '1'],
 ['<荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道当年我听说这本书的时候花很长时间去图书馆找和借都没能如愿,所以这次一看到当当有,马上买了,红迷们也要记得备货哦!',
  '1'],
 ['商品的不足暂时还没发现，京东的订单处理速度实在.......周二就打包完成，周五才发货...', '0'],
 ['２００１年来福州就住在这里，这次感觉房间就了点，温泉水还是有的．总的来说很满意．早餐简单了些．', '1'],
 ['不错的上网本，外形很漂亮，操作系统应该是个很大的 卖点，电池还可以。整体上讲，作为一个上网本的定位，还是不错的。', '1'],
 ['房间地毯太脏，临近火车站十分吵闹，还好是双层玻璃。服务一般，酒店门口的TAXI讲是酒店的长期合作关系，每月要交费给酒店。从酒店到机场讲得是打表147元，到了后非要200元，可能被小宰30-40元。',
  '0'],
 ['本来想没事的时候翻翻，可惜看不下去，还是和张没法比，他的书能畅销大部分还是受张的影响，对这个男人实在是没好感，不知道怎么买的，后悔', '0'],
 ['这台机外观十分好,本人喜欢,性能不错,是LED显示屏,无线网卡是: 5100AGN 无线网卡,如果装的是一条2G 800MHZ的内存就无敌了,本本发热很小,总体来说是十分值得买的,前提是这台机是4299买的.',
  '1'],
 ['全键盘带数字键的 显卡足够强大.N卡相对A卡,个人偏向N卡 GHOST XP很容易.除了指纹识别外.所有驱动都能装齐全了,指纹识别,非要在XP下使用的朋友,可以用替代驱动.贡献下驱动地址: http://dlsvr01.asus.com/pub/ASUS/nb/F9Dc/Fingerprints_XP_080530.zip (华硕官方地址,放心下吧)',
  '1'],
 ['做工很漂亮，老婆很喜欢。T4200足够了，性价比不错的机器。测试了一下很安逸。今天晚上准备TWOW溜达圈，再看看整机表现如何！', '1']]

# 数据预处理

In [6]:
# Tokenizer for the same "bert-base-chinese" checkpoint that the model cell loads.
tokenizer=ppnlp.transformers.BertTokenizer.from_pretrained("bert-base-chinese")

[2022-07-25 18:03:03,349] [    INFO] - Found /home/aistudio/.paddlenlp/models/bert-base-chinese/bert-base-chinese-vocab.txt


In [7]:
def convert_example(example,tokenizer,label_list,max_seq_length=256,is_test=False):
    """Turn one raw example into the id sequences fed to the model.

    Args:
        example: The raw text when ``is_test`` is true, otherwise a
            ``(text, label)`` pair.
        tokenizer: Object whose ``encode(text=..., max_seq_len=...)`` returns a
            mapping with ``'input_ids'`` and ``'token_type_ids'`` entries.
        label_list: Label names; a label's position in this sequence becomes
            its class id. Only consulted when ``is_test`` is false.
        max_seq_length: Forwarded to the tokenizer as ``max_seq_len``.
        is_test: Whether ``example`` comes without a label.

    Returns:
        ``(input_ids, segment_ids)`` in test mode, otherwise
        ``(input_ids, segment_ids, label)`` where ``label`` is an int64 NumPy
        array of shape ``(1,)``.

    Raises:
        KeyError: If the example's label does not occur in ``label_list``.
    """
    if is_test:
        raw_text, raw_label = example, None
    else:
        raw_text, raw_label = example

    encoding = tokenizer.encode(text=raw_text, max_seq_len=max_seq_length)
    token_ids = encoding['input_ids']
    type_ids = encoding["token_type_ids"]

    if is_test:
        return token_ids, type_ids

    # Class id of a label is its index in label_list.
    label_to_id = {name: index for index, name in enumerate(label_list)}
    label_array = np.array([label_to_id[raw_label]], dtype='int64')
    return token_ids, type_ids, label_array

In [8]:
def create_dataloader(dataset,trans_fn=None,mode='train',batch_size=1,use_gpu=False,pad_token_id=0,batchify_fn=None):
    """Wrap ``dataset`` in a ``paddle.io.DataLoader``.

    Args:
        dataset: Dataset to iterate; must offer ``apply(fn, lazy=True)`` when
            ``trans_fn`` is given.
        trans_fn: Optional per-example transform, applied lazily.
        mode: ``'train'`` enables shuffling; any other value disables it.
        batch_size: Number of examples per batch.
        use_gpu: With ``mode == 'train'``, selects
            ``paddle.io.DistributedBatchSampler`` instead of
            ``paddle.io.BatchSampler``.
        pad_token_id: Accepted but not used by this function.
        batchify_fn: Passed to the loader as ``collate_fn``.

    Returns:
        The constructed ``paddle.io.DataLoader``.
    """
    if trans_fn:
        dataset = dataset.apply(trans_fn, lazy=True)

    is_training = mode == 'train'
    if is_training and use_gpu:
        sampler_cls, shuffle = paddle.io.DistributedBatchSampler, True
    else:
        sampler_cls, shuffle = paddle.io.BatchSampler, is_training
    batch_sampler = sampler_cls(dataset=dataset, batch_size=batch_size, shuffle=shuffle)

    return paddle.io.DataLoader(
        dataset,
        batch_sampler=batch_sampler,
        return_list=True,
        collate_fn=batchify_fn,
    )

In [9]:
# Per-example transform: tokenize with a 128-token limit and attach the label id.
trans_fn = partial(
    convert_example,
    tokenizer=tokenizer,
    label_list=label_list,
    max_seq_length=128,
    is_test=False,
)

# One collator per field of a transformed example; both id sequences are padded
# with tokenizer.pad_token_id.
_batch_fields = Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment_ids
    Stack(dtype='int64'),                         # label
)


def batchify_fn(samples, fn=_batch_fields):
    """Collate a list of transformed examples into a list of batched fields."""
    return list(fn(samples))

In [10]:
# All three loaders share batch size, collate function and transform; only the
# dataset and the mode (which controls shuffling) differ.
_make_loader = partial(
    create_dataloader,
    batch_size=64,
    batchify_fn=batchify_fn,
    trans_fn=trans_fn,
)
train_loader = _make_loader(train_ds, mode='train')
dev_loader = _make_loader(dev_ds, mode='dev')
test_loader = _make_loader(test_ds, mode='test')

# 加载模型

In [11]:
# Sequence-classification BERT loaded from the "bert-base-chinese" pretrained weights, with two output classes.
model=ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese',num_classes=2)

[2022-07-25 18:03:03,402] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/bert-base-chinese/bert-base-chinese.pdparams
W0725 18:03:03.405943  1346 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0725 18:03:03.410030  1346 gpu_resources.cc:91] device: 0, cuDNN Version: 7.6.


# 模型训练

In [12]:
# Optimisation hyper-parameters.
learning_rate = 1e-5
epochs = 8
warmup_proption = 0.1  # fraction of all training steps spent warming up
weight_decay = 0.01

# The schedule length is batches-per-epoch times the number of epochs.
steps_per_epoch = len(train_loader)
num_training_steps = steps_per_epoch * epochs
num_warmup_steps = int(num_training_steps * warmup_proption)


In [13]:
def get_lr_factor(current_step):
    """Learning-rate multiplier: linear warm-up, then linear decay towards zero.

    Reads the module-level ``num_warmup_steps`` and ``num_training_steps``.

    Args:
        current_step: Step counter supplied by the scheduler.

    Returns:
        ``current_step / num_warmup_steps`` while warming up; afterwards the
        remaining fraction of the post-warm-up steps, never below ``0.0``.
        Both denominators are clamped to at least 1.
    """
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))

    steps_left = float(num_training_steps - current_step)
    decay_span = float(max(1, num_training_steps - num_warmup_steps))
    return max(0.0, steps_left / decay_span)

# Scale the base learning rate by get_lr_factor(step) at every scheduler step.
lr_scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate, lr_lambda=get_lr_factor)

In [14]:
# Names of the parameters that receive weight decay: every parameter whose
# structured name contains neither 'bias' nor 'norm'. Built once up front so the
# callback below is a cheap membership test instead of re-walking
# model.named_parameters() each time the optimizer asks about a parameter.
decay_params = {
    p.name
    for n, p in model.named_parameters()
    if not any(nd in n for nd in ['bias', 'norm'])
}

optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in decay_params,
)

In [15]:
# Cross-entropy loss over the logits, plus one accuracy metric object that is
# passed to both the training loop and evaluate().
criterion=paddle.nn.loss.CrossEntropyLoss()
metric=paddle.metric.Accuracy()

In [16]:
def evaluate(model,criterion,metric,data_loader):
    """Print the mean loss and accuracy of ``model`` over ``data_loader``.

    The model is switched to eval mode for the pass and back to train mode
    afterwards. ``metric`` is reset both before and after, so any running
    value it held is discarded.

    Args:
        model: Callable as ``model(input_ids, segment_ids)`` returning logits;
            must provide ``eval()`` and ``train()``.
        criterion: Callable as ``criterion(logits, labels)`` returning a loss
            object with a ``numpy()`` method.
        metric: Object providing ``reset``, ``compute``, ``update`` and
            ``accumulate``.
        data_loader: Iterable of ``(input_ids, segment_ids, labels)`` batches.

    Returns:
        None. The result is printed.
    """
    model.eval()
    metric.reset()
    losses=[]
    # No parameter update happens here, so run the forward passes with
    # gradient tracking disabled instead of recording a backward graph.
    with paddle.no_grad():
        for batch in data_loader:
            input_ids,segment_ids,labels=batch
            logits=model(input_ids,segment_ids)
            loss=criterion(logits,labels)
            losses.append(loss.numpy())
            correct=metric.compute(logits,labels)
            metric.update(correct)
    accu=metric.accumulate()
    print(f'eval loss:{np.mean(losses)},accu:{accu}')
    model.train()
    metric.reset()


In [17]:
# Start training: one optimizer step and one scheduler step per batch, with a
# dev-set evaluation at the end of every epoch.
global_step=0
for epoch in range(1,epochs+1):
    for step,batch in enumerate(train_loader):
        input_ids,segment_ids,labels=batch
        logits=model(input_ids,segment_ids)
        loss=criterion(logits,labels)
        # Running training accuracy; it keeps accumulating across batches until
        # evaluate() resets the shared metric at the end of the epoch.
        correct=metric.compute(logits,labels)
        metric.update(correct)
        acc=metric.accumulate()

        global_step+=1
        if global_step%50==0:
            # .item() reads the scalar whether the loss array is 0-D or has one element.
            print(f"global step:{global_step},epoch:{epoch},batch:{step},loss:{loss.numpy().item()},acc:{acc}")

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_gradients()
    evaluate(model,criterion,metric,dev_loader)


global step:50,epoch:1,batch:49,loss:0.5385169982910156,acc:0.6021875
global step:100,epoch:1,batch:99,loss:0.2923926115036011,acc:0.74328125
global step:150,epoch:1,batch:149,loss:0.3264506459236145,acc:0.795625
eval loss:0.24134096503257751,accu:0.9083333333333333
global step:200,epoch:2,batch:49,loss:0.29615771770477295,acc:0.929375
global step:250,epoch:2,batch:99,loss:0.2572271227836609,acc:0.9265625
global step:300,epoch:2,batch:149,loss:0.20216935873031616,acc:0.9296875
eval loss:0.23167826235294342,accu:0.9191666666666667
global step:350,epoch:3,batch:49,loss:0.16878321766853333,acc:0.9434375
global step:400,epoch:3,batch:99,loss:0.05547510087490082,acc:0.946875
global step:450,epoch:3,batch:149,loss:0.06831805408000946,acc:0.9509375
eval loss:0.1992219090461731,accu:0.9366666666666666
global step:500,epoch:4,batch:49,loss:0.22845739126205444,acc:0.970625
global step:550,epoch:4,batch:99,loss:0.10674230754375458,acc:0.970625
global step:600,epoch:4,batch:149,loss:0.030239656567

# 模型预测

In [18]:
def predict(model,data,tokenizer,label_map,batch_size=1):
    """Predict a label for every text in ``data``.

    Args:
        model: Callable as ``model(input_ids, segment_ids)`` returning
            per-class logits; it is switched to eval mode and left there.
        data: Iterable of raw text strings.
        tokenizer: Tokenizer passed to ``convert_example``; its
            ``pad_token_id`` is used to pad both id sequences.
        label_map: Mapping from predicted class index to the label to return.
        batch_size: Number of texts per forward pass; the last batch may be
            smaller.

    Returns:
        A list with one ``label_map`` value per input text, in input order.
    """
    # convert_example does not consult its label_list argument when is_test=True.
    examples=[
        convert_example(text,tokenizer,label_map.values(),max_seq_length=128,is_test=True)
        for text in data
    ]

    batchify_fn=lambda samples,fn=Tuple(
        Pad(axis=0,pad_val=tokenizer.pad_token_id),  # input_ids
        Pad(axis=0,pad_val=tokenizer.pad_token_id)   # segment_ids
    ):fn(samples)

    # Group the examples into consecutive batches of `batch_size`.
    batches=[]
    one_batch=[]

    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            batches.append(one_batch)
            one_batch=[]

    # Keep the trailing partial batch, if any.
    if one_batch:
        batches.append(one_batch)

    results=[]
    model.eval()

    # Inference only: run the forward passes with gradient tracking disabled.
    with paddle.no_grad():
        for batch in batches:
            input_ids,segment_ids=batchify_fn(batch)
            input_ids=paddle.to_tensor(input_ids)
            segment_ids=paddle.to_tensor(segment_ids)
            logits=model(input_ids,segment_ids)
            # Softmax is monotonic, so argmax over the logits picks the same
            # class as argmax over the probabilities would.
            idx=paddle.argmax(logits,axis=1).numpy().tolist()
            results.extend(label_map[i] for i in idx)
    return results

In [26]:
# Sample sentences to classify, and the display name for each class index.
data = ['暑期课满分,又加了一点', '你也是牛逼', "我去南湖了", "直接趟了"]
label_map = {0: '负向情绪', 1: '正向情绪'}

predictions = predict(model, data, tokenizer, label_map, batch_size=32)
# predict() returns one label per input text, in the same order.
for text, label in zip(data, predictions):
    print("预测文本:{}\n情绪标签:{}".format(text, label))

预测文本:暑期课满分,又加了一点
情绪标签:负向情绪
预测文本:你也是牛逼
情绪标签:负向情绪
预测文本:我去南湖了
情绪标签:正向情绪
预测文本:直接趟了
情绪标签:负向情绪
