In [1]:
import paddle
import numpy as np
import paddlenlp
import pandas as pd
import re
from functools import partial
from paddlenlp.transformers import LinearDecayWithWarmup

from paddlenlp.data import Stack,Tuple,Pad
import warnings
warnings.filterwarnings('ignore')

import warnings
warnings.filterwarnings('ignore')
import paddle.nn.functional as F
# from utils import evaluate
from paddlenlp.datasets import MapDataset

# Train_path = '/home/dengdan/data/test_data.csv'             # 远程服务器上面的绝对路径
# Test_path='/home/dengdan/data/test_data.csv'

Train_path = './data/data103654/train.txt'             # 远程服务器上面的绝对路径
Test_path='./data/data103654/test.txt'
Valid_path='./data/data103654/dev.txt'

Label_list=['教育', '社会', '星座', '房产', '彩票', '体育', '游戏', '财经', '股票', '时政', '科技', '娱乐', '时尚', '家居']
id2label_list=[{0: '教育'}, {1: '社会'}, {2: '星座'}, {3: '房产'}, {4: '彩票'}, {5: '体育'}, {6: '游戏'},
               {7: '财经'}, {8: '股票'}, {9: '时政'}, {10: '科技'}, {11: '娱乐'}, {12: '时尚'}, {13: '家居'}]


class MyDataSet(paddle.io.Dataset):
    def __init__(self,path):
        self.data_dict_list=self.__load_data(path)


    def __load_data(self,path):
        with open(path,'r',encoding='utf-8') as f:
            data=f.readlines()
            data_dict_list=[]
            # label_dict_list=[]
            for line in data:
                message=line.strip('\n')
                label=message[-2:]
                for i in range(len(Label_list)):
                    if Label_list[i]==label:
                        label=i
                text=message[:-3]
                temp={'text':text,'label':label}
                data_dict_list.append(temp)
        return data_dict_list

    def __getitem__(self, idx):
        return self.data_dict_list[idx]

    def __len__(self):
        return len(self.data_dict_list)



In [2]:

batch_size = 32
max_seq_len = 64
lr = 5e-5
epochs = 1
MODEL_NAME = 'ernie-tiny'

# tokenizer作用为将原始输入文本转化成模型model可以接受的输入数据形式。
tokenizer = paddlenlp.transformers.ErnieTokenizer.from_pretrained(MODEL_NAME)


# 该函数作用为将数据转换为ernie所需要的输入数据格式
def convert_example(example, tokenizer, max_seq_length=128, is_test=False):
    encoded_inputs = tokenizer(text=example['text'], max_seq_len=max_seq_length)
    input_ids = encoded_inputs['input_ids']
    token_type_ids = encoded_inputs['token_type_ids']

    if is_test:
        print('进入了test:')
        return input_ids, token_type_ids
    else:
        label = np.array([example['label']], dtype='int64')
        return input_ids, token_type_ids, label


def create_dataloader(dataset, batch_size=1,
                      batchify_fn=None, trans_fn=None):
    # trans_fn对应前边的covert_example函数，使用该函数处理每个样本为期望的格式
    if trans_fn:
        dataset = dataset.map(trans_fn)

    # 定义并初始化数据读取器
    return paddle.io.DataLoader(dataset, batch_size=batch_size,
                                shuffle=False, collate_fn=batchify_fn,
                                num_workers=1, drop_last=False, return_list=True)


trans_func = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_len)

trans_func02 = partial(
    convert_example,
    tokenizer=tokenizer,
    max_seq_length=max_seq_len,
    is_test=True)

batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    Stack(dtype='int64')
): [data for data in fn(samples)]

train_dataset = MyDataSet(Train_path)

train_set = MapDataset(train_dataset)

train_data_loader = create_dataloader(
    train_set,
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func
)


# 检测是否可以使用gpu,如果可以优先使用gpu
use_gpu = True if paddle.get_device().startswith("gpu") else False
if use_gpu:
    paddle.set_device('gpu:0')

dropout_rate = None

# 学习率预热比例
warmup_propotion = 0.1
# 权重衰减系数，类似模型正则项策略，避免模型过拟合
weight_decay = 0.01

num_training_steps = len(train_data_loader) * epochs

lr_scheduler = LinearDecayWithWarmup(lr, num_training_steps, warmup_propotion)



[2023-01-13 20:13:21,552] [    INFO] - Downloading https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/vocab.txt and saved to /home/aistudio/.paddlenlp/models/ernie-tiny
[2023-01-13 20:13:21,555] [    INFO] - Downloading vocab.txt from https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/vocab.txt
100%|██████████| 459k/459k [00:00<00:00, 2.82MB/s]
[2023-01-13 20:13:21,915] [    INFO] - tokenizer config file saved in /home/aistudio/.paddlenlp/models/ernie-tiny/tokenizer_config.json
[2023-01-13 20:13:21,918] [    INFO] - Special tokens file saved in /home/aistudio/.paddlenlp/models/ernie-tiny/special_tokens_map.json


In [3]:
# 构建网络
class ErnieForSequenceClassification(paddle.nn.Layer):
    def __init__(self, num_class=14, dropout=None):
        super(ErnieForSequenceClassification, self).__init__()
        # 加载预训练好的ernie
        self.ernie = paddlenlp.transformers.ErnieModel.from_pretrained(MODEL_NAME)
        # self.dropout=paddle.nn.Dropout(dropout)
        self.dropout = paddle.nn.Dropout(dropout if dropout is not None else self.ernie.config['hidden_dropout_prob'])
        self.classifier = paddle.nn.Linear(self.ernie.config['hidden_size'], num_class)

    def forward(self, input_ids, token_type_ids=None):
        sequence_output, pooled_output = self.ernie(
            input_ids,
            token_type_ids)
        # print("在网络里面~")
        # print('pooled_output:{}'.format(pooled_output))
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


# 加载预训练模型ERNIE
# 加载用于文本分类的fune-tuning网络
model = ErnieForSequenceClassification(num_class=14,
                                       dropout=dropout_rate)

# 定义统计指标
metric = paddle.metric.Accuracy()

optimizer = paddle.optimizer.AdamW(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=weight_decay,
    apply_decay_param_fun=lambda x: x in [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ['bias', 'norm'])
    ])


[2023-01-13 20:13:29,436] [    INFO] - Downloading https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/ernie_tiny.pdparams and saved to /home/aistudio/.paddlenlp/models/ernie-tiny
[2023-01-13 20:13:29,439] [    INFO] - Downloading ernie_tiny.pdparams from https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/ernie_tiny.pdparams
100%|██████████| 346M/346M [00:11<00:00, 32.0MB/s] 
W0113 20:13:40.858995   286 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0113 20:13:40.862972   286 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.


In [4]:
def evaluate(model, metric, data_loader):
    model.eval()
    # 每次使用测试集进行评估时，先重置掉之前的metric的累计数据，保证只是针对本次评估。
    metric.reset()
    losses = []
    # logits_list=[]
    # labels_list=[]
    acc_list = []
    total_acc = 0

    for step, batch in enumerate(data_loader):
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        # logits_list.extend(logits)
        # labels_list.extend(labels)
        loss = F.cross_entropy(input=logits, label=labels)
        loss = paddle.mean(loss)
        losses.append(loss.numpy())
        correct = metric.compute(logits, labels)
        metric.update(correct)
        acc = metric.accumulate()
        acc_list.append(acc)
        total_acc += acc

    print('eval loss:%.5f,acc:%.5f' % (np.mean(losses), acc))
    print('acc_list={}'.format(acc_list))
    print('ave_acc={}'.format(np.mean(acc_list)))
    print('total_acc={},ave_acc={}'.format(total_acc, total_acc / len(acc_list)))
    metric.reset()


def train(model):
    global_step = 0
    for epoch in range(1, epochs + 1):
        print('epoch={}'.format(epoch))
        model.train()
        for step, batch in enumerate(train_data_loader, start=1):
            if (step == 100):
                print("step={}".format(step))
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)

            loss = F.cross_entropy(input=logits, label=labels)

            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            # if step<5:
            #     print('probs.shape:{}'.format(probs.shape))
            #     pred_labels=np.argmax(probs,axis=1)
            #     true_labels=labels
            #     print('step:{},pred_labels:{},true_labels:{}'.format(step,pred_labels,true_labels))
            #     # print('step:{},correct:{}'.format(step,correct))

            metric.update(correct)
            acc = metric.accumulate()
            print('step:{},acc:{}'.format(step, acc))

            global_step += 1

            # print("global step %d,epoch:%d,batch:%d,loss:%.5f,acc:%.5f"%(
            #     global_step,epoch,step,loss,acc))
            # if global_step % 10==0:
            #     print('batch:{}'.format(batch))
            # print('probs:{}\nlabels={}'.format(probs,labels))
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

        # evaluate(model,metric,test_data_loader)



In [5]:
train(model)

step:22517,acc:0.9034993005284896
step:22518,acc:0.9035021982414069
step:22519,acc:0.9035023202628891
step:22520,acc:0.9035052175843694
step:22521,acc:0.9035067270547489
step:22522,acc:0.9035110114554658
step:22523,acc:0.903511133063979
step:22524,acc:0.9035112546616942
step:22525,acc:0.903515538290788
step:22526,acc:0.9035184342537512
step:22527,acc:0.9035199427353842
step:22528,acc:0.9035228382457386
step:22529,acc:0.9035271206001154
step:22530,acc:0.9035300155348425
step:22531,acc:0.9035342971905375
step:22532,acc:0.9035371915497958
step:22533,acc:0.903534538232814
step:22534,acc:0.9035332719446171
step:22535,acc:0.9035347792323053
step:22536,acc:0.9035362863862265
step:22537,acc:0.9035391800150863
step:22538,acc:0.9035434599343332
step:22539,acc:0.9035394205599183
step:22540,acc:0.9035423136645963
step:22541,acc:0.9035424337873209
step:22542,acc:0.9035467128027682
step:22543,acc:0.9035496051989531
step:22544,acc:0.9035511111603974
step:22545,acc:0.903551

In [6]:
# 模型保存的名称
model_name = "ernie_for_news_classification"

paddle.save(model.state_dict(), "{}.pdparams".format(model_name))
paddle.save(optimizer.state_dict(), "{}.optparams".format(model_name))
tokenizer.save_pretrained('./tokenizer')

[2023-01-13 20:25:17,383] [    INFO] - tokenizer config file saved in ./tokenizer/tokenizer_config.json
[2023-01-13 20:25:17,387] [    INFO] - Special tokens file saved in ./tokenizer/special_tokens_map.json


('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/added_tokens.json')

In [10]:
valid_dataset = MyDataSet(Valid_path)

valid_set = MapDataset(valid_dataset)
# test_set = MapDataset(test_dataset)

valid_data_loader = create_dataloader(
    valid_set,
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func
)



In [15]:
evaluate(model, metric,valid_data_loader)

eval loss:0.13060,acc:0.95774
acc_list=[0.875, 0.921875, 0.9375, 0.9296875, 0.9375, 0.9427083333333334, 0.9419642857142857, 0.94140625, 0.9444444444444444, 0.946875, 0.9488636363636364, 0.9479166666666666, 0.9495192307692307, 0.9508928571428571, 0.9520833333333333, 0.951171875, 0.9503676470588235, 0.9513888888888888, 0.9506578947368421, 0.9515625, 0.9523809523809523, 0.953125, 0.9497282608695652, 0.9505208333333334, 0.9525, 0.953125, 0.9548611111111112, 0.953125, 0.9536637931034483, 0.9520833333333333, 0.9526209677419355, 0.9541015625, 0.9545454545454546, 0.9558823529411765, 0.9553571428571429, 0.9539930555555556, 0.9552364864864865, 0.9547697368421053, 0.9543269230769231, 0.95546875, 0.9557926829268293, 0.9553571428571429, 0.9556686046511628, 0.9566761363636364, 0.9576388888888889, 0.9578804347826086, 0.9574468085106383, 0.9576822916666666, 0.9559948979591837, 0.956875, 0.9577205882352942, 0.9585336538461539, 0.9587264150943396, 0.9589120370370371, 0.9590909090909091, 0.9598214285714

In [17]:
# ------------------------------------   保存模型用于预测（推理）
from paddle.static import InputSpec
# 1.切换 eval()模式
model.eval()
# 2. 构造 InputSpec 信息
input_ids = InputSpec([32, 26],'int64')
segment_ids=InputSpec([32,26],'int64')
# 3.调用 paddle.jit.save 接口转为静态图模型
path = "model_for_predict/linear"
paddle.jit.save(
    layer=model,
    path=path,
    input_spec=[input_ids,segment_ids])

In [2]:
# ------------------------------------   加载模型用于预测
import paddle
path = "model_for_predict/linear"
loaded_model = paddle.jit.load(path)
# loaded_model.eval()


W0113 21:44:23.888254   178 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0113 21:44:23.894660   178 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.


In [None]:
batchify_fn02 = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    Stack(dtype='int64')
): [data for data in fn(samples)]


In [41]:
with open(Test_path,'r',encoding='utf-8') as f:
    data=f.readlines()
    data_dict_list=[]
    for line in data:
        message=line.strip('\n')
        text=message
        temp={'text':text}
        data_dict_list.append(temp)

In [45]:
data_dict_list[:2]

[{'text': '北京君太百货璀璨秋色 满100省353020元'}, {'text': '教育部：小学高年级将开始学习性知识'}]

In [3]:

class TestDataSet(paddle.io.Dataset):
    def __init__(self,path):
        self.data_dict_list=self.__load_data(path)


    def __load_data(self,path):
        with open(path,'r',encoding='utf-8') as f:
            data=f.readlines()
            data_dict_list=[]
            for line in data:
                message=line.strip('\n')
                text=message
                temp={'text':text}
                data_dict_list.append(temp)
        return data_dict_list

    def __getitem__(self, idx):
        return self.data_dict_list[idx]

    def __len__(self):
        return len(self.data_dict_list)




In [10]:
# tokenizer作用为将原始输入文本转化成模型model可以接受的输入数据形式。
import paddlenlp
from functools import partial
from paddlenlp.data import Stack,Tuple,Pad
N=0
max_seq_len=64


tokenizer = paddlenlp.transformers.ErnieTokenizer.from_pretrained('ernie-tiny')
# 该函数作用为将数据转换为ernie所需要的输入数据格式

def convert_example02(example, tokenizer, max_seq_length=128):
    global N
    N+=1
    encoded_inputs = tokenizer(text=example['text'], max_seq_len=max_seq_length)
    input_ids = encoded_inputs['input_ids']
    token_type_ids = encoded_inputs['token_type_ids']
    print('{},进入了test:'.format(N))
    return input_ids, token_type_ids


def create_dataloader02(dataset, batch_size=1,
                      batchify_fn=None, trans_fn=None):
    # trans_fn对应前边的covert_example函数，使用该函数处理每个样本为期望的格式
    if trans_fn:
        dataset = dataset.map(trans_fn)
    print('进入了create_dataloader02')

    # 定义并初始化数据读取器
    return paddle.io.DataLoader(dataset, batch_size=batch_size,
                                shuffle=False, collate_fn=batchify_fn,
                                num_workers=1, drop_last=False, return_list=True)


# trans_func = partial(
#     convert_example,
#     tokenizer=tokenizer,
#     max_seq_length=max_seq_len)

trans_func02 = partial(
    convert_example02,
    tokenizer=tokenizer,
    max_seq_length=max_seq_len)

batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    Stack(dtype='int64')
): [data for data in fn(samples)]

[2023-01-13 21:48:06,718] [    INFO] - Already cached /home/aistudio/.paddlenlp/models/ernie-tiny/vocab.txt
[2023-01-13 21:48:06,754] [    INFO] - tokenizer config file saved in /home/aistudio/.paddlenlp/models/ernie-tiny/tokenizer_config.json
[2023-01-13 21:48:06,757] [    INFO] - Special tokens file saved in /home/aistudio/.paddlenlp/models/ernie-tiny/special_tokens_map.json


In [31]:
with open(Test_path,'r',encoding='utf-8') as f:
            data=f.readlines()
            data_dict_list=[]
            for line in data:
                message=line.strip('\n')
                text=message
                temp={'text':text}
                data_dict_list.append(temp)

In [33]:
data_dict_list

[{'text': '北京君太百货璀璨秋色 满100省353020元'},
 {'text': '教育部：小学高年级将开始学习性知识'},
 {'text': '专业级单反相机 佳能7D单机售价9280元'},
 {'text': '星展银行起诉内地客户 银行强硬客户无奈'},
 {'text': '脱离中国的实际 强压人民币大幅升值只能是梦想'},
 {'text': '内城土地稀缺 对开发商提出更高要求(组图)'},
 {'text': '亚欧首脑会议举行第二次全体会议(组图)'},
 {'text': '荷兰主帅球迷面前表决心耍酷 罗本不在范佩西成一哥'},
 {'text': '搭配18-105VR镜头 尼康D90带票7109元'},
 {'text': '百盛购物中心美罗城店 同一专柜花120得200'},
 {'text': '陈水扁获释后首次出庭控告他人诽谤(图)'},
 {'text': '为艺术喝彩 索尼推出限量珍藏NWD-W202'},
 {'text': '新一轮价格战的开端？昂达VX530仅299元'},
 {'text': '三星数码相框迎圣诞送好礼活动促销中'},
 {'text': '魔兽33+11魔术克尼克斯 雷霆压爵士捍卫赛区榜首'},
 {'text': '杭州经济适用房被曝质量问题'},
 {'text': '欧元区在希腊援助条款上出现分歧'},
 {'text': '入门级单反相机 尼康D3100促销价4060元'},
 {'text': '盖特纳：解决希腊问题时间所剩无几'},
 {'text': '骗子冒充北京干部骗财24万'},
 {'text': '2010年韩国影展开幕 观众可体验“4D”效果(图)'},
 {'text': '亨通光电：行业景气下滑 未来仍有看点'},
 {'text': '美媒：内内下份合同值5000万 留在掘金能打大前锋'},
 {'text': '刘明康：严格实施二套房政策 防范房产金融风险'},
 {'text': '莎拉-杰西卡-帕克有意拍摄《欲望都市3》'},
 {'text': '带伤大郅仍是江苏头号公敌 主帅：防住了他就能赢'},
 {'text': '全国5.6万名军队转业干部安置完成'},
 {'text': '法国总理菲永称希望修复对华关系'},
 {'text': '欧洲三大

In [14]:
Test_path='./data/data103654/test.txt'
from paddlenlp.datasets import MapDataset
batch_size=32

# 加载测试数据集
test_dataset=TestDataSet(Test_path)
print("1ok")
test_set = MapDataset(test_dataset)
print("2ok")
test_data_loader = create_dataloader02(
    test_set,
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func02
)
print("3ok")
# test_loader = paddle.io.DataLoader(test_dataset, batch_size=64, drop_last=True)
# 将该模型及其所有子层设置为预测模式
loaded_model.eval()




1ok
2ok
进入了create_dataloader02
3ok


In [24]:
import paddle.nn.functional as F

def predict(data, batch_size=32):
    examples = []
    # 数据处理
    for text in data:
        input_ids, segment_ids = convert_example02(
            text,
            tokenizer,
            max_seq_length=128)
        examples.append((input_ids, segment_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input id
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment id
    ): fn(samples)

    # 将数据按照batch_size进行切分
    batches = []
    one_batch = []
    for example in examples:
        one_batch.append(example)
        if len(one_batch) == batch_size:
            batches.append(one_batch)
            one_batch = []
    if one_batch:
        batches.append(one_batch)

    # 使用模型预测数据，并返回结果
    results = []
    loaded_model.eval()
    for batch in batches:
        input_ids, segment_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        segment_ids = paddle.to_tensor(segment_ids)
        logits = loaded_model(input_ids, segment_ids)
        probs = F.softmax(logits, axis=1)
        idx = paddle.argmax(probs, axis=1).numpy()
        idx = idx.tolist()
        # labels = [id2label[i] for i in idx]
        results.extend(idx)
    return results


5,进入了test:
[2]


In [29]:
with open(Test_path,'r',encoding='utf-8') as f:
    test_data=f.readlines

In [34]:
with open(Test_path,'r',encoding='utf-8') as f:
            data=f.readlines()
            data_dict_list=[]
            for line in data:
                message=line.strip('\n')
                text=message
                temp={'text':text}
                data_dict_list.append(temp)

In [35]:
data_dict_list

[{'text': '北京君太百货璀璨秋色 满100省353020元'},
 {'text': '教育部：小学高年级将开始学习性知识'},
 {'text': '专业级单反相机 佳能7D单机售价9280元'},
 {'text': '星展银行起诉内地客户 银行强硬客户无奈'},
 {'text': '脱离中国的实际 强压人民币大幅升值只能是梦想'},
 {'text': '内城土地稀缺 对开发商提出更高要求(组图)'},
 {'text': '亚欧首脑会议举行第二次全体会议(组图)'},
 {'text': '荷兰主帅球迷面前表决心耍酷 罗本不在范佩西成一哥'},
 {'text': '搭配18-105VR镜头 尼康D90带票7109元'},
 {'text': '百盛购物中心美罗城店 同一专柜花120得200'},
 {'text': '陈水扁获释后首次出庭控告他人诽谤(图)'},
 {'text': '为艺术喝彩 索尼推出限量珍藏NWD-W202'},
 {'text': '新一轮价格战的开端？昂达VX530仅299元'},
 {'text': '三星数码相框迎圣诞送好礼活动促销中'},
 {'text': '魔兽33+11魔术克尼克斯 雷霆压爵士捍卫赛区榜首'},
 {'text': '杭州经济适用房被曝质量问题'},
 {'text': '欧元区在希腊援助条款上出现分歧'},
 {'text': '入门级单反相机 尼康D3100促销价4060元'},
 {'text': '盖特纳：解决希腊问题时间所剩无几'},
 {'text': '骗子冒充北京干部骗财24万'},
 {'text': '2010年韩国影展开幕 观众可体验“4D”效果(图)'},
 {'text': '亨通光电：行业景气下滑 未来仍有看点'},
 {'text': '美媒：内内下份合同值5000万 留在掘金能打大前锋'},
 {'text': '刘明康：严格实施二套房政策 防范房产金融风险'},
 {'text': '莎拉-杰西卡-帕克有意拍摄《欲望都市3》'},
 {'text': '带伤大郅仍是江苏头号公敌 主帅：防住了他就能赢'},
 {'text': '全国5.6万名军队转业干部安置完成'},
 {'text': '法国总理菲永称希望修复对华关系'},
 {'text': '欧洲三大

In [None]:
results = predict(data_dict_list[:10])
print(results)

47161,进入了test:
47162,进入了test:
47163,进入了test:
47164,进入了test:
47165,进入了test:
47166,进入了test:
47167,进入了test:
47168,进入了test:
47169,进入了test:
47170,进入了test:
47171,进入了test:
47172,进入了test:
47173,进入了test:
47174,进入了test:
47175,进入了test:
47176,进入了test:
47177,进入了test:
47178,进入了test:
47179,进入了test:
47180,进入了test:
47181,进入了test:
47182,进入了test:
47183,进入了test:
47184,进入了test:
47185,进入了test:
47186,进入了test:
47187,进入了test:
47188,进入了test:
47189,进入了test:
47190,进入了test:
47191,进入了test:
47192,进入了test:
47193,进入了test:
47194,进入了test:
47195,进入了test:
47196,进入了test:
47197,进入了test:
47198,进入了test:
47199,进入了test:
47200,进入了test:
47201,进入了test:
47202,进入了test:
47203,进入了test:
47204,进入了test:
47205,进入了test:
47206,进入了test:
47207,进入了test:
47208,进入了test:
47209,进入了test:
47210,进入了test:
47211,进入了test:
47212,进入了test:
47213,进入了test:
47214,进入了test:
47215,进入了test:
47216,进入了test:
47217,进入了test:
47218,进入了test:
47219,进入了test:
47220,进入了test:
47221,进入了test:
47222,进入了test:
47223,进入

In [37]:
print('hello,world')

hello,world


In [38]:
results = predict(data_dict_list[:10])
print(results)

83605,进入了test:
83606,进入了test:
83607,进入了test:
83608,进入了test:
83609,进入了test:
83610,进入了test:
83611,进入了test:
83612,进入了test:
83613,进入了test:
83614,进入了test:
[10, 9, 10, 8, 8, 3, 9, 5, 10, 3]
