In [1]:
import pandas as pd
# Both files are tab-separated and carry no header row, so the column
# names are assigned by hand right after each file is read.
train_data = pd.read_csv("train.csv", sep="\t", header=None)
train_data.columns = ["text", "label"]

# The test file has a single column holding the utterance text (no labels).
test_data = pd.read_csv("test.csv", sep="\t", header=None)
test_data.columns = ["text"]

In [5]:
# Give every distinct label a contiguous integer id, numbered in order of
# first appearance in the training set.
# `Series.unique()` returns values in order of appearance, so the ids match
# what a row-by-row scan would assign, without building one Series per row.
classes = train_data["label"].unique().tolist()
label_2_id = {label: class_id for class_id, label in enumerate(classes)}
# Number of classes; kept under its original name for any cell that reads it.
index = len(classes)

In [55]:
# Inverse of `label_2_id`: integer class id -> label string, used to decode
# the model's argmax predictions.
index_2_label = {class_id: label for label, class_id in label_2_id.items()}

In [21]:
# The verbalizer expects a list of candidate words per class; here each
# class is represented by exactly one word: its own label string.
label_words = [[class_name] for class_name in classes]

In [23]:
# Display the label words: one single-element list per class, in class-id order.
label_words

[['Travel-Query'],
 ['Music-Play'],
 ['FilmTele-Play'],
 ['Video-Play'],
 ['Radio-Listen'],
 ['HomeAppliance-Control'],
 ['Weather-Query'],
 ['Alarm-Update'],
 ['Calendar-Query'],
 ['TVProgram-Play'],
 ['Audio-Play'],
 ['Other']]

In [4]:
from openprompt.data_utils import InputExample
def load_local_dataset(data=None, split="train", label_map=None):
    """Convert a DataFrame of utterances into a list of ``InputExample`` objects.

    Args:
        data: DataFrame with a ``"text"`` column and, when ``split == "train"``,
            a ``"label"`` column. ``None`` yields an empty list.
        split: ``"train"`` attaches an integer label id to every example; any
            other value builds unlabeled examples.
        label_map: Mapping from label value to integer class id. When omitted,
            the notebook-level ``label_2_id`` mapping is used.

    Returns:
        A list with one ``InputExample`` per row; ``guid`` is the row position.

    Raises:
        KeyError: if a training label is not present in the label mapping.
    """
    if data is None:
        # The signature advertises None as the default; treat it as "no data"
        # rather than failing on len(None).
        return []

    if split == "train":
        if label_map is None:
            label_map = label_2_id
        # Iterate the two columns directly instead of materialising a row
        # Series via .iloc for every example.
        return [
            InputExample(guid=i, text_a=text, label=label_map[label])
            for i, (text, label) in enumerate(zip(data["text"], data["label"]))
        ]

    return [
        InputExample(guid=i, text_a=text)
        for i, text in enumerate(data["text"])
    ]



In [8]:
# Wrap both splits as InputExample lists, keyed by split name.
dataset = {
    "train": load_local_dataset(train_data, split="train"),
    "test": load_local_dataset(test_data, split="test"),
}

In [77]:
from openprompt.plms import load_plm
import os

# Location of the pretrained Chinese BERT checkpoint. The default is the
# original author's local directory; set the PLM_PATH environment variable to
# point at the checkpoint on another machine without editing the notebook.
PLM_PATH = os.environ.get("PLM_PATH", "G:\\deep_learning\\models\\chinese_wwm_ext_pytorch\\")

# load_plm returns the model, its tokenizer, its config and the tokenizer
# wrapper class that the prompt data loaders below are given.
plm, tokenizer, model_config, WrapperClass = load_plm("bert", PLM_PATH)

Some weights of the model checkpoint at G:\deep_learning\models\chinese_wwm_ext_pytorch\ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [46]:
from openprompt.prompts import ManualTemplate
# First prompt variant: the input text followed by a short Chinese cue
# ("the category of this sentence is") and a single mask slot.
# Note: the next cell rebinds `promptTemplate`, so when the notebook is run
# top to bottom this variant is replaced before any loader is built.
promptTemplate = ManualTemplate(
    tokenizer=tokenizer,
    text='{"placeholder":"text_a"} 这句话的类别是{"mask"}',
)

In [78]:
from openprompt.prompts import ManualTemplate
# Second prompt variant: the input text followed by a longer Chinese
# instruction ("please output the category corresponding to the preceding
# sentence:") and a single mask slot. This is the template the loaders and
# the model below are built with.
promptTemplate = ManualTemplate(
    tokenizer=tokenizer,
    text='{"placeholder":"text_a"} 请输出前面这句话对应的类别是{"mask"}',
)

In [79]:
from openprompt.prompts import ManualVerbalizer
# Map mask-position predictions onto classes through the label words.
# The class count is derived from `label_words` so the two cannot drift
# apart if the training labels change.
myverbalizer = ManualVerbalizer(tokenizer, num_classes=len(label_words),
                        label_words=label_words)

In [80]:
from openprompt import PromptDataLoader
# Training loader: wraps every example with the prompt template, tokenizes it
# to at most 64 tokens, and serves shuffled batches of 32.
train_data_loader = PromptDataLoader(
    dataset=dataset["train"],
    template=promptTemplate,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    batch_size=32,
    max_seq_length=64,
    shuffle=True,
)

tokenizing: 12100it [00:07, 1552.67it/s]


In [81]:
from openprompt import PromptDataLoader
# Test loader: same template, sequence length and batch size as training.
# `shuffle` is not passed here, unlike in the training loader.
test_data_loader = PromptDataLoader(
    dataset=dataset["test"],
    template=promptTemplate,
    tokenizer=tokenizer,
    tokenizer_wrapper_class=WrapperClass,
    batch_size=32,
    max_seq_length=64,
)

tokenizing: 3000it [00:01, 1555.43it/s]


In [82]:
from openprompt import PromptForClassification
# Combine the pretrained LM, the prompt template and the verbalizer into one
# classifier. freeze_plm=False is passed explicitly, i.e. the LM is not frozen.
prompt_model = PromptForClassification(
    plm=plm,
    template=promptTemplate,
    verbalizer=myverbalizer,
    freeze_plm=False,
)

In [83]:
import torch

# Use the GPU when one is available; otherwise stay on the CPU instead of
# failing inside `.cuda()`. Later cells consult `use_cuda` before moving
# batches to the device.
use_cuda = torch.cuda.is_available()
if use_cuda:
    prompt_model = prompt_model.cuda()

In [84]:
# Now the training is standard
import torch
from transformers import  AdamW, get_linear_schedule_with_warmup
# Cross-entropy between the model's per-class logits and the integer labels.
loss_func = torch.nn.CrossEntropyLoss()

# Parameters whose name contains one of these substrings (biases and
# LayerNorm weights) are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']

# Split the parameters into "decayed" and "not decayed" in a single pass,
# preserving the order in which named_parameters() yields them.
decay_params, no_decay_params = [], []
for param_name, param in prompt_model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

In [85]:
# AdamW over the two parameter groups (each group carries its own
# weight_decay) with a fixed learning rate of 1e-4; no scheduler is attached.
# NOTE(review): recent `transformers` releases deprecate their own AdamW in
# favour of `torch.optim.AdamW` -- confirm against the installed version.
optimizer = AdamW(params=optimizer_grouped_parameters, lr=1e-4)



In [86]:
# Fine-tune the prompt model for 10 epochs.
for epoch in range(10):
    # Hugging Face `from_pretrained` hands back models in eval mode (dropout
    # disabled), and nothing earlier in the notebook switches it back, so
    # training mode is enabled explicitly at the start of every epoch.
    prompt_model.train()
    tot_loss = 0
    for step, inputs in enumerate(train_data_loader):
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        labels = inputs['label']
        loss = loss_func(logits, labels)
        loss.backward()
        tot_loss += loss.item()
        optimizer.step()
        # Clear gradients after the update so the next batch starts from zero.
        optimizer.zero_grad()
        # Report the running mean loss of the current epoch at steps 1, 101, 201, ...
        if step %100 ==1:
            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)

Epoch 0, average loss: 4.581607818603516
Epoch 0, average loss: 0.5449433169492027
Epoch 0, average loss: 0.4169382690959195
Epoch 0, average loss: 0.3813562760351608
Epoch 1, average loss: 0.20896898582577705
Epoch 1, average loss: 0.17732031825089864
Epoch 1, average loss: 0.1835580710506085
Epoch 1, average loss: 0.19000956037124953
Epoch 2, average loss: 0.016772496979683638
Epoch 2, average loss: 0.1047553962425274
Epoch 2, average loss: 0.12468694973654013
Epoch 2, average loss: 0.12979972778606716
Epoch 3, average loss: 0.05020926636643708
Epoch 3, average loss: 0.11516877175673988
Epoch 3, average loss: 0.11737516069886192
Epoch 3, average loss: 0.11143055011669409
Epoch 4, average loss: 0.10534731857478619
Epoch 4, average loss: 0.06695033097126082
Epoch 4, average loss: 0.07712411430075368
Epoch 4, average loss: 0.08809662606996571
Epoch 5, average loss: 0.11918643489480019
Epoch 5, average loss: 0.09508560088864874
Epoch 5, average loss: 0.0973258451616281
Epoch 5, average l

In [87]:
# Predict a class id for every test example.
# eval() turns dropout off so predictions are deterministic, and no_grad()
# stops autograd from recording a graph that inference never uses.
prompt_model.eval()
allpreds = []
with torch.no_grad():
    for inputs in test_data_loader:
        if use_cuda:
            inputs = inputs.cuda()
        logits = prompt_model(inputs)
        # Highest-scoring class per example, moved to the CPU as plain ints.
        allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

In [88]:
# Decode the predicted class ids back into their label strings.
pred_by_promptbert = [index_2_label[class_id] for class_id in allpreds]

In [89]:
# Attach the decoded predictions to the test frame as a new column.
# This mutates `test_data` in place; the submission-writing cells below read
# this column back row by row.
test_data["pred_by_promptbert"] = pred_by_promptbert

In [67]:
# Score on https://competition.coggle.club/: 0.797000
# Setup: open-source BERT, max_len=64, batch_size=32, epoch=10,
# prompt: 这句话的类别是{"mask"}
# NOTE: this cell was executed as In [67], i.e. while the first prompt
# template was the active one. Run top to bottom, it writes whatever
# predictions are currently stored in test_data["pred_by_promptbert"].
import os

# Create the output folder if needed and build the path portably instead of
# hard-coding a Windows path separator.
os.makedirs("results", exist_ok=True)
with open(os.path.join("results", "bert_prompt_64_32_10.txt"), "w", encoding="utf-8") as f:
    f.write("ID,Target\n")
    # IDs are 1-based row positions; iterate the column directly rather than
    # building a row Series with .iloc for every line.
    for row_id, pred in enumerate(test_data["pred_by_promptbert"], start=1):
        f.write("{},{}\n".format(row_id, pred))

In [90]:
# Score on https://competition.coggle.club/: 0.796333
# Setup: open-source BERT, max_len=64, batch_size=32, epoch=10,
# prompt: 请输出前面这句话对应的类别是{"mask"}
import os

# Create the output folder if needed and build the path portably instead of
# hard-coding a Windows path separator.
os.makedirs("results", exist_ok=True)
with open(os.path.join("results", "bert_prompt_64_32_10_2.txt"), "w", encoding="utf-8") as f:
    f.write("ID,Target\n")
    # IDs are 1-based row positions; iterate the column directly rather than
    # building a row Series with .iloc for every line.
    for row_id, pred in enumerate(test_data["pred_by_promptbert"], start=1):
        f.write("{},{}\n".format(row_id, pred))

In [91]:
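#Q: How does prompt-based classification compare with plain BERT classification in accuracy? A: Judging by the results, it does not perform as well as BERT classification.
#Q: Does a custom prompt affect model accuracy? (Try two different prompts.) A: It does, but only slightly; this is surely related to how I designed my prompts.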