更新ocnli csl模版，并更新得分

CLUEbenchmark · Aug 13, 2021 · cdc4520 · cdc4520
1 parent 1c0c647
commit cdc4520
Show file tree

Hide file tree

Showing 7 changed files with 423 additions and 24 deletions.
diff --git a/README.md b/README.md
@@ -71,7 +71,7 @@ FewCLUE: A Chinese Few-shot Learning Evaluation Benchmark
 | <a href="https://github.com/ymcui/Chinese-BERT-wwm">FineTuningB</a>        | 39.35 |61.9N   | 54.1N   | 33.6N  | 25.6N |40.5N | 50.3N |22.6N | 50.5N| 15.0N|
 | <a href="https://github.com/CLUEbenchmark/FewCLUE/tree/main/baselines/models_keras/pet">PET</a>      | 57.44 | 86.66(1.02, 88.2) | 56.04(4.98, 62.19)  | 44.02(0.42, 49.25) | 51.69(1.04, 58.80) |54.47(1.21, 55.12)  | 57.52(2.70, 64.65)| 46.01(1.07, 51.34) | 59.35(1.27, 66.28) | 61.21(1.10, 62.39) |
 | <a href="https://github.com/CLUEbenchmark/FewCLUE/tree/main/baselines/models_keras/ptuning">PtuningB</a>      | 51.81| 88.5N | 65.4  | 35.0N | 44.4N |  48.2N  | 51.0N | 32.0N| 50.0N | 57.6N |
-| <a href="https://github.com/CLUEbenchmark/FewCLUE/tree/main/baselines/models_keras/ptuning_origin">ori-PtuningB</a>      | 57.77| 86.88(1.1, 89.34)	| 60.92(2.9,65.01)	| 35.81(1.8,44.44)	| 56.02(1.1,63.50)	| 54.23(1.0,55.91)	| 57.52(2.4,67.52)	| 57.63(0.9,62.26)	| 50.87(1.3,51.86)	| 60.04(1.2,60.14)	| 
+| <a href="https://github.com/CLUEbenchmark/FewCLUE/tree/main/baselines/models_keras/ptuning_origin">ori-PtuningB</a>      | 59.91| 88.26(0.7, 89.83)	| 60.92(2.9,65.01)	| 41.90(1.9,49.60)	| 56.02(1.1,63.50)	| 54.23(1.0,55.91)	| 58.11(2.2,67.52)	| 57.63(0.9,62.26)	| 62.91(2.3,70.82)	| 59.27(1.4,61.19)	| 
 | <a href="https://arxiv.org/pdf/2009.07118.pdf">PtuningGPT</a>      | 46.44| 75.65N  | 54.9N   | 35.75N  | 33.69N  |  45.3N   | 49.0N | 24.0N | 53.5N  | 13.7N  |
 | <a href="https://github.com/CLUEbenchmark/FewCLUE/tree/main/baselines/models_keras/gpt">Zero-shotG</a>      | 43.36N |  57.54N |  50N  | 34.4N  |  26.23N |  36.96N | 50.31N | 19.04N | 50.14N  | 65.63N  |
 | <a href="https://arxiv.org/abs/2005.14165">Zero-shotR</a>      | 44.61N |  85.2N |   50.6N | 40.3N | 12.6N  |   25.3N  | 50.0N | 27.7N |  52.2N |  57.6N |

diff --git a/baselines/models_keras/ptuning_origin/README.md b/baselines/models_keras/ptuning_origin/README.md
@@ -29,11 +29,11 @@ python ptuning_iflytek.py 0 # 运行iflytek任务，并使用第0个数据集
 ## 结果
 | 数据集   | score     | eprstmt  | bustm  | ocnli   | csldcp   | tnews | wsc | iflytek| csl | chid  |
 | :----:| :----:  | :----: |:----: |:----: |:----: |:----: |:----: |:----: |:----: |:----: |
-|0|0.578132222| 0.85902	| 0.59086	| 0.38135	| 0.57063	| 0.55277	| 0.55328	| 0.57791	| 0.51797	| 0.5994	| 
-|1|0.584647778| 0.88361	| 0.6456	| 0.34405	| 0.56166	| 0.54158	| 0.61373	| 0.57599	| 0.49471	| 0.6009	| 
-|2|0.576458889| 0.87705	| 0.61456	| 0.34246	| 0.54877	| 0.53038	| 0.58402	| 0.57216	| 0.50035	| 0.61838	| 
-|3|0.57269| 0.85738	| 0.62415	| 0.37302	| 0.54933	| 0.53412	| 0.5625	| 0.56577	| 0.50352	| 0.58442	| 
-|4|0.576698889| 0.86721	| 0.57111	| 0.35	| 0.57063	| 0.55277	| 0.5625	| 0.59004	| 0.52713	| 0.5989	| 
-|few_all|0.622237778| 0.89344	| 0.65011	| 0.44444	| 0.63509	| 0.55917	| 0.6752	| 0.62261	| 0.51868	| 0.6014	| 
-|avg|0.577725556| 0.868854	| 0.609256	| 0.358176	| 0.560204	| 0.542324	| 0.575206	| 0.576374	| 0.508736	| 0.6004	| 
-|std|-| 0.011357563	| 0.029001638	| 0.017823875	| 0.010822466	| 0.010353214	| 0.024317789	| 0.008936847	| 0.013396514	| 0.012063797	
+|0|0.611124444| 0.88525	| 0.59086	| 0.44405	| 0.57063	| 0.55277	| 0.60656	| 0.57791	| 0.67019	| 0.6019	| 
+|1|0.604553333| 0.88197	| 0.6456	| 0.42659	| 0.56166	| 0.54158	| 0.60246	| 0.57599	| 0.62121	| 0.58392	| 
+|2|0.593363333| 0.87049	| 0.61456	| 0.40556	| 0.54877	| 0.53038	| 0.5666	| 0.57216	| 0.62086	| 0.61089	| 
+|3|0.591682222| 0.88689	| 0.62415	| 0.39603	| 0.54933	| 0.53412	| 0.57172	| 0.56577	| 0.62121	| 0.57592	| 
+|4|0.595266667| 0.88852	| 0.57111	| 0.42262	| 0.57063	| 0.55277	| 0.5584	| 0.59004	| 0.6124	| 0.59091	| 
+|few_all|0.650745556| 0.89836	| 0.65011	| 0.49603	| 0.63509	| 0.55917	| 0.6752	| 0.62261	| 0.70825	| 0.61189	| 
+|avg|0.599198| 0.882624	| 0.609256	| 0.41897	| 0.560204	| 0.542324	| 0.581148	| 0.576374	| 0.629174	| 0.592708	| 
+|std|-| 0.007202762	| 0.029001638	| 0.018751433	| 0.010822466	| 0.010353214	| 0.02189734	| 0.008936847	| 0.023236031	| 0.013940738	
diff --git a/baselines/models_keras/ptuning_origin/pet_chid.py b/baselines/models_keras/ptuning_origin/pet_chid.py
@@ -116,6 +116,7 @@ def __iter__(self, random=False):
                         target_ids[ind] = label_ids[i]
                     else:
                         source_ids[ind] = i - 3
+                        target_ids[ind] = i - 3
                 # for i, label_id_ in zip(mask_idxs, label_ids):
                 #     source_ids[i] = tokenizer._token_mask_id # i: 7(mask1的index) ;j: 1093(农); i:8 (mask2的index) ;j: 689(业)
                 #     target_ids[i] = label_id_

diff --git a/baselines/models_keras/ptuning_origin/pet_csl.py b/baselines/models_keras/ptuning_origin/pet_csl.py
@@ -0,0 +1,192 @@
+#! -*- coding:utf-8 -*-
+
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+import numpy as np
+from bert4keras.backend import keras, K
+from bert4keras.snippets import sequence_padding, DataGenerator
+from bert4keras.snippets import open
+import json
+import sys
+from modeling import tokenizer
+
+
+# Hyper-parameters.
+maxlen = 256
+batch_size = 16
+# Number of "#" characters prepended to the prompt; the data generator
+# overwrites those positions with the token ids 1..unused_length.
+unused_length = 2
+
+
+# Prompt template. "黴鹹" is a rare two-character placeholder (token ids
+# [7957, 7919]) marking where the label word ("不能" or "可以") is predicted,
+# which makes the mask positions easy to locate after tokenization.
+input_str_format = "#" * unused_length + "黴鹹用{}概括{}"
+labels = ["不能", "可以"]
+label2words = {str(index): word for index, word in enumerate(labels)}
+
+num_classes = len(labels)
+acc_list = []
+
+
+def load_data(filename):
+    """Read a CSL json-lines file and build one prompt sample per record.
+
+    For every line, the "keyword" list is joined with "，" and substituted,
+    together with "abst", into `input_str_format`. The prompt is tokenized
+    and, when it is longer than `maxlen` tokens, shortened by dropping the
+    tokens right before the final one, so the leading prompt and the last
+    token are kept.
+
+    Args:
+        filename: path of the json-lines file to read.
+
+    Returns:
+        A list of `((content, content_ids, segment_ids), label_word,
+        mask_idxs)` tuples. `label_word` is looked up in `label2words`
+        (records without a "label" field are treated as label "0") and
+        `mask_idxs` holds the two positions of the placeholder tokens.
+
+    Raises:
+        ValueError: if the placeholder token pair is not present in the
+            tokenized prompt.
+    """
+    # Token ids of the two placeholder characters "黴鹹" used by the template.
+    first_id, second_id = 7957, 7919
+    samples = []
+    with open(filename, encoding='utf-8') as f:
+        for line in f:
+            record = json.loads(line.strip())
+            keyword = "，".join(record["keyword"])
+            content = input_str_format.format(keyword, record['abst'])
+            content_ids, segment_ids = tokenizer.encode(content)
+            if len(content_ids) > maxlen:
+                # Cut the tail of the abstract but keep the final token.
+                del content_ids[maxlen - 1:-1]
+                del segment_ids[maxlen - 1:-1]
+
+            # Only the first occurrence of the placeholder pair is used, so a
+            # coincidental repeat inside the abstract cannot shift the second
+            # mask position.
+            start = next(
+                (
+                    idx for idx in range(len(content_ids) - 1)
+                    if content_ids[idx] == first_id
+                    and content_ids[idx + 1] == second_id
+                ),
+                None,
+            )
+            if start is None:
+                raise ValueError(
+                    'placeholder tokens not found in sample: %r' % content
+                )
+            mask_idxs = [start, start + 1]
+
+            label = record.get("label", "0")
+            samples.append(
+                ((content, content_ids, segment_ids), label2words[label], mask_idxs)
+            )
+    return samples
+
+
+# Dataset directory and the split index given as first command-line argument.
+path = '../../../datasets/csl'
+data_num = sys.argv[1]
+
+# Resolve the three split files first, then load them in the same order.
+train_file = '{}/train_{}.json'.format(path, data_num)
+valid_file = '{}/dev_{}.json'.format(path, data_num)
+test_file = '{}/test_public.json'.format(path)
+
+train_data = load_data(train_file)
+valid_data = load_data(valid_file)
+test_data = load_data(test_file)
+
+
+def random_masking(token_ids):
+    """Apply random masking to a token id sequence.
+
+    One uniform random number is drawn per token. Tokens whose draw is
+    below 0.15 become prediction targets: below 0.15 * 0.8 the input is
+    replaced by the mask token id, below 0.15 * 0.9 it is left unchanged,
+    otherwise it is replaced by a random id in [1, vocab_size - 1]. All
+    other tokens are kept as input and get target 0.
+
+    Returns the `(source, target)` pair of lists, aligned with `token_ids`.
+    """
+    draws = np.random.random(len(token_ids))
+    source, target = [], []
+    for draw, token in zip(draws, token_ids):
+        if draw >= 0.15:
+            # Not selected: keep the token, nothing to predict here.
+            source.append(token)
+            target.append(0)
+            continue
+
+        # Selected: the original token is always the target.
+        target.append(token)
+        if draw < 0.15 * 0.8:
+            source.append(tokenizer._token_mask_id)
+        elif draw < 0.15 * 0.9:
+            source.append(token)
+        else:
+            source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)
+    return source, target
+
+
+class data_generator(DataGenerator):
+    """Batch generator producing masked-language-model inputs and targets.
+    """
+    def __iter__(self, random=False):
+        """Yield `([token_ids, segment_ids, output_ids], None)` batches.
+
+        With `random=True` the tokens are additionally passed through
+        `random_masking`; otherwise source and target start as copies of
+        the sample's token ids.
+        """
+        token_batch, segment_batch, output_batch = [], [], []
+        for is_end, (encoded, label, mask_idx) in self.sample(random):
+            _, token_ids, segment_ids = encoded
+            if random:
+                source_ids, target_ids = random_masking(token_ids)
+            else:
+                source_ids, target_ids = list(token_ids), list(token_ids)
+
+            if len(label) == 2:  # the label word consists of two characters
+                # Ids of the label word without the first and last token
+                # added by the tokenizer.
+                label_ids = tokenizer.encode(label)[0][1:-1]
+                # Mask the placeholder positions in the input and put the
+                # label ids at the same positions in the target.
+                for position, label_id in zip(mask_idx, label_ids):
+                    source_ids[position] = tokenizer._token_mask_id
+                    target_ids[position] = label_id
+                # Positions 1..unused_length (the leading "#" characters)
+                # get the token id equal to their index in both sequences.
+                for position in range(1, unused_length + 1):
+                    source_ids[position] = position
+                    target_ids[position] = position
+
+            token_batch.append(source_ids)
+            segment_batch.append(segment_ids)
+            output_batch.append(target_ids)
+
+            # Pad and emit once the batch is full or the data is exhausted.
+            if len(token_batch) == self.batch_size or is_end:
+                token_batch = sequence_padding(token_batch)
+                segment_batch = sequence_padding(segment_batch)
+                output_batch = sequence_padding(output_batch)
+                yield [token_batch, segment_batch, output_batch], None
+                token_batch, segment_batch, output_batch = [], [], []
+
+
+from modeling import get_model
+# Build the prediction model and the trainable model; the pattern length
+# equals the number of leading placeholder tokens in the prompt.
+model, train_model = get_model(
+    pattern_len=unused_length,
+    trainable=True,
+    lr=3e-5,
+)
+
+
+# Wrap every split in a batch generator.
+train_generator, valid_generator, test_generator = (
+    data_generator(split, batch_size)
+    for split in (train_data, valid_data, test_data)
+)
+
+
+class Evaluator(keras.callbacks.Callback):
+    """Keras callback that reports validation and test accuracy each epoch.
+
+    The best validation accuracy seen so far is kept in `best_val_acc`.
+    """
+
+    def __init__(self):
+        # Let the base Callback initialise its own attributes before adding
+        # ours; skipping this leaves the instance only partially set up.
+        super(Evaluator, self).__init__()
+        self.best_val_acc = 0.
+
+    def on_epoch_end(self, epoch, logs=None):
+        """Evaluate on the validation and test generators and print the result.
+
+        `epoch` and `logs` are accepted for the Keras callback interface but
+        are not used.
+        """
+        val_acc = evaluate(valid_generator)
+        if val_acc > self.best_val_acc:  # remember the best accuracy so far
+            self.best_val_acc = val_acc
+        test_acc = evaluate(test_generator)
+        print(
+            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
+            (val_acc, self.best_val_acc, test_acc)
+        )
+
+
+def evaluate(data):
+    """Compute the classification accuracy of `model` over a data generator.
+
+    For every batch, the predictions at the two [MASK] positions are reduced
+    to one score per candidate label: the product of the predicted values of
+    the label's first and second token. The label with the highest score is
+    the prediction; it is compared with the gold label decoded from the
+    target ids at the same positions.
+
+    :param data: iterable yielding `([token_ids, segment_ids, output_ids], _)`
+        batches in which every row contains exactly two [MASK] tokens.
+    :return: fraction of correctly classified samples (`right / total`).
+    """
+    # Vocabulary ids of every label word, one row per label, without the
+    # first and last token added by the tokenizer.
+    label_ids = np.array([tokenizer.encode(l)[0][1:-1] for l in labels])
+    total, right = 0., 0.
+    for x_true, _ in data:
+        # The first two arrays are the model inputs, the third the targets.
+        x_true, y_true = x_true[:2], x_true[2]
+        # Column indices of the [MASK] tokens, two per sample.
+        mask_idxs = np.where(x_true[0] == tokenizer._token_mask_id)[1].reshape(y_true.shape[0], 2)
+
+        y_pred = model.predict(x_true)
+        # Keep only the predictions / targets at each sample's mask positions.
+        y_pred = np.array([y_pred[i][mask_idx] for i, mask_idx in enumerate(mask_idxs)])
+        y_true = np.array([y_true[i][mask_idx] for i, mask_idx in enumerate(mask_idxs)])
+
+        # Joint score per label, shape (batch_size, len(labels)).
+        y_pred = y_pred[:, 0, label_ids[:, 0]] * y_pred[:, 1, label_ids[:, 1]]
+        y_pred = y_pred.argmax(axis=1)  # index of the best-scoring label
+        # Map the gold token ids back to their index in `labels`.
+        y_true = np.array([labels.index(tokenizer.decode(y)) for y in y_true])
+
+        total += len(y_true)
+        right += np.count_nonzero(y_pred == y_true)
+    return right / total
+
+
+if __name__ == '__main__':
+    # Train for 10 epochs, each running five passes over the training
+    # batches, and report accuracies after every epoch.
+    evaluator = Evaluator()
+    steps = len(train_generator) * 5
+
+    train_model.fit_generator(
+        train_generator.forfit(),
+        steps_per_epoch=steps,
+        epochs=10,
+        callbacks=[evaluator]
+    )