### 分析句子长度对模型性能的影响

In [None]:
# %%
import json
import math
import numpy as np

def RMSE(x, y):
    """Return the squared difference between ``x`` and ``y``.

    Despite the name, no mean or square root is applied here; this is the
    per-sample squared error only.
    """
    diff = x - y
    return diff ** 2


# The dev set contains Chinese text, so decode it as UTF-8 explicitly instead
# of relying on the platform's locale-dependent default encoding.
with open('./dataset/CHIP-STS/dev.json', encoding='utf-8') as f:
    ori_list = json.load(f)

# One tab-separated record per line.
with open('./data_record/SAS_CNSTSX_r2bert/predict_gold.csv', encoding='utf-8') as f:
    pred_gold_list = f.read().split('\n')
# A trailing newline leaves one empty final entry; drop it.
if pred_gold_list[-1] == '':
    pred_gold_list = pred_gold_list[:-1]

# Length buckets keyed by their inclusive upper bound: 25, 50, ..., 200, plus a
# catch-all bound of 999. Every bucket starts with an empty list of squared
# errors and a zero sample count. (A finer `<=10` bucket is currently disabled.)
slot = {
    upper_bound: {'RMSE': [], 'count': 0}
    for upper_bound in (*range(25, 201, 25), 999)
}

# Put every dev sample into the first bucket whose upper bound is not exceeded
# by the character length of `text2`, and record its squared error there.
# Samples longer than the largest bound (999) match no bucket and are skipped.
for idx, item in enumerate(ori_list):
    length = len(item['text2'])
    # Columns are named in the same order as the other cells that read a
    # predict_gold.csv file (prediction first). The squared error is symmetric,
    # so the reported numbers do not depend on this order.
    pred, gold = pred_gold_list[idx].split('\t')
    for key in slot:
        if length > key:
            continue
        slot[key]['count'] += 1
        slot[key]['RMSE'].append(RMSE(float(pred), float(gold)))
        break

for key in slot:
    errors = slot[key]['RMSE']
    # An empty bucket has no defined RMSE; report nan directly instead of
    # letting np.mean([]) emit a RuntimeWarning before producing the same nan.
    rmse = math.sqrt(np.mean(errors)) if errors else float('nan')
    print('<={}: count: {}, rmse: {:.3f}'.format(key, slot[key]['count'], rmse))

# %%


### 分析不同评分对模型性能的影响

In [None]:
# %%
import math
import numpy as np

def RMSE(x, y):
    """Return the squared error ``(x - y) ** 2`` for a single pair.

    The name is historical: averaging and the square root are not done here.
    """
    delta = x - y
    squared = delta ** 2
    return squared

def print_model_rmse_in_gold_dis(model_list=None, dataset_name='SAS'):
    """Print each model's RMSE grouped by gold-score bucket.

    For every model, reads
    ``./data_record/SAS_<dataset_name>_<model>/predict_gold.csv`` (one
    ``pred<TAB>gold`` pair per line), assigns each pair to the first bucket
    whose upper bound (0.2, 0.4, 0.6, 0.8, 1.2) is not exceeded by the gold
    score, and prints the sample count and RMSE of every bucket.

    Args:
        model_list: iterable of model names; ``None`` (the default) means no
            models, so nothing is read or printed.
        dataset_name: dataset part of the record directory name.

    Returns:
        None. Results are written to stdout only.
    """
    if model_list is None:
        model_list = []

    for model in model_list:
        with open('./data_record/SAS_{}_{}/predict_gold.csv'.format(dataset_name, model)) as f:
            pred_gold_list = f.read().split('\n')
        # A trailing newline leaves one empty final entry; drop it.
        if pred_gold_list[-1] == '':
            pred_gold_list = pred_gold_list[:-1]

        # Buckets keyed by their inclusive upper bound on the gold score.
        slot = {
            upper_bound: {'RMSE': [], 'count': 0}
            for upper_bound in (0.2, 0.4, 0.6, 0.8, 1.2)
        }

        for line in pred_gold_list:
            pred, gold = line.split('\t')
            # Parse once per record rather than once per bucket comparison.
            pred, gold = float(pred), float(gold)
            for key in slot:
                if gold > key:
                    continue
                slot[key]['count'] += 1
                slot[key]['RMSE'].append(RMSE(pred, gold))
                break
            # Gold scores above the largest bound (1.2) match no bucket and
            # are left out of the report.

        print('{}:'.format(model))
        for key in slot:
            errors = slot[key]['RMSE']
            # An empty bucket has no defined RMSE; report nan directly instead
            # of letting np.mean([]) emit a RuntimeWarning for the same nan.
            rmse = math.sqrt(np.mean(errors)) if errors else float('nan')
            print('<={}: count: {}, rmse: {:.3f}'.format(key, slot[key]['count'], rmse))


In [None]:
# Per-gold-score-bucket RMSE report for six models on the ASAG prediction records.
print_model_rmse_in_gold_dis(['textcnn', 'bert', 'bimpm', 'esim', 'sbert', 'r2bert'], 'ASAG')

In [None]:
# Same per-bucket RMSE report, for the 'bert' model on the SFR prediction records.
print_model_rmse_in_gold_dis(['bert'], 'SFR')

### 计算模型分类性能

In [None]:
import numpy as np
from sklearn.metrics import classification_report


def F1(X, Y):
    """Compute macro-averaged precision, recall and F1.

    Args:
        X: sequence of predicted labels.
        Y: sequence of gold labels, aligned with ``X`` by position.

    Returns:
        A ``(precision, recall, f1)`` tuple, each the unweighted mean over
        every label that occurs in ``X`` or ``Y``. All three are ``nan`` when
        ``X`` is empty.

    Raises:
        IndexError: if ``Y`` is shorter than ``X``.
    """
    result_dict = {}
    for idx, x in enumerate(X):
        y = Y[idx]
        for label in (x, y):
            result_dict.setdefault(label, {'TP': 0, 'FP': 0, 'FN': 0})
        if x == y:
            result_dict[x]['TP'] += 1
        else:
            # A wrong prediction is a false positive for the predicted label
            # and a false negative for the gold label.
            result_dict[x]['FP'] += 1
            result_dict[y]['FN'] += 1

    if not result_dict:
        # No samples: the averages are undefined.
        nan = float('nan')
        return nan, nan, nan

    P = []
    R = []
    F1Score = []
    for counts in result_dict.values():
        TP, FP, FN = counts['TP'], counts['FP'], counts['FN']
        # A label that is never predicted correctly must count as 0 in the
        # macro average. Skipping it would leave a fully failed class out of
        # the mean and inflate all three scores. The zero-denominator guards
        # also make those cases safe to compute.
        precision = TP / (TP + FP) if TP + FP else 0.0
        recall = TP / (TP + FN) if TP + FN else 0.0
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0
        P.append(precision)
        R.append(recall)
        F1Score.append(f1)

    return np.mean(P), np.mean(R), np.mean(F1Score)

def map(score):
    """Collapse a numeric score into one of three classes.

    Scores up to 3 give class 0, scores above 3 and up to 6 give class 1,
    and anything else gives class 2.

    NOTE(review): this name shadows the builtin ``map`` for the rest of the
    module; it is kept because callers in this file use it by this name.
    """
    return 0 if score <= 3 else (1 if score <= 6 else 2)


def print_model_f1(model_list=[], dataset_name='SAS'):
    """Print macro precision, recall and F1 (as percentages) for each model.

    Each model's ``predict_gold.csv`` (``predict_gold_sota.csv`` for the
    ``msim`` model) holds one ``pred<TAB>gold`` pair per line. Both values are
    multiplied by 10, rounded, and collapsed to a class with ``map`` before
    being scored with ``F1``.
    """
    for model in model_list:
        suffix = '_sota' if model == 'msim' else ''
        record_path = './data_record/SAS_{}_{}/predict_gold{}.csv'.format(dataset_name, model, suffix)
        with open(record_path) as f:
            rows = f.read().split('\n')
        # A trailing newline leaves one empty final entry; drop it.
        if rows[-1] == '':
            rows.pop()

        pred_list, gold_list = [], []
        for row in rows:
            pred_text, gold_text = row.split('\t')
            pred_class = map(round(float(pred_text) * 10))
            gold_class = map(round(float(gold_text) * 10))
            pred_list.append(pred_class)
            gold_list.append(gold_class)

        precision, recall, f1 = F1(pred_list, gold_list)

        report = '{}\n P: {:.2f}, R: {:.2f}, F1: {:.2f}\n'.format(
            model, precision * 100, recall * 100, f1 * 100)
        print(report)


In [None]:
# Macro P/R/F1 on the CNSTSX prediction records. 'msim' is run on its own;
# print_model_f1 reads it from predict_gold_sota.csv instead of predict_gold.csv.
print_model_f1(['textcnn', 'esim', 'bimpm', 'bert', 'sbert', 'r2bert'], 'CNSTSX')
print_model_f1(['msim'], 'CNSTSX')

### 计算数据集句向量

In [None]:
import os
import json
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertConfig, BertModel

# Load tokenizer, config and weights from the local chinese_wwm_ext files.
tokenizer = BertTokenizer.from_pretrained('./model/chinese_wwm_ext/vocab.txt')
config = BertConfig.from_pretrained('./model/chinese_wwm_ext/bert_config.json')
model = BertModel.from_pretrained('./model/chinese_wwm_ext/pytorch_model.bin', config=config)

model.eval()
model.cuda()

# The dataset contains Chinese text, so decode it as UTF-8 explicitly instead
# of relying on the platform's locale-dependent default encoding.
with open('./dataset/CHIP-STS/train.json', encoding='utf-8') as f:
    data = json.load(f)

ref_list = []
std_list = []

# Inference only: no_grad() stops autograd from recording every forward pass,
# which saves GPU memory and time without changing the outputs.
with torch.no_grad():
    # Wrapping `data` directly (not an enumerate object) lets tqdm read its
    # length and show real progress.
    for line in tqdm(data):
        ref, std = line['text1'], line['text2']
        ref_T = tokenizer(ref, return_tensors="pt")
        std_T = tokenizer(std, return_tensors="pt")
        ref_T = {k: v.cuda() for k, v in ref_T.items()}
        std_T = {k: v.cuda() for k, v in std_T.items()}
        # One sentence per forward pass, so index 0 is that sentence's pooled vector.
        output = model(**ref_T)
        ref_list.append(output.pooler_output[0].tolist())
        output = model(**std_T)
        std_list.append(output.pooler_output[0].tolist())

# makedirs also creates missing parent directories and tolerates an existing
# target; os.mkdir fails when './data_record/tSNE' does not exist yet.
os.makedirs('./data_record/tSNE/CNSTS', exist_ok=True)

# One vector per line, components separated by tabs.
for tsv_path, vectors in (
    ('./data_record/tSNE/CNSTS/ref.tsv', ref_list),
    ('./data_record/tSNE/CNSTS/std.tsv', std_list),
):
    with open(tsv_path, 'w+') as f:
        for vector in vectors:
            f.write('\t'.join(str(i) for i in vector) + '\n')

### STS样例预测结果

In [None]:
import json
from CC.predictor import Predictor
from transformers import BertTokenizer

# Tokenizer built from the local chinese_wwm_ext vocabulary file.
tokenizer = BertTokenizer.from_pretrained(
    './model/chinese_wwm_ext/vocab.txt'
)

# Predictor configured for the "bert" model, pointed at the saved
# bert_2000.pth file via resume_path.
predictor = Predictor(
    tokenizer,
    model_name="bert",
    padding_length=150,
    resume_path='./save_model/STS_CNSTS_bert/bert/bert_2000.pth',
    batch_size=16,
)

In [None]:
# The dataset contains Chinese text, so decode it as UTF-8 explicitly instead
# of relying on the platform's locale-dependent default encoding.
with open('./dataset/CHIP-STS/dev.json', encoding='utf-8') as f:
    data = json.load(f)

# Each evaluation item is a [text1, text2] sentence pair.
eval_list = [[line['text1'], line['text2']] for line in data]

# NOTE: 'pred_socres' (sic) is the key read from each predictor result below
# and by the cell that consumes `pred_result`, so the spelling must stay.
pred_result = {
    'pred': [],
    'pred_socres': []
}
for i in predictor(eval_list):
    # extend() appends in place; rebuilding the list with `a = a + b` on every
    # batch copies everything collected so far and is quadratic overall.
    pred_result['pred'].extend(i['pred'])
    pred_result['pred_socres'].extend(i['pred_socres'])


In [None]:
# Collect the dev samples whose predicted label differs from the gold label,
# annotating each with the predicted label and score.
N_list = []

for idx, line in enumerate(data):
    label = int(line['label'])
    pred_label = pred_result['pred'][idx]
    if pred_label != label:
        line['pred_label'] = pred_label
        line['pred_score'] = pred_result['pred_socres'][idx]
        N_list.append(line)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# must be pinned; the locale default may be unable to encode them.
with open('./data_record/STS_CNSTS_bert/CHIP-STS_N.json', 'w+', encoding='utf-8') as f:
    json.dump(N_list, f, ensure_ascii=False, indent=4)

### 计算实体向量

In [None]:
import os
import json
import torch
import pickle
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizer, BertConfig, BertModel

tokenizer = BertTokenizer.from_pretrained('/home/lpc/models/text2vec-base-chinese/')
model = BertModel.from_pretrained('/home/lpc/models/text2vec-base-chinese/')
DATASET = 'CNSTS'
BATCH_SIZE = 64

model.eval()
model.cuda()

# Decode explicitly as UTF-8 instead of relying on the platform's
# locale-dependent default encoding.
with open('./dataset/CHIP-STS/ner_final', encoding='utf-8') as f:
    ori_list = f.read().split('\n')

# A trailing newline leaves one empty final entry; drop it.
if ori_list[-1] == '':
    ori_list = ori_list[:-1]

result_list = []

# Row 0 is an all-zero vector placed ahead of the entity embeddings.
# NOTE(review): 768 is assumed to match the model's pooled output width — confirm.
result_list.append([0 for _ in range(768)])

# Ceiling division in integer arithmetic. The previous expression used `/` and
# produced a float whenever the line count was an exact multiple of BATCH_SIZE
# (including 0), which makes range() raise TypeError.
num_batches = (len(ori_list) + BATCH_SIZE - 1) // BATCH_SIZE

# Inference only: no_grad() stops autograd from recording every forward pass,
# which saves GPU memory and time without changing the outputs.
with torch.no_grad():
    for idx in tqdm(range(num_batches)):
        lines = ori_list[idx*BATCH_SIZE : (idx+1)*BATCH_SIZE]
        # The entity text is the first comma-separated field of each line.
        entity_list = [line.split(',')[0] for line in lines]
        T = tokenizer(entity_list, padding=True, truncation=True, return_tensors="pt")
        T = {k: v.cuda() for k, v in T.items()}
        output = model(**T)
        result_list += output.pooler_output.tolist()

# makedirs also creates a missing './embedding' parent and tolerates an
# existing target directory.
os.makedirs(f'./embedding/{DATASET}', exist_ok=True)

result_list_num = np.array(result_list)

# NOTE: despite the '.numpy' suffix the array is serialized with pickle
# (protocol 2), not with np.save.
with open(f'./embedding/{DATASET}/ori_{DATASET}.numpy', 'wb') as f:
    pickle.dump(result_list_num, f, 2)


### 输出负样例

In [None]:
import json

with open('./data_record/CLS_CNSTSAC_acbert_simcse_entity/predict_gold.csv', encoding='utf-8') as f:
    predict_label = f.read().split('\n')

# A trailing newline leaves one empty final entry; drop it.
if predict_label[-1] == '':
    predict_label = predict_label[:-1]

# Each record becomes its list of tab-separated fields; only the first two
# fields are compared below.
predict_label = [item.split('\t') for item in predict_label]

# The dataset contains Chinese text, so decode it as UTF-8 explicitly instead
# of relying on the platform's locale-dependent default encoding.
with open('./dataset/CHIP-STS/dev.json', encoding='utf-8') as f:
    data = json.load(f)

# Keep the dev samples whose two recorded values disagree; records are matched
# to samples by position.
result = [
    line
    for idx, line in enumerate(data)
    if float(predict_label[idx][0]) != float(predict_label[idx][1])
]

# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# must be pinned; the locale default may be unable to encode them.
with open('./dataset/CHIP-STS/CHIP-STS_N.json', 'w+', encoding='utf-8') as f:
    json.dump(result, f, ensure_ascii=False, indent=4)