#### 验证SuperNER_note4训练集是否与weibo验证集有交集

In [None]:
# Load the SuperNER_note4 training texts and check whether any of them
# overlaps (by prefix) with a sentence of the weibo dev set.
from typing import Set
from tqdm import tqdm
import json

# Only texts strictly longer than this many characters are compared
# (presumably to avoid coincidental prefix matches between short sentences).
MIN_OVERLAP_LEN = 10

origins: Set[str] = set()
with open("data/SuperNER_note4/train.json", "r", encoding="utf-8") as reader:
    for line in reader:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) in the JSON-lines file
            continue
        data = json.loads(line)
        # "text" is joined into one string so it can be compared by prefix
        origins.add("".join(data["text"]))

print(len(origins))

# The length requirement on training texts does not depend on the dev
# sentence, so apply it once instead of once per (dev, train) pair.
long_origins = [origin for origin in origins if len(origin) > MIN_OVERLAP_LEN]

cnt = 0

# Scan the weibo dev set for sentences that share a prefix with a training text
with open("data/weibo/dev.json", "r", encoding="utf-8") as weibo:
    for line in tqdm(weibo):
        line = line.strip()
        if not line:
            continue
        data = json.loads(line)
        text = "".join(data["text"])
        if len(text) <= MIN_OVERLAP_LEN:
            # Short dev sentences can never satisfy the match condition
            continue
        for origin in long_origins:
            if origin.startswith(text) or text.startswith(origin):
                cnt += 1
                print(text, origin)

# Number of overlapping (dev sentence, training text) pairs found
print(cnt)

结论：构造的超集不包含weibo数据验证集

#### 数据集构造步骤，使用main分支
1. 使用main分支LeBert训练模型，数据集采用`data/SuperNER_note4/train.json`,标签采用weibo超集标签`data/SuperNER_note4/labels.txt`
2. 使用该模型预测weibo `data/weibo/train.json`生成数据`data/weibo/train_super.json`

#### 预训练，使用LTS_Prompt_Enhanced分支
1. 采用`data/weibo/train_super.json`进行PTV预训练
2. 使用预训练模型，通过`data/weibo/train.json`进行PTV fine-tune

In [None]:
# Switch to the main branch.
# NOTE: `!` is an IPython shell escape; this line only works inside a notebook.
!git checkout main
# Merge the datasets
from tools.merge_json import merge_datasets

# Combine the SuperNER pre-training data and the note4 training data;
# the result is written to data/SuperNER_note4/train.json
datasets = ["data/SuperNER/pre_train.json","data/lebert/dataset/NER/note4/train.json"]

merge_datasets(datasets,"data/SuperNER_note4/train.json")

# Merge the label files
from tools.merge_json import merge_labels

# Label files of weibo, SuperNER and note4; the merged list is written to
# data/SuperNER_note4/labels.txt (presumably the union of all tags -- confirm
# against tools.merge_json)
labels = ["data/weibo/labels.txt","data/SuperNER/tags_list.txt","data/lebert/dataset/NER/note4/labels.txt"]

merge_labels(labels,"data/SuperNER_note4/labels.txt")



In [None]:
# LeBert fine-tuning: train the model that is used for prediction afterwards.
from CC.trainer import NERTrainer

# Training configuration. The training data and tag file are the merged
# SuperNER_note4 files; evaluation and test use the weibo dev/test splits.
args = dict(
    num_epochs=30,
    num_gpus=[0, 1, 2, 3],
    bert_config_file_name='./model/chinese_wwm_ext/config.json',
    pretrained_file_name='./model/chinese_wwm_ext/pytorch_model.bin',
    hidden_dim=300,
    max_seq_length=150,
    max_scan_num=1000000,
    train_file='data/SuperNER_note4/train.json',
    eval_file='./data/weibo/dev.json',
    test_file='./data/weibo/test.json',
    bert_vocab_file='./model/chinese_wwm_ext/vocab.txt',
    tag_file='data/SuperNER_note4/labels.txt',
    output_eval=True,
    loader_name='le_loader',
    word_embedding_file="./data/tencent/word_embedding.txt",
    word_vocab_file="./data/tencent/tencent_vocab.txt",
    default_tag="O",
    batch_size=64,
    eval_batch_size=512,
    do_shuffle=True,
    model_name='LEBert',
    task_name='super_predict_model',
)

# Build the trainer from the configuration above
trainer = NERTrainer(**args)

# Drain the training iterator; `a` keeps the most recently yielded item
for i in trainer(lr2=1e-2):
    a = i

In [None]:
# Predict labels for the weibo training set (data/weibo/train.json) with the
# model trained above and write them to data/weibo/train_super.json.
from CC.predicter import NERPredict
import json

# Reuse the training configuration and point it at the saved checkpoints
args["lstm_crf_model_file"] = "save_model/super_predict_model/lstm_crf/lstm_crf_66930.pth"
args["bert_model_file"] = "save_model/super_predict_model/LEBert/LEBert_66930.pth"
predict = NERPredict(**args)

filename = "data/weibo/train.json"

batch_size = 64
# Every written text is cut to max_seq_length - 2 items (presumably two
# positions are reserved for special tokens -- confirm against the loader).
# The same limit is used for the consistency check of every batch.
max_text_len = args["max_seq_length"] - 2


def _write_predictions(out, batch):
    """Predict one batch of sentences and append one JSON line per sentence to *out*."""
    for s, label in predict(batch):
        text = s[:max_text_len]
        # Each kept position must have exactly one predicted label
        assert len(text) == len(label), (
            f"text/label length mismatch: {len(text)} != {len(label)}"
        )
        out.write(json.dumps({"text": text, "label": label}, ensure_ascii=False) + "\n")


sentences = []

with open("data/weibo/train_super.json", "w", encoding="utf-8") as out:
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines in the JSON-lines input
                continue
            data = json.loads(line)
            sentences.append(data["text"])
            # Predict as soon as a full batch has been collected
            if len(sentences) == batch_size:
                _write_predictions(out, sentences)
                sentences = []
                out.flush()
        # Predict whatever is left over after the last full batch
        if sentences:
            _write_predictions(out, sentences)
    

#### PTV 训练(LTS分支)


In [None]:
# Switch git to the LTS branch before running this cell.

# PTV pre-training configuration: trains on the weibo training set that was
# re-labelled with the superset tags (data/weibo/train_super.json).
args = {
    'num_epochs': 35,
    'num_gpus': [0, 1, 2, 3],
    'bert_config_file_name': './model/chinese_wwm_ext/config.json',
    'pretrained_file_name': './model/chinese_wwm_ext/pytorch_model.bin',
    'max_seq_length': 512,
    'max_scan_num': 1000000,
    'train_file': './data/weibo/train_super.json',
    'eval_file': './data/weibo/dev.json',
    'test_file': './data/weibo/test.json',
    'bert_vocab_file': './model/chinese_wwm_ext/vocab.txt',
    # The merged superset label file is written to data/SuperNER_note4/labels.txt
    # by the merge step; the previous value './data/weibo/data/SuperNER_note4/...'
    # was two paths fused together.
    'tag_file': './data/SuperNER_note4/labels.txt',
    'loader_name': 'ptloader_v2',
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    'batch_size': 8,
    'eval_batch_size': 32,
    'pass_none_rule': True,
    'skip_single_matched_word': True,
    'do_shuffle': True,
    'task_name': 'weibo_multiple_pretrained',
    "use_gpu": True,
    "debug": True,
    # Maps each tag name to a Chinese description
    # (presumably used to build the prompts -- confirm against the loader)
    "tag_rules": {
        "O": "非实体",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "CONT": "国家",
        "PRO": "职位",
        "RACE": "种族",
        "TITLE": "工作名称",
        "EDU": "教育经历",
        "NAME": "名字",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "GPE": "政治实体",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
        "NORP": "政体民族",
        "PERSON": "人名",
    }
}

from CC.pre_trained import NERPreTrainer
pre_trainer = NERPreTrainer(**args)

# Drain the pre-training iterator; `a` keeps the most recently yielded item
for i in pre_trainer():
    a = i


In [None]:
# Fine-tune: start from the PTV pre-trained checkpoint and train on the
# original weibo training set with the weibo label set.

args = {
    'num_epochs': 30,
    'num_gpus': [0, 1, 2],
    'bert_config_file_name': './model/chinese_wwm_ext/config.json',
    # Same checkpoint directory as the two prompt_* entries below; the previous
    # value started with '.save_pretrained/' (missing path separator).
    'pretrained_file_name': 'save_pretrained/weibo_multiple_pretrained/Bert_5915/pytorch_model.bin',
    'prompt_pretrained_file_name': 'save_pretrained/weibo_multiple_pretrained/Bert_5915/pytorch_model.bin',
    'prompt_config_file_name': 'save_pretrained/weibo_multiple_pretrained/Bert_5915/config.json',
    'hidden_dim': 300,
    'max_seq_length': 150,
    'max_scan_num': 1000000,
    'train_file': './data/weibo/train.json',
    'eval_file': './data/weibo/dev.json',
    'test_file': './data/weibo/test.json',
    'bert_vocab_file': './model/chinese_wwm_ext/vocab.txt',
    'tag_file': './data/weibo/labels.txt',
    'output_eval': True,
    'loader_name': 'ft_loader_v4',
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    'batch_size': 8,
    'eval_batch_size': 32,
    'do_shuffle': True,
    'model_name': 'LEBert',
    # Maps each tag name to a Chinese description
    # (presumably used to build the prompts -- confirm against the loader)
    "tag_rules": {
        "O": "非实体",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "CONT": "国家",
        "PRO": "职位",
        "RACE": "种族",
        "TITLE": "工作名称",
        "EDU": "教育经历",
        "NAME": "名字",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "GPE": "政治实体",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
        "NORP": "政体民族",
        "PERSON": "人名",
    },
    'task_name': 'weibo_tag_multiple_3'
}

from CC.enhanced_trainer import EnhancedNERTrainer
trainer = EnhancedNERTrainer(**args)

# Drain the training iterator; the yielded items are not needed here
for _ in trainer(lr2=1e-2):
    pass