#### PCBERT only

Training steps:
1. Train LEBERT on the SuperNER datasets.
2. Use the trained model to predict labels for the target-domain training set.
3. Train P-BERT on the predicted dataset.
4. Load P-BERT and fine-tune C-BERT.

In [None]:
# Predict target domain train dataset.
from CC.predicter import NERPredict
import json

# Keyword arguments for the LEBERT predictor built from this dict.
args = dict(
    # --- training schedule / devices ---
    num_epochs=30,
    num_gpus=[0, 1, 2, 3],

    # --- base BERT checkpoint (chinese_wwm_ext) ---
    bert_config_file_name="./model/chinese_wwm_ext/config.json",
    pretrained_file_name="./model/chinese_wwm_ext/pytorch_model.bin",
    bert_vocab_file="./model/chinese_wwm_ext/vocab.txt",

    hidden_dim=300,

    # --- input limits ---
    # Original note: max length 150, all datasets are cut to it.
    max_seq_length=150,
    # Original note: "embedding size LEBERT" -- presumably the number of
    # word-embedding entries scanned; TODO confirm against the loader.
    max_scan_num=1_000_000,

    # --- data files ---
    train_file="data/SuperNER_note4/train.json",  # train set of the predict model
    eval_file="data/weibo/dev.json",  # eval set of the predict model
    test_file="data/weibo/test.json",  # test set
    tag_file="data/SuperNER_note4/labels.txt",

    output_eval=True,

    loader_name="le_loader",

    # --- word-embedding resources ---
    word_embedding_file="./data/tencent/word_embedding.txt",
    word_vocab_file="./data/tencent/tencent_vocab.txt",

    default_tag="O",

    batch_size=64,
    eval_batch_size=512,

    do_shuffle=True,
    model_name="LEBert",

    # Replace with your own task name.
    task_name="origin_super_note4_predict_model",
)

# Point the predictor at the already-trained checkpoints, then build it.
args.update(
    lstm_crf_model_file="save_model/super_predict_model/lstm_crf/lstm_crf_66930.pth",
    bert_model_file="save_model/super_predict_model/LEBert/LEBert_66930.pth",
)

predict = NERPredict(**args)


def _write_predictions(out, sentences, predictor, max_text_len):
    """Label ``sentences`` with ``predictor`` and append one JSON line each to ``out``."""
    for sentence, label in predictor(sentences):
        text = sentence[:max_text_len]
        # Sanity check kept from the original code: exactly one label per
        # character of the (truncated) text that is written out.
        assert len(text) == len(label), \
            f"{sentence} {label} {len(sentence)} {len(label)}"
        out.write(
            json.dumps({"text": text, "label": label}, ensure_ascii=False) + "\n")


def predict_train(train_file, saved_file, batch_size=64, predictor=None,
                  max_text_len=None):
    """Label every sentence of a JSON-lines file and save the predictions.

    Each non-blank line of ``train_file`` must be a JSON object with a
    ``"text"`` field.  Sentences are sent to the predictor in batches and the
    results are written to ``saved_file`` as JSON lines of the form
    ``{"text": <truncated text>, "label": <labels>}``.

    Args:
        train_file: Path of the input JSON-lines file.
        saved_file: Path of the output JSON-lines file (overwritten).
        batch_size: Number of sentences passed to the predictor per call.
        predictor: Callable taking a list of sentences and yielding
            ``(sentence, label)`` pairs.  Defaults to the module-level
            ``predict`` object.
        max_text_len: Number of characters of each sentence that is kept.
            Defaults to ``args["max_seq_length"] - 2`` (presumably leaving
            room for two special tokens -- TODO confirm).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        AssertionError: If a prediction does not have one label per kept
            character.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if predictor is None:
        predictor = predict
    if max_text_len is None:
        max_text_len = args["max_seq_length"] - 2

    pending = []
    with open(saved_file, "w", encoding="utf-8") as out, \
            open(train_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            pending.append(json.loads(line)["text"])
            if len(pending) == batch_size:
                _write_predictions(out, pending, predictor, max_text_len)
                pending = []
                out.flush()
        # Remaining sentences that did not fill a whole batch; they go through
        # exactly the same truncation and check as the full batches.
        if pending:
            _write_predictions(out, pending, predictor, max_text_len)

# Root directory of the few-shot datasets.
path = "data/few_shot"
# Target datasets whose training splits get labelled by the predictor.
trainsets = ["weibo", "note4", "resume", "msra"]
# Sizes of the few-shot training splits (used in the file names below).
scales = [250, 500, 1000, 1350]

for dataname in trainsets:
    dataset_dir = f"{path}/{dataname}"
    for scale in scales:
        # Input: original few-shot training split.
        train_file = f"{dataset_dir}/train_{scale}.json"
        # Output: the same split with predicted labels.
        saved_file = f"{dataset_dir}/train_{scale}_super_note4_pred.json"
        predict_train(train_file, saved_file)

In [None]:
from tools.merge_json import merge_labels

# Label files of the four few-shot target datasets.
labels_origin = [
    f"data/few_shot/{dataset}/labels.txt"
    for dataset in ("weibo", "note4", "resume", "msra")
]
# Additional label files merged into every target label file.
labels = [
    "data/SuperNER/tags_list.txt",
    "data/lebert/dataset/NER/note4/labels.txt",
]

for origin in labels_origin:
    merged_file = origin.replace("labels.txt", "super_labels.txt")
    # The dataset's own label file is listed first (original note: keep the
    # labels ordered).
    merge_labels([origin] + labels, merged_file)

In [None]:
# Train P-BERT on the predicted dataset.
# LTS branch
# Default keyword arguments for the P-BERT pre-trainer.
pretrain_model_args = dict(
    num_epochs=35,
    num_gpus=[0, 1, 2, 3],

    # Base BERT checkpoint (chinese_wwm_ext).
    bert_config_file_name="./model/chinese_wwm_ext/config.json",
    pretrained_file_name="./model/chinese_wwm_ext/pytorch_model.bin",
    bert_vocab_file="./model/chinese_wwm_ext/vocab.txt",

    max_seq_length=512,
    max_scan_num=1_000_000,

    # Data files: predicted few-shot training split plus dev/test sets.
    train_file="./data/few_shot/weibo/train_250_super_note4_pred.json",
    eval_file="./data/few_shot/weibo/dev.json",
    test_file="./data/few_shot/weibo/test.json",

    tag_file="data/few_shot/weibo/super_labels.txt",

    loader_name="ptloader_v2",

    # Word-embedding resources.
    word_embedding_file="./data/tencent/word_embedding.txt",
    word_vocab_file="./data/tencent/tencent_vocab.txt",
    word_vocab_file_with_tag="./data/tencent/tencent_vocab_with_tag.json",

    default_tag="O",
    batch_size=8,
    eval_batch_size=32,
    pass_none_rule=True,
    skip_single_matched_word=True,
    do_shuffle=True,
    task_name="weibo_pbert_pretrain_250",  # replace with your task name
    use_gpu=True,
    debug=True,

    # Tag name -> Chinese description of the tag.
    tag_rules={
        "O": "非实体",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "CONT": "国家",
        "PRO": "职位",
        "RACE": "种族",
        "TITLE": "工作名称",
        "EDU": "教育经历",
        "NAME": "名字",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "GPE": "政治实体",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
        "NORP": "政体民族",
        "PERSON": "人名",
    },
)

import torch
from CC.pre_trained import NERPreTrainer

In [None]:
# Target datasets and few-shot split sizes to pre-train P-BERT on.
datasets = ["weibo", "note4", "resume", "msra"]
scales = [250, 500, 1000, 1350]

# Per-task overrides; {0} is the dataset name and {1} the split size.
tasks_args = {
    "train_file": "./data/few_shot/{0}/train_{1}_super_note4_pred.json",
    "eval_file": './data/few_shot/{0}/dev.json',
    "test_file": './data/few_shot/{0}/test.json',
    "tag_file": "data/few_shot/{0}/super_labels.txt",
    'task_name': '{0}_pbert_pretrain_{1}',
}

for name in datasets:
    for scale in scales:
        # Fill the templates for this (dataset, scale) pair and overwrite the
        # matching entries of the shared argument dict in place.
        pretrain_model_args.update(
            {key: template.format(name, scale)
             for key, template in tasks_args.items()}
        )
        pre_trainer = NERPreTrainer(**pretrain_model_args)

        # Exhaust the trainer's iterator; ``a`` keeps the last yielded item.
        for a in pre_trainer():
            pass
        torch.cuda.empty_cache()

In [None]:
# Default keyword arguments for the C-BERT fine-tuning trainer.  The
# checkpoint, data-file, tag-file and task-name entries are overwritten per
# task by the fine-tuning loop further down.
model_args = {
    'num_epochs': 30,
    'num_gpus': [0, 1, 2, 3],

    'bert_config_file_name': './model/chinese_wwm_ext/config.json',
    # Fixed: the path used to start with a stray "." (".save_pretrained/..."),
    # which named a different directory than the two prompt_* entries below.
    'pretrained_file_name': 'save_pretrained/weibo_multiple_pretrained/Bert_5915/pytorch_model.bin',
    'prompt_pretrained_file_name': 'save_pretrained/weibo_multiple_pretrained/Bert_5915/pytorch_model.bin',
    'prompt_config_file_name': 'save_pretrained/weibo_multiple_pretrained/Bert_5915/config.json',

    'hidden_dim': 300,  # CRF LSTM (768 was an alternative value tried here)

    'max_seq_length': 150,
    'max_scan_num': 1000000,
    'train_file': './data/weibo/train.json',
    'eval_file': './data/weibo/dev.json',
    'test_file': './data/weibo/test.json',
    'bert_vocab_file': './model/chinese_wwm_ext/vocab.txt',
    'tag_file': './data/weibo/labels.txt',
    'output_eval': True,
    'loader_name': 'ft_loader_v4',
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    'batch_size': 8,
    'eval_batch_size': 32,
    'do_shuffle': True,
    'model_name': 'LEBert',

    # Tag name -> Chinese description of the tag.
    "tag_rules": {
        "O": "非实体",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "CONT": "国家",
        "PRO": "职位",
        "RACE": "种族",
        "TITLE": "工作名称",
        "EDU": "教育经历",
        "NAME": "名字",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "GPE": "政治实体",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
        "NORP": "政体民族",
        "PERSON": "人名",
    },
    'task_name': 'weibo_tag_multiple_3'
}

In [None]:

from CC.enhanced_trainer import EnhancedNERTrainer
import torch

# Datasets to fine-tune on, the few-shot split sizes, and -- paired with the
# sizes by position -- the step number in the name of the P-BERT checkpoint
# directory ("Bert_<step>") to load.
datasets = ["weibo"]
scales = [1350]
steps = [5985]

# Other scale/step combinations noted in the original notebook:
#   scales [250]                 -> steps [1120]
#   scales [250,500,1000,1350]   -> steps [1120,2205,4375,5915]
#   resume                       -> steps [1120,2240,4410,5950]
#   weibo                        -> steps [1120,2240,4445,5985]

# Number of runs per (dataset, scale) pair.
per_task_count = 1

# Per-task overrides; placeholders are {0}=dataset, {1}=scale,
# {2}=run tag ("x<run index>") and {3}=checkpoint step.
tasks_args = {
    "task_name": "{0}_pc_bert_crf_{1}_{2}",

    'prompt_config_file_name': "save_pretrained/{0}_pbert_pretrain_{1}/Bert_{3}/config.json",
    "pretrained_file_name": "save_pretrained/{0}_pbert_pretrain_{1}/Bert_{3}/pytorch_model.bin",
    'prompt_pretrained_file_name': "save_pretrained/{0}_pbert_pretrain_{1}/Bert_{3}/pytorch_model.bin",

    'train_file': './data/few_shot/{0}/train_{1}.json',
    'eval_file': './data/few_shot/{0}/dev.json',
    'test_file': './data/few_shot/{0}/test.json',
    'tag_file': "./data/few_shot/{0}/labels.txt",
}

for dataset in datasets:
    for scale, step in zip(scales, steps):
        for run_idx in range(per_task_count):
            # Fill the templates and overwrite the matching entries of the
            # shared argument dict in place.
            model_args.update(
                {key: template.format(dataset, scale, f"x{run_idx}", step)
                 for key, template in tasks_args.items()}
            )
            trainer = EnhancedNERTrainer(**model_args)

            # Exhaust the trainer's iterator; the yielded items are not used.
            for _ in trainer(lr2=1e-2):
                pass

            torch.cuda.empty_cache()