## Fine-tune

In [None]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

### Fine-tune 参数设置

适用于**le_loader**下的fine-tune任务

In [None]:
# %%
# Keyword arguments that a later cell unpacks into NERTrainer(**args).
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/labels.txt",
    "output_eval": True,
    # Loader selection and lexicon resources
    "loader_name": "le_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    # Batching
    "batch_size": 64,
    "eval_batch_size": 512,
    "do_shuffle": True,
    # Model and run names
    "model_name": "LEBert",
    "task_name": "weibo",
}

### 参数微调

调整`Pretrained`和`task name`

In [None]:
# Re-point the base configuration at the ontonotes5_s files.
# Alternative starting checkpoint, currently disabled:
# args['pretrained_file_name'] = './save_pretrained/Weibo_Super1x_Pretrained/Bert_2550/pytorch_model.bin'
args.update({
    "train_file": "./data/ontonotes5_s/train_02_1.json",
    "eval_file": "./data/ontonotes5_s/dev.json",
    "test_file": "./data/ontonotes5_s/test.json",
    "tag_file": "./data/ontonotes5_s/labels.txt",
    "batch_size": 32,
    "task_name": "ontonotes5_s_02",
})

In [None]:
# Build the trainer from the `args` dict prepared in the cells above.
trainer = NERTrainer(**args)

# Calling the trainer yields an iterable; the loop consumes it to the end.
# `a` keeps the most recently yielded item so it can be inspected afterwards.
# NOTE(review): presumably each iteration advances training - confirm in CC.trainer.
for i in trainer(lr2=1e-2):
    a = i

## PLE EnhancedTraining

### Fine-tune 参数设置

适用于**ft_loader_v1**下的fine-tune任务

In [None]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

# %%
# Keyword arguments that a later cell unpacks into NERTrainer(**args).
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "prompt_pretrained_file_name": "./save_pretrained/Ontonotes5_s_02_Pretrained_v2/Bert_3430/pytorch_model.bin",
    "prompt_config_file_name": "./save_pretrained/Ontonotes5_s_02_Pretrained_v2/Bert_3430/config.json",
    "bert_pretrain_path": "./model/chinese_wwm_ext/",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/ontonotes5/train.json",
    "eval_file": "./data/ontonotes5/dev.json",
    "test_file": "./data/ontonotes5/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/ontonotes5/ontonotes5_labels.txt",
    "output_eval": True,
    # Loader selection and lexicon / entity resources
    "loader_name": "ft_loader_v1",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "tag_embedding_file": "./data/tencent/label_embedding.txt",
    "external_entities_file": "./data/ontonotes5_s/entities_data_label.json",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching
    "batch_size": 64,
    "eval_batch_size": 256,
    "do_shuffle": True,
    "model_name": "PLEBert",
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "CONT": "国家",
        "PRO": "职位",
        "RACE": "种族",
        "TITLE": "工作名称",
        "EDU": "教育经历",
        "NAME": "名字",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
    "task_name": "weibo",
}

### 参数微调

调整`Pretrained`和`task name`

In [None]:
# Re-point the PLE configuration at the ontonotes5_s files and rename the run.
args.update({
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "train_file": "./data/ontonotes5_s/train_02_1.json",
    "eval_file": "./data/ontonotes5_s/dev.json",
    "test_file": "./data/ontonotes5_s/test.json",
    "tag_file": "./data/ontonotes5_s/labels.txt",
    "batch_size": 32,
    "task_name": "ontonotes5_s_02_pc",
})

In [None]:
# Build the trainer from the `args` dict prepared in the cells above.
trainer = NERTrainer(**args)

def out_eval(epoch):
    """Epoch predicate handed to the trainer as ``eval_call_epoch``.

    Returns True for every epoch greater than 6, and for epochs that are
    multiples of three (0, 3, 6); False otherwise.
    """
    return epoch > 6 or epoch % 3 == 0

# Consume the iterable returned by the trainer; `out_eval` is passed as the
# `eval_call_epoch` callback. `a` keeps the most recently yielded item.
for i in trainer(eval_call_epoch=out_eval):
    a = i

## EnhancedTraining
适用于**ft_loader_v4**下的fine-tune任务

In [None]:
import json
from CC.predicter import NERPredict
from CC.enhanced_trainer import EnhancedNERTrainer

# %%
# Keyword arguments that a later cell unpacks into EnhancedNERTrainer(**args).
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "prompt_pretrained_file_name": "./save_pretrained/Ontonotes5_s_02_Pretrained_v2/Bert_3430/pytorch_model.bin",
    "prompt_config_file_name": "./save_pretrained/Ontonotes5_s_02_Pretrained_v2/Bert_3430/config.json",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/ontonotes5/train.json",
    "eval_file": "./data/ontonotes5/dev.json",
    "test_file": "./data/ontonotes5/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/ontonotes5/ontonotes5_labels.txt",
    "output_eval": True,
    # Loader selection and lexicon resources
    "loader_name": "ft_loader_v4",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    # Batching
    "batch_size": 64,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "model_name": "LEBert",
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "O": "其他",
        "LOC": "地名",
        "NORP": "政体民教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "CONT": "国家",
        "PRO": "职位",
        "RACE": "种族",
        "TITLE": "工作名称",
        "EDU": "教育经历",
        "NAME": "名字",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
    "task_name": "weibo",
}

In [None]:
# Swap in the pre-trained checkpoints and the ontonotes5_s 0.5k files.
args.update({
    "pretrained_file_name": "./save_pretrained/Ontonotes5_0.5k_Pretrained_v2/Bert_560/pytorch_model.bin",
    "prompt_pretrained_file_name": "./save_pretrained/Ontonotes5_Pretrained_v2/Bert_18030/pytorch_model.bin",
    "train_file": "./data/ontonotes5_s/train_0.5k.json",
    "eval_file": "./data/ontonotes5_s/dev_remade.json",
    "test_file": "./data/ontonotes5_s/test.json",
    "tag_file": "./data/ontonotes5_s/labels.txt",
    "batch_size": 8,
    "task_name": "enhanced_ontonotes5_s_0.5k",
})

In [None]:
# Build the enhanced trainer from the `args` dict prepared in the cells above.
trainer = EnhancedNERTrainer(**args)

# Consume the iterable returned by the trainer; `a` keeps the most recently
# yielded item so it can be inspected afterwards.
for i in trainer(lr2=1e-2):
    a = i

## GPT2-预训练
loader: ptloader_v1

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.gpt_pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "gpt_config_file_name": "./model/gpt2-chinese/config.json",
    "pretrained_file_name": "./model/gpt2-chinese/pytorch_model.bin",
    "max_seq_length": 256,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    # Loader selection and lexicon resources
    "loader_name": "ptloader_v1",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Weibo_Full_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Alternative starting checkpoint, currently disabled:
# args['pretrained_file_name'] = './save_pretrained/GPT_2_Ontonotes5_s_02_Pretrained/GPT-2_5880/pytorch_model.bin'
# Keep max_scan_num an int, the same type the base configuration above uses;
# a string here would break any numeric comparison against it.
args['max_scan_num'] = 1000000
args['train_file'] = './data/ontonotes5_s/train_02_1.json'
args['eval_file'] = './data/ontonotes5_s/dev.json'
args['test_file'] = './data/ontonotes5_s/test.json'
args['tag_file'] = './data/ontonotes5_s/labels.txt'
args['task_name'] = 'GPT_2_Ontonotes5_s_02_Pretrained'

# Build the pre-trainer from the finished configuration.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer; `a` keeps the most
# recently yielded item so it can be inspected afterwards.
for i in pre_trainer():
    a = i

## Super NER的Fine-tune

In [None]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

# %%
# Keyword arguments unpacked into NERTrainer(**args) further down this cell.
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/Super_x_Ontonotes5/pre_train_0.2x.json",
    "eval_file": "./data/Super_x_Ontonotes5/pre_dev.json",
    "test_file": "./data/Super_x_Ontonotes5/pre_dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/Super_x_Ontonotes5/tags_list.txt",
    "output_eval": True,
    # Loader selection and lexicon resources
    "loader_name": "le_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    # Batching
    "batch_size": 64,
    "eval_batch_size": 512,
    "do_shuffle": True,
    # Model and run names
    "model_name": "LEBert",
    "task_name": "super_x_ontonotes",
}

# Build the trainer from the `args` dict defined above.
trainer = NERTrainer(**args)

# Consume the iterable returned by the trainer. `resume_path` / `resume_step`
# are forwarded as given. NOTE(review): presumably they restore the state saved
# at step 2586 under ./save_model/super_x_ontonotes - confirm in CC.trainer.
for i in trainer(lr2=1e-2, resume_path='./save_model/super_x_ontonotes', resume_step=2586):
    a = i

## Super NER的预训练

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1500000,
    # Data, vocabulary and label files
    "train_file": "./data/SuperNER/pre_train.json",
    "eval_file": "./data/SuperNER/pre_dev.json",
    "test_file": "./data/SuperNER/pre_test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/SuperNER/tags_list.txt",
    # Loader selection and lexicon resources
    "loader_name": "lex_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and run switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "task_name": "Pre_trained",
    "use_gpu": True,
    "debug": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the `args` dict defined above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer; `a` keeps the most
# recently yielded item so it can be inspected afterwards.
for i in pre_trainer():
    a = i

## Weibo的预训练
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    # Loader selection and lexicon resources
    "loader_name": "lex_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Weibo_Full_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the `args` dict defined above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer; `a` keeps the most
# recently yielded item so it can be inspected afterwards.
for i in pre_trainer():
    a = i

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
# Several entries are overridden right after this literal.
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    # Loader selection and lexicon resources
    "loader_name": "lex_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag_combine.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Ontonotes5_02_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "PRODUCT": "产品",
        "FAC": "场地",
        "ORDINAL": "排名",
        "QUANTITY": "无单位数量",
        "CARDINAL": "有单位数量",
        "EVENT": "事件",
        "MONEY": "金额",
        "DATE": "日期",
        "LOC": "地名",
        "WORK_OF_ART": "作品",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "PERCENT": "百分数",
        "LANGUAGE": "语言",
        "GPE": "政体",
        "PERSON": "人名",
        "LAW": "法文",
        "TIME": "时间",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Re-point the configuration at the Ontonotes5 "003" files and checkpoint.
args['pretrained_file_name'] = './save_pretrained/Ontonotes5_003_Pretrained/Bert_1290/pytorch_model.bin'
# Keep max_scan_num an int, the same type the base configuration above uses;
# a string here would break any numeric comparison against it.
args['max_scan_num'] = 1000000
args['train_file'] = './data/ontonotes5/train_003.json'
args['eval_file'] = './data/ontonotes5/dev.json'
args['test_file'] = './data/ontonotes5/test.json'
args['tag_file'] = './data/ontonotes5/ontonotes5_pretrained_labels.txt'
args['task_name'] = 'Ontonotes5_003_Pretrained'

# Build the pre-trainer from the finished configuration.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer (called with lr=1e-5);
# `a` keeps the most recently yielded item so it can be inspected afterwards.
for i in pre_trainer(lr=1e-5):
    a = i


## Ontonotes5(Four Labels)的预训练
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
# Several entries are overridden right after this literal.
args = {
    # Run length and devices
    "num_epochs": 35,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    # Loader selection and lexicon resources
    "loader_name": "lex_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Ontonotes5_02_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Re-point the configuration at the ontonotes5_s files and a saved checkpoint.
args['pretrained_file_name'] = 'save_pretrained/Ontonotes5_s_02_Pretrained/Bert_1960/pytorch_model.bin'
# Keep max_scan_num an int, the same type the base configuration above uses;
# a string here would break any numeric comparison against it.
args['max_scan_num'] = 1000000
args['train_file'] = './data/ontonotes5_s/train_02_1.json'
args['eval_file'] = './data/ontonotes5_s/dev.json'
args['test_file'] = './data/ontonotes5_s/test.json'
args['tag_file'] = './data/ontonotes5_s/pretrained_labels_ori.txt'
args['task_name'] = 'Ontonotes5_s_02_Pretrained'

# Build the pre-trainer from the finished configuration.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer (called with lr=1e-5);
# `a` keeps the most recently yielded item so it can be inspected afterwards.
for i in pre_trainer(lr=1e-5):
    a = i

## Ontonotes5(Four Labels)的PTV2预训练
loader: ptloader_v2

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
# Several entries are overridden right after this literal.
args = {
    # Run length and devices
    "num_epochs": 35,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    # Loader selection and lexicon resources
    "loader_name": "ptloader_v2",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Ontonotes5_02_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "O": "其他",
        "LOC": "地名",
        "NORP": "政体民教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Re-point the configuration at the ontonotes5_s 0.5k files and checkpoint.
args['pretrained_file_name'] = './save_pretrained/Ontonotes5_s_0.5k_Pretrained_v2/Bert_560/pytorch_model.bin'
# Keep max_scan_num an int, the same type the base configuration above uses;
# a string here would break any numeric comparison against it.
args['max_scan_num'] = 1000000
args['train_file'] = './data/ontonotes5_s/train_0.5kx5.json'
args['eval_file'] = './data/ontonotes5_s/dev.json'
args['test_file'] = './data/ontonotes5_s/test.json'
args['tag_file'] = './data/ontonotes5_s/pretrained_labels_ori.txt'
args['task_name'] = 'Ontonotes5_s_0.5k_Pretrained_v2'

# Build the pre-trainer from the finished configuration.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer; `a` keeps the most
# recently yielded item so it can be inspected afterwards.
for i in pre_trainer():
    a = i

## Ontonotes5的PTV2预训练
loader: ptloader_v2

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
# Several entries are overridden right after this literal.
args = {
    # Run length and devices
    "num_epochs": 35,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    # Loader selection and lexicon resources
    "loader_name": "ptloader_v2",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Ontonotes5_02_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "O": "其他",
        "PRODUCT": "产品",
        "FAC": "场地",
        "ORDINAL": "排名",
        "QUANTITY": "数量",
        "CARDINAL": "单位数量",
        "EVENT": "事件",
        "MONEY": "金额",
        "DATE": "日期",
        "LOC": "地名",
        "WORK_OF_ART": "作品",
        "NORP": "政体民教",
        "ORG": "机构",
        "PERCENT": "百分数",
        "LANGUAGE": "语言",
        "GPE": "政体",
        "PERSON": "人名",
        "LAW": "法文",
        "TIME": "时间",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Re-point the configuration at the Ontonotes5 0.5k files and checkpoint.
args['pretrained_file_name'] = './save_pretrained/Ontonotes5_0.5k_Pretrained_v2/Bert_560/pytorch_model.bin'
# Keep max_scan_num an int, the same type the base configuration above uses;
# a string here would break any numeric comparison against it.
args['max_scan_num'] = 1000000
args['train_file'] = './data/ontonotes5/train_0.5k.json'
# NOTE(review): '_.json' looks like a placeholder file name - confirm whether
# './data/ontonotes5/dev.json' was intended before relying on eval results.
args['eval_file'] = './data/ontonotes5/_.json'
args['test_file'] = './data/ontonotes5/test.json'
args['tag_file'] = './data/ontonotes5/ontonotes5_pretrained_labels.txt'
args['task_name'] = 'Ontonotes5_0.5k_Pretrained_v2'

# Build the pre-trainer from the finished configuration.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer (called with lr=1e-5);
# `a` keeps the most recently yielded item so it can be inspected afterwards.
for i in pre_trainer(lr=1e-5):
    a = i

## Weibo的预训练(屏蔽无关实体)
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    # Loader selection and lexicon resources
    "loader_name": "lex_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and run switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "task_name": "Weibo_Full_Pretrained",
    "use_gpu": True,
    "debug": True,
    "pass_none_rule": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "government": "政府",
    },
}

# Explicitly (re)state the checkpoint, data files and run name for this run.
args['pretrained_file_name'] = './model/chinese_wwm_ext/pytorch_model.bin'
# Keep max_scan_num an int, the same type the base configuration above uses;
# a string here would break any numeric comparison against it.
args['max_scan_num'] = 1000000
args['train_file'] = './data/weibo/train.json'
args['tag_file'] = './data/weibo/pretrained_labels.txt'
args['task_name'] = 'Weibo_Full_Pretrained'

# Build the pre-trainer from the finished configuration.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer; `a` keeps the most
# recently yielded item so it can be inspected afterwards.
for i in pre_trainer():
    a = i

## 适用于`CONLL`的预训练
loader: cnx_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
# Several entries are overridden right after this literal.
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibonew/train.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "cnx_loader",
    # Batching and run switches
    "batch_size": 16,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,
    "model_name": "Bert",
    "task_name": "Weibo_CNX_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "PRODUCT": "产品",
        "FAC": "场地",
        "ORDINAL": "排名",
        "QUANTITY": "无单位数量",
        "CARDINAL": "有单位数量",
        "EVENT": "事件",
        "MONEY": "金额",
        "DATE": "日期",
        "LOC": "地名",
        "WORK_OF_ART": "作品",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "PERCENT": "百分数",
        "LANGUAGE": "语言",
        "GPE": "政体",
        "PERSON": "人名",
        "LAW": "法文",
        "TIME": "时间",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Re-point the configuration at the Ontonotes5 "02" files and checkpoint.
args.update({
    "pretrained_file_name": "./save_pretrained/Ontonotes5_02_Pretrained/Bert_2886/pytorch_model.bin",
    "train_file": "./data/ontonotes5/train_02_1.json",
    "tag_file": "./data/ontonotes5/ontonotes5_pretrained_labels.txt",
    "task_name": "Ontonotes5_02_Pretrained",
})

# Build the pre-trainer from the finished configuration.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer (called with lr=1e-5);
# `a` keeps the most recently yielded item so it can be inspected afterwards.
for i in pre_trainer(lr=1e-5):
    a = i

## Weibo 预训练x纠错

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments unpacked into NERPreTrainer(**args) further down this cell.
args = {
    # Run length and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Model configuration / checkpoint paths and sizes
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Data, vocabulary and label files
    "train_file": "./data/weibo/train_5x.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    # Loader selection and lexicon resources
    "loader_name": "labellex_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and run switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "task_name": "Weibo_Full20xC_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the `args` dict defined above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by the pre-trainer; `a` keeps the most
# recently yielded item so it can be inspected afterwards.
for i in pre_trainer():
    a = i

## 数据集扩展

In [None]:
from tools.expand_data import *

# Chain: construct DataExpand, read the 0.5k training file, process it, and
# write the result to ./train_5x.json. `loader` ends up bound to whatever
# to_file() returns.
# NOTE(review): the file name, the rate 1.0 and the factor 5 are given both to
# the constructor and again to read_data_set()/process_data(); with
# "auto_loader": False the constructor copies are presumably unused - confirm
# against tools.expand_data.
loader = DataExpand(**{
    "auto_loader": False,
    "debug": True,
    "file_name": "./data/ontonotes5_s/train_0.5k.json",
    "random_rate": 1.0,
    "expansion_rate": 5
}).read_data_set("./data/ontonotes5_s/train_0.5k.json", 1.0) \
    .process_data(5) \
    .to_file("./train_5x.json")

## GPT-2预测

In [None]:
import sys
from transformers import BertTokenizer, BertForMaskedLM

# The tokenizer vocabulary is read from the gpt2-chinese directory, while the
# model loaded below is a BERT masked-LM checkpoint.
# NOTE(review): the loader configuration in the next cell points at
# './model/chinese_wwm_ext/vocab.txt' - confirm the two vocab files are identical.
tokenizer = BertTokenizer.from_pretrained('./model/gpt2-chinese/vocab.txt')

# Load the saved masked-LM checkpoint directory and move the model to the GPU.
model = BertForMaskedLM.from_pretrained('./save_pretrained/Ontonotes5_s_02_Pretrained_v2/Bert_3430')
model.cuda()

In [None]:
from CC.loaders.pretrain.ptloader_v2 import PTLoaderV2

# Keyword arguments unpacked into PTLoaderV2(**args) further down this cell.
# Note that "train_file" and "eval_file" both point at the dev split.
args = {
    # Batching
    "batch_size": 16,
    "eval_batch_size": 64,
    # Lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    # Data, label and vocabulary files
    "train_file": "./data/ontonotes5_s/dev.json",
    "eval_file": "./data/ontonotes5_s/dev.json",
    "test_file": "./data/ontonotes5_s/test.json",
    "tag_file": "./data/ontonotes5_s/labels.txt",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "max_seq_length": 150,
    "default_tag": "O",
    "output_eval": True,
    "task_name": "GPT_2_Ontonotes5_s_02_Pretrained",
    # Label name -> Chinese phrase lookup (insertion order kept as-is)
    "tag_rules": {
        "O": "其他",
        "LOC": "地名",
        "NORP": "政体民教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Instantiate the loader and call it; the result is indexed by string keys.
# Because "train_file" above points at dev.json, the "train" objects below are
# built from the dev split.
ptv2 = PTLoaderV2(**args)
dataloader = ptv2()
train_data = dataloader['train_set']
train_iter = dataloader['train_iter']

In [None]:
import torch
import torch.nn as nn

# Inference only: run under no_grad so autograd does not record the forward
# pass and keep activations alive on the GPU.
with torch.no_grad():
    for it in train_iter:
        # Cast every tensor in the batch to int64 and move it to the GPU.
        for key in it.keys():
            it[key] = it[key].long().cuda()
        outputs = model(input_ids=it['input_ids'], attention_mask=it['attention_mask'],
                        token_type_ids=it['token_type_ids'])
        pred_scores = outputs.logits
        pred_scores = nn.Softmax(dim=-1)(pred_scores)
        # Index of the highest-scoring vocabulary entry at each position.
        ids = pred_scores.max(dim=-1)[1]
        # Show the decoded prediction for the first sequence of the batch.
        sys.stdout.write('\r{}'.format(tokenizer.decode(ids[0])))
        sys.stdout.flush()
        # Only the first batch is inspected.
        break

In [23]:
# Decode the reference label ids of sample 2549 back into text.
tokenizer.decode(train_data[2549]['origin_labels'])

'[CLS] 福 建 省 是 中 国 政 府 决 定 在 九 十 年 代 加 速 发 展 并 争 取 用 二 十 年 时 间 基 本 实 现 现 代 化 的 四 个 地 区 之 一 ， 同 时 也 是 中 国 力 图 通 过 发 展 高 新 技 术 增 强 经 济 发 展 后 劲 的 重 点 地 区 。 [SEP] 1 是 政 体 [PAD] [PAD], 3 是 政 体 [PAD] [PAD], 4 是 其 他 [PAD] [PAD], 5 是 政 体 [PAD] [PAD], 6 是 政 体 [PAD] [PAD], 7 是 机 构 [PAD] [PAD], 8 是 机 构 [PAD] [PAD], 9 是 其 他 [PAD] [PAD], 4 7 是 其 他 [PAD] [PAD], 4 8 是 政 体 [PAD] [PAD], [PAD] [PAD] [PAD]'

In [18]:
# Decode the predicted token ids of the first sequence in the last processed batch.
tokenizer.decode(ids[0])

'[CLS] 澳 门 机 场 ， 单 是 去 年 就 有 近 百 万 人 次 的 台 湾 旅 客 取 道 澳 门 前 往 中 国 大 陆 或 入 境 澳 门 ， 澳 门 机 场 的 主 要 旅 客 中 ， 台 湾 客 人 占 了 八 成 。 [SEP] 1 是 其 他 [PAD] [PAD], 1 7 是 其 他 [PAD] [PAD], 1 8 是 政 体 [PAD] [PAD], 1 9 是 政 他 [PAD] [PAD], 2 0 是 其 他 [PAD] [PAD], 2 3 是 其 他 [PAD] [PAD], 2 4 是 其 他 [PAD] [PAD], 2 5 是 政 体 [PAD] [PAD], 2 6 是 政 体 [PAD] [PAD], 2 7 是 政 体 [PAD] [PAD], 2 8 是 其 他 [PAD] [PAD], [PAD] [PAD] [PAD] [PAD]'

In [None]:
# Inspect one field of the last processed batch; the commented-out lines are
# alternative fields to look at.
# it['input_ids'][1]
it['origin_labels'][0]
# it['attention_mask'][1]

In [None]:
import json

# Entity types that are kept; every other label is rewritten to 'O'.
legal = ['PERSON', 'GPE', 'ORG', 'LOC']

result = []

# The input holds one JSON record per line. Blank lines are skipped instead of
# blindly dropping the final element, so the last record is kept even when the
# file does not end with a newline. UTF-8 is explicit because the data is
# Chinese text and the platform default encoding may differ.
with open('./data/ontonotes5/dev.json', encoding='utf-8') as f:
    ori_list = [line for line in f.read().split('\n') if line.strip()]

for item in ori_list:
    item = json.loads(item)
    for idx, tag in enumerate(item['label']):
        # Tags are either the bare 'O' or '<prefix>-<TYPE>'; take the part after
        # the first '-' (a tag without '-' is compared as-is, no IndexError).
        t = tag.split('-', 1)[-1]
        if t not in legal:
            item['label'][idx] = 'O'
    result.append(item)

# Open with 'w' so that re-running the cell regenerates the file rather than
# appending a duplicate copy of every record.
with open('./pure.json', mode='w', encoding='utf-8') as f:
    for item in result:
        f.write('{}\n'.format(json.dumps(item, ensure_ascii=False)))