## Fine-tune

In [None]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

### Fine-tune 参数设置

适用于**le_loader**下的fine-tune任务

In [None]:
# %%
# Base LEBert fine-tuning configuration; unpacked into NERTrainer(**args) in a later cell.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Weibo dataset splits and label list
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/labels.txt",
    "output_eval": True,
    "loader_name": "le_loader",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    # Batching
    "batch_size": 64,
    "eval_batch_size": 512,
    "do_shuffle": True,
    "model_name": "LEBert",
    "task_name": "weibo",
}

### 参数微调

调整`Pretrained`和`task name`

In [None]:
# Per-run overrides on top of the base config: switch to the ontonotes5_s data,
# halve the batch size and rename the task.
# Alternative checkpoint, left disabled:
#   args['pretrained_file_name'] = './save_pretrained/Weibo_Super1x_Pretrained/Bert_2550/pytorch_model.bin'
args.update({
    "train_file": "./data/ontonotes5_s/train_02_1.json",
    "eval_file": "./data/ontonotes5_s/dev.json",
    "test_file": "./data/ontonotes5_s/test.json",
    "tag_file": "./data/ontonotes5_s/labels.txt",
    "batch_size": 32,
    "task_name": "ontonotes5_s_02",
})

In [None]:
# Build the trainer from the `args` configuration assembled in the cells above.
trainer = NERTrainer(**args)

# Calling the trainer with lr2=1e-2 returns an iterable that is consumed here;
# `a` keeps the most recently yielded item (its contents are not visible from
# this notebook — check CC.trainer).
for i in trainer(lr2=1e-2):
    a = i

## PLEBert Fine-tune

### Fine-tune 参数设置

适用于**ft_loader_v1**下的fine-tune任务

In [None]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

# %%
# PLEBert fine-tuning configuration; unpacked into NERTrainer(**args) in a later cell.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "bert_pretrain_path": "./model/chinese_wwm_ext/",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Dataset splits and label list
    "train_file": "./data/ontonotes5/train.json",
    "eval_file": "./data/ontonotes5/dev.json",
    "test_file": "./data/ontonotes5/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/ontonotes5/ontonotes5_labels.txt",
    "output_eval": True,
    "loader_name": "ft_loader_v1",
    # Lexicon, label-embedding and external-entity resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "tag_embedding_file": "./data/tencent/label_embedding.txt",
    "external_entities_file": "./data/ontonotes5_s/entities_data_label.json",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching
    "batch_size": 64,
    "eval_batch_size": 256,
    "do_shuffle": True,
    "model_name": "PLEBert",
    # Label code -> Chinese description
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "CONT": "国家",
        "PRO": "职位",
        "RACE": "种族",
        "TITLE": "工作名称",
        "EDU": "教育经历",
        "NAME": "名字",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
    "task_name": "weibo",
}

### 参数微调

调整`Pretrained`和`task name`

In [None]:
# Per-run overrides on top of the PLEBert config: checkpoint, ontonotes5_s data,
# smaller batch size and a run-specific task name.
args.update({
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "train_file": "./data/ontonotes5_s/train_02_1.json",
    "eval_file": "./data/ontonotes5_s/dev.json",
    "test_file": "./data/ontonotes5_s/test.json",
    "tag_file": "./data/ontonotes5_s/labels.txt",
    "batch_size": 32,
    "task_name": "ontonotes5_s_02_pc",
})

In [None]:
# Build the trainer from the `args` configuration assembled in the cells above.
trainer = NERTrainer(**args)

def out_eval(epoch):
    """Epoch predicate handed to the trainer as `eval_call_epoch`.

    Returns True for every epoch greater than 6; for epochs up to and
    including 6 it returns True only when the epoch is a multiple of 3.
    """
    return True if epoch > 6 else epoch % 3 == 0

# Consume the iterable returned by the trainer, passing `out_eval` as the
# `eval_call_epoch` callback; `a` keeps the most recently yielded item.
for i in trainer(eval_call_epoch=out_eval):
    a = i

## EnhancedTraining
适用于**ft_loader_v3**下的fine-tune任务

In [1]:
import json
from CC.predicter import NERPredict
from CC.enhanced_trainer import EnhancedNERTrainer

# %%
# EnhancedNERTrainer configuration; unpacked into EnhancedNERTrainer(**args) in a later cell.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    # Prompt model files
    "prompt_pretrained_file_name": "./save_pretrained/GPT_2_Ontonotes5_s_02_Pretrained/GPT-2_5880/pytorch_model.bin",
    "prompt_config_file_name": "./model/gpt2-chinese/config.json",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Dataset splits and label list
    "train_file": "./data/ontonotes5/train.json",
    "eval_file": "./data/ontonotes5/dev.json",
    "test_file": "./data/ontonotes5/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/ontonotes5/ontonotes5_labels.txt",
    "output_eval": True,
    "loader_name": "ft_loader_v3",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    # Batching
    "batch_size": 64,
    "eval_batch_size": 32,
    "do_shuffle": True,
    "model_name": "LEBert",
    # Label code -> Chinese description
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "CONT": "国家",
        "PRO": "职位",
        "RACE": "种族",
        "TITLE": "工作名称",
        "EDU": "教育经历",
        "NAME": "名字",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
    "task_name": "weibo",
}

In [2]:
# Per-run overrides on top of the enhanced-training config: checkpoints,
# ontonotes5_s data, smaller batch size and a run-specific task name.
args.update({
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "prompt_pretrained_file_name": "./save_pretrained/Ontonotes5_02_Pretrained/Bert_02_100w_lts2_ori/pytorch_model.bin",
    "train_file": "./data/ontonotes5_s/train_02_1.json",
    "eval_file": "./data/ontonotes5_s/dev.json",
    "test_file": "./data/ontonotes5_s/test.json",
    "tag_file": "./data/ontonotes5_s/labels.txt",
    "batch_size": 32,
    "task_name": "sim_ontonotes5_s_02",
})

In [3]:
# Build the enhanced trainer from the `args` configuration assembled in the cells above.
trainer = EnhancedNERTrainer(**args)

# Consume the iterable returned by calling the trainer with lr2=1e-2;
# `a` keeps the most recently yielded item.
# NOTE(review): the output recorded below this cell ends in a CUDA out-of-memory
# error on GPU 0, so this configuration may not fit on a 12 GiB card.
for i in trainer(lr2=1e-2):
    a = i

calculate ./data/ontonotes5_s/train_02_1.json etag: 100%|██████████| 1.24M/1.24M [00:00<00:00, 269MB/s]
calculate ./data/ontonotes5_s/dev.json etag: 100%|██████████| 2.53M/2.53M [00:00<00:00, 312MB/s]
calculate ./data/ontonotes5_s/test.json etag: 100%|██████████| 2.51M/2.51M [00:00<00:00, 334MB/s]
calculate ./data/ontonotes5_s/labels.txt etag: 100%|██████████| 110/110 [00:00<00:00, 191kB/s]

kwargs parser: {
    "batch_size": 32,
    "eval_batch_size": 64,
    "test_batch_size": 16,
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "train_file": "./data/ontonotes5_s/train_02_1.json",
    "eval_file": "./data/ontonotes5_s/dev.json",
    "test_file": "./data/ontonotes5_s/test.json",
    "tag_file": "./data/ontonotes5_s/labels.txt",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "output_eval": true,
    "max_scan_num": 1000000,
    "add_seq_vocab": false,
    "max_seq_length": 150,
    "max_word_num": 5,
    "max_label_num": 5,
    "default_tag": "O",
    "use_test": false,
    "do_shuffle": true,
    "do_predict": false,
    "task_name": "sim_ontonotes5_s_02",
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "


count line size ./data/ontonotes5_s/labels.txt: 18L [00:00, 144354.63L/s]
build line mapper: 18L [00:00, 140853.49L/s]
load vocab from files: 100%|██████████| 18/18 [00:00<00:00, 6105.25it/s]
load vocab from list: 100%|██████████| 17/17 [00:00<00:00, 162052.65it/s]
count line size ./data/ontonotes5_s/train_02_1.json: 3132L [00:00, 1257809.28L/s]
load dataset from ./data/ontonotes5_s/train_02_1.json:   0%|          | 0/3132 [00:00<?, ?it/s]

load cached ./temp/e8f00bac3c46f8256b55702fef095739_78e6ea77f57ba7f0bd789b863d786fd2_ce9c592468b96a018ce16213d4861974_a83264c5149f26d36ffa7fa77eac7624/1000000/matched_words
load cached ./temp/e8f00bac3c46f8256b55702fef095739_78e6ea77f57ba7f0bd789b863d786fd2_ce9c592468b96a018ce16213d4861974_a83264c5149f26d36ffa7fa77eac7624/1000000/word_vocab
load cached ./temp/e8f00bac3c46f8256b55702fef095739_78e6ea77f57ba7f0bd789b863d786fd2_ce9c592468b96a018ce16213d4861974_a83264c5149f26d36ffa7fa77eac7624/1000000/vocab_embedding


load dataset from ./data/ontonotes5_s/train_02_1.json: 100%|█████████▉| 3131/3132 [00:26<00:00, 119.24it/s]
count line size ./data/ontonotes5_s/dev.json: 4302L [00:00, 944459.35L/s]
load dataset from ./data/ontonotes5_s/dev.json: 100%|█████████▉| 4301/4302 [00:29<00:00, 143.74it/s]


Load pretrained embedding from file.........


Some weights of the model checkpoint at ./model/chinese_wwm_ext/pytorch_model.bin were not used when initializing LEBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing LEBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LEBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LEBertModel were not initialized from the model checkpoint at ./model/chinese_wwm_ext/pytorch_model.bin and a

RuntimeError: CUDA out of memory. Tried to allocate 58.00 MiB (GPU 0; 11.91 GiB total capacity; 10.27 GiB already allocated; 40.94 MiB free; 11.08 GiB reserved in total by PyTorch)

## GPT2-预训练
loader: ptloader_v1

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.gpt_pre_trained import NERPreTrainer

In [None]:
# GPT-2 pre-training configuration; unpacked into NERPreTrainer(**args) below.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # GPT-2 backbone files
    "gpt_config_file_name": "./model/gpt2-chinese/config.json",
    "pretrained_file_name": "./model/gpt2-chinese/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Default dataset splits and label list (overridden below)
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "ptloader_v1",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 16,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Weibo_Full_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label code -> Chinese description
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Per-run overrides: ontonotes5_s data and a run-specific task name.
# Alternative checkpoint, left disabled:
#   args['pretrained_file_name'] = 'save_pretrained/Ontonotes5_s_02_Pretrained/Bert_1960/pytorch_model.bin'
# max_scan_num must stay an int, like the default above and every other config
# in this notebook; a string here would change the type the loader receives.
args["max_scan_num"] = 1000000
args["train_file"] = "./data/ontonotes5_s/train_02_1.json"
args["eval_file"] = "./data/ontonotes5_s/dev.json"
args["test_file"] = "./data/ontonotes5_s/test.json"
args["tag_file"] = "./data/ontonotes5_s/labels.txt"
args["task_name"] = "GPT_2_Ontonotes5_s_02_Pretrained"

# Build the pre-trainer from the `args` configuration above.
# NOTE(review): this section imports NERPreTrainer from CC.gpt_pre_trained, while
# other sections import the same name from CC.pre_trained — run this section's
# import cell right before this one so the intended class is bound.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by calling the pre-trainer; `a` keeps the most
# recently yielded item.
for i in pre_trainer():
    a = i

## Super NER的Fine-tune

In [None]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

# %%
# Super NER fine-tuning configuration; unpacked into NERTrainer(**args) below.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Dataset files; eval and test both point at pre_dev.json
    "train_file": "./data/Super_x_Ontonotes5/pre_train_0.2x.json",
    "eval_file": "./data/Super_x_Ontonotes5/pre_dev.json",
    "test_file": "./data/Super_x_Ontonotes5/pre_dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/Super_x_Ontonotes5/tags_list.txt",
    "output_eval": True,
    "loader_name": "le_loader",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    # Batching
    "batch_size": 64,
    "eval_batch_size": 512,
    "do_shuffle": True,
    "model_name": "LEBert",
    "task_name": "super_x_ontonotes",
}

# Build the trainer from the `args` configuration above.
trainer = NERTrainer(**args)

# Consume the iterable returned by the trainer; the call passes a resume path and
# step (presumably continuing from a checkpoint saved at step 2586 — verify
# against CC.trainer). `a` keeps the most recently yielded item.
for i in trainer(lr2=1e-2, resume_path='./save_model/super_x_ontonotes', resume_step=2586):
    a = i

## Super NER的预训练

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Super NER pre-training configuration; unpacked into NERPreTrainer(**args) below.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1500000,
    # Dataset splits and label list
    "train_file": "./data/SuperNER/pre_train.json",
    "eval_file": "./data/SuperNER/pre_dev.json",
    "test_file": "./data/SuperNER/pre_test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/SuperNER/tags_list.txt",
    "loader_name": "lex_loader",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and runtime switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "task_name": "Pre_trained",
    "use_gpu": True,
    "debug": True,
    # Label code -> Chinese description
    "tag_rules": {
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the `args` configuration above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by calling the pre-trainer; `a` keeps the most
# recently yielded item.
for i in pre_trainer():
    a = i

## Weibo的预训练
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Weibo pre-training configuration; unpacked into NERPreTrainer(**args) below.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Weibo dataset splits and label list
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "lex_loader",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Weibo_Full_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label code -> Chinese description
    "tag_rules": {
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the `args` configuration above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by calling the pre-trainer; `a` keeps the most
# recently yielded item.
for i in pre_trainer():
    a = i

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Ontonotes5 pre-training configuration; unpacked into NERPreTrainer(**args) below.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Default dataset splits and label list (overridden below)
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "lex_loader",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag_combine.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Ontonotes5_02_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label code -> Chinese description
    "tag_rules": {
        "PRODUCT": "产品",
        "FAC": "场地",
        "ORDINAL": "排名",
        "QUANTITY": "无单位数量",
        "CARDINAL": "有单位数量",
        "EVENT": "事件",
        "MONEY": "金额",
        "DATE": "日期",
        "LOC": "地名",
        "WORK_OF_ART": "作品",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "PERCENT": "百分数",
        "LANGUAGE": "语言",
        "GPE": "政体",
        "PERSON": "人名",
        "LAW": "法文",
        "TIME": "时间",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Per-run overrides: start from a saved checkpoint and use the ontonotes5 files.
args["pretrained_file_name"] = "./save_pretrained/Ontonotes5_003_Pretrained/Bert_1290/pytorch_model.bin"
# max_scan_num must stay an int, like the default above and every other config
# in this notebook; a string here would change the type the loader receives.
args["max_scan_num"] = 1000000
args["train_file"] = "./data/ontonotes5/train_003.json"
args["eval_file"] = "./data/ontonotes5/dev.json"
args["test_file"] = "./data/ontonotes5/test.json"
args["tag_file"] = "./data/ontonotes5/ontonotes5_pretrained_labels.txt"
args["task_name"] = "Ontonotes5_003_Pretrained"

# Build the pre-trainer from the `args` configuration above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by calling the pre-trainer with lr=1e-5;
# `a` keeps the most recently yielded item.
for i in pre_trainer(lr=1e-5):
    a = i


## Ontonotes5(Four Labels)的预训练
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Ontonotes5 (four labels) pre-training configuration; unpacked into NERPreTrainer(**args) below.
args = {
    # Schedule and devices
    "num_epochs": 35,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Default dataset splits and label list (overridden below)
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "lex_loader",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Ontonotes5_02_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label code -> Chinese description
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Per-run overrides: start from a saved checkpoint and use the ontonotes5_s files.
args["pretrained_file_name"] = "save_pretrained/Ontonotes5_s_02_Pretrained/Bert_1960/pytorch_model.bin"
# max_scan_num must stay an int, like the default above and every other config
# in this notebook; a string here would change the type the loader receives.
args["max_scan_num"] = 1000000
args["train_file"] = "./data/ontonotes5_s/train_02_1.json"
args["eval_file"] = "./data/ontonotes5_s/dev.json"
args["test_file"] = "./data/ontonotes5_s/test.json"
args["tag_file"] = "./data/ontonotes5_s/pretrained_labels_ori.txt"
args["task_name"] = "Ontonotes5_s_02_Pretrained"

# Build the pre-trainer from the `args` configuration above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by calling the pre-trainer with lr=1e-5;
# `a` keeps the most recently yielded item.
for i in pre_trainer(lr=1e-5):
    a = i

## Weibo的预训练(屏蔽无关实体)
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Weibo pre-training configuration with a reduced tag_rules table;
# unpacked into NERPreTrainer(**args) below.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Weibo dataset splits and label list
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "lex_loader",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and loader switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "task_name": "Weibo_Full_Pretrained",
    "use_gpu": True,
    "debug": True,
    "pass_none_rule": True,
    # Label code -> Chinese description
    "tag_rules": {
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "government": "政府",
    },
}

# Per-run overrides; these currently repeat the defaults above and are kept as
# the place to tweak a run.
args["pretrained_file_name"] = "./model/chinese_wwm_ext/pytorch_model.bin"
# max_scan_num must stay an int, like the default above and every other config
# in this notebook; a string here would change the type the loader receives.
args["max_scan_num"] = 1000000
args["train_file"] = "./data/weibo/train.json"
args["tag_file"] = "./data/weibo/pretrained_labels.txt"
args["task_name"] = "Weibo_Full_Pretrained"

# Build the pre-trainer from the `args` configuration above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by calling the pre-trainer; `a` keeps the most
# recently yielded item.
for i in pre_trainer():
    a = i

## 适用于`CONLL`的预训练
loader: cnx_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# cnx_loader pre-training configuration; unpacked into NERPreTrainer(**args) below.
# Only a training file is configured here (no eval/test entries).
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Default training data and label list (overridden below)
    "train_file": "./data/weibonew/train.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "cnx_loader",
    # Batching and runtime switches
    "batch_size": 16,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,
    "model_name": "Bert",
    "task_name": "Weibo_CNX_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label code -> Chinese description
    "tag_rules": {
        "PRODUCT": "产品",
        "FAC": "场地",
        "ORDINAL": "排名",
        "QUANTITY": "无单位数量",
        "CARDINAL": "有单位数量",
        "EVENT": "事件",
        "MONEY": "金额",
        "DATE": "日期",
        "LOC": "地名",
        "WORK_OF_ART": "作品",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "PERCENT": "百分数",
        "LANGUAGE": "语言",
        "GPE": "政体",
        "PERSON": "人名",
        "LAW": "法文",
        "TIME": "时间",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Per-run overrides: start from a saved checkpoint and use the ontonotes5 files.
args.update({
    "pretrained_file_name": "./save_pretrained/Ontonotes5_02_Pretrained/Bert_2886/pytorch_model.bin",
    "train_file": "./data/ontonotes5/train_02_1.json",
    "tag_file": "./data/ontonotes5/ontonotes5_pretrained_labels.txt",
    "task_name": "Ontonotes5_02_Pretrained",
})

# Build the pre-trainer from the `args` configuration above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by calling the pre-trainer with lr=1e-5;
# `a` keeps the most recently yielded item.
for i in pre_trainer(lr=1e-5):
    a = i

## Weibo 预训练x纠错

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Weibo pre-training configuration using labellex_loader and the 5x training file;
# unpacked into NERPreTrainer(**args) below.
args = {
    # Schedule and devices
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # BERT backbone files
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Weibo dataset splits and label list
    "train_file": "./data/weibo/train_5x.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "labellex_loader",
    # Tencent lexicon resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching and runtime switches
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "task_name": "Weibo_Full20xC_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Label code -> Chinese description
    "tag_rules": {
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the `args` configuration above.
pre_trainer = NERPreTrainer(**args)

# Consume the iterable returned by calling the pre-trainer; `a` keeps the most
# recently yielded item.
for i in pre_trainer():
    a = i

## 数据集扩展

In [None]:
from CC.loaders import *

# Read the Weibo training set, process it with factor 5 and write the result to
# ./train_5x.json. `loader` is bound to whatever `.to_file(...)` returns.
# NOTE(review): the pre-training config elsewhere in this notebook reads
# ./data/weibo/train_5x.json, while this writes to the working directory —
# confirm the file is moved or the path is intended.
loader = (
    LabelLoader(
        auto_loader=False,
        debug=True,
        file_name="./data/weibo/train.json",
        random_rate=1.0,
        expansion_rate=5,
    )
    .read_data_set("./data/weibo/train.json", 1.0)
    .process_data(5)
    .to_file("./train_5x.json")
)

## GPT-2预测单句

In [None]:
import sys
from CC.gpt_predictor import Predictor
from transformers import BertTokenizer

# Load a BertTokenizer from the GPT-2 Chinese vocabulary file.
tokenizer = BertTokenizer.from_pretrained('./model/gpt2-chinese/vocab.txt')

# Build the predictor from the tokenizer and a saved checkpoint directory.
gpt2_predictor = Predictor(tokenizer, './save_pretrained/GPT_2_Ontonotes5_s_02_Pretrained/GPT-2_5880')

In [None]:
# Stream predictions for one sentence. Each yielded item unpacks into two values;
# the second (`ids`) is decoded from its first element. The leading '\r' returns
# to the start of the line so each decoded result overwrites the previous one.
for i in gpt2_predictor.predict_continous('埃斯特拉达被指控的罪名包括了贪污、受贿、背叛大众的信任以及违反宪法等。'):
    _, ids = i
    print('\r{}'.format(tokenizer.decode(ids[0])), end='', flush=True)