## Fine-tune

In [None]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

### Fine-tune 参数设置

适用于**le_loader**下的fine-tune任务

In [None]:
# %%
# Keyword arguments for the LEBert fine-tuning run; later cells override
# individual entries before the dictionary is unpacked into NERTrainer.
args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Dataset files (Weibo)
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/labels.txt",
    "output_eval": True,
    "loader_name": "le_loader",
    # Lexicon resources under ./data/tencent
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    "batch_size": 64,
    "eval_batch_size": 512,
    "do_shuffle": True,
    "model_name": "LEBert",
    "task_name": "weibo",
}

### 参数微调

调整`Pretrained`和`task name`

In [None]:
# Optional: start from a custom pre-trained checkpoint by re-enabling this line.
# args['pretrained_file_name'] = './save_pretrained/Weibo_Super1x_Pretrained/Bert_2550/pytorch_model.bin'
# Redirect the run to the ontonotes5_s files and rename the task accordingly.
args.update(
    train_file="./data/ontonotes5_s/train_02_1.json",
    eval_file="./data/ontonotes5_s/dev.json",
    test_file="./data/ontonotes5_s/test.json",
    tag_file="./data/ontonotes5_s/labels.txt",
    batch_size=32,
    task_name="ontonotes5_s_02",
)

In [None]:
# Build the trainer from the `args` dictionary assembled in the cells above.
trainer = NERTrainer(**args)

# Calling the trainer returns an iterable; the loop exhausts it and keeps the
# most recently yielded item in `a`.
# NOTE(review): presumably each iteration advances training - confirm in CC.trainer.
training_run = trainer(lr2=1e-2)
for step_result in training_run:
    a = step_result

## PLEBert Fine-tune

### Fine-tune 参数设置

适用于**ft_loader_v1**下的fine-tune任务

In [None]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

# %%
# Keyword arguments for the PLEBert fine-tuning run (loader: ft_loader_v1).
# Later cells override individual entries before the dictionary is used.
args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "bert_pretrain_path": "./model/chinese_wwm_ext/",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Dataset files (Ontonotes5)
    "train_file": "./data/ontonotes5/train.json",
    "eval_file": "./data/ontonotes5/dev.json",
    "test_file": "./data/ontonotes5/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/ontonotes5/ontonotes5_labels.txt",
    "output_eval": True,
    "loader_name": "ft_loader_v1",
    # Lexicon, label-embedding and external-entity resources
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "tag_embedding_file": "./data/tencent/label_embedding.txt",
    "external_entities_file": "./data/ontonotes5_s/entities_data_label.json",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 64,
    "eval_batch_size": 256,
    "do_shuffle": True,
    "model_name": "PLEBert",
    # Mapping from tag identifiers to Chinese descriptions.
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "NR": "人名",
        "NS": "地名",
        "NT": "组织机构",
        "CONT": "国家",
        "PRO": "职位",
        "RACE": "种族",
        "TITLE": "工作名称",
        "EDU": "教育经历",
        "NAME": "名字",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
    "task_name": "weibo",
}

### 参数微调

调整`Pretrained`和`task name`

In [None]:
# Override the base configuration for the ontonotes5_s_02_pc task: checkpoint,
# dataset files, label file, batch size and task name.
args.update(
    pretrained_file_name="./model/chinese_wwm_ext/pytorch_model.bin",
    train_file="./data/ontonotes5_s/train_02_1.json",
    eval_file="./data/ontonotes5_s/dev.json",
    test_file="./data/ontonotes5_s/test.json",
    tag_file="./data/ontonotes5_s/labels.txt",
    batch_size=32,
    task_name="ontonotes5_s_02_pc",
)

In [None]:
# Instantiate the trainer, passing every entry of `args` as a keyword argument.
trainer = NERTrainer(**args)

def out_eval(epoch):
    """Return the per-epoch flag handed to the trainer as ``eval_call_epoch``.

    The result is ``True`` for every epoch greater than 6; for epochs up to
    and including 6 it is ``True`` only when the epoch is a multiple of 3.
    NOTE(review): presumably the trainer evaluates when this returns True -
    confirm against CC.trainer.
    """
    return True if epoch > 6 else epoch % 3 == 0

# Exhaust the iterable returned by the trainer, keeping the last yielded item
# in `a`; `out_eval` is supplied as the `eval_call_epoch` callback.
training_run = trainer(eval_call_epoch=out_eval)
for step_result in training_run:
    a = step_result

## Super NER的Fine-tune

In [None]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

# %%
# Keyword arguments for fine-tuning LEBert on the Super_x_Ontonotes5 files.
args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Dataset files; eval and test both point at pre_dev.json.
    "train_file": "./data/Super_x_Ontonotes5/pre_train_0.2x.json",
    "eval_file": "./data/Super_x_Ontonotes5/pre_dev.json",
    "test_file": "./data/Super_x_Ontonotes5/pre_dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/Super_x_Ontonotes5/tags_list.txt",
    "output_eval": True,
    "loader_name": "le_loader",
    # Lexicon resources under ./data/tencent
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    "batch_size": 64,
    "eval_batch_size": 512,
    "do_shuffle": True,
    "model_name": "LEBert",
    "task_name": "super_x_ontonotes",
}

# Build the trainer from `args`, then exhaust the iterable it returns.
# The call passes a resume path and step alongside `lr2`; the last yielded
# item is kept in `a`.
trainer = NERTrainer(**args)

training_run = trainer(
    lr2=1e-2,
    resume_path='./save_model/super_x_ontonotes',
    resume_step=2586,
)
for step_result in training_run:
    a = step_result

## Super NER的预训练

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments for the SuperNER pre-training run (loader: lex_loader).
args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1500000,
    # Dataset files (SuperNER)
    "train_file": "./data/SuperNER/pre_train.json",
    "eval_file": "./data/SuperNER/pre_dev.json",
    "test_file": "./data/SuperNER/pre_test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/SuperNER/tags_list.txt",
    "loader_name": "lex_loader",
    # Lexicon resources under ./data/tencent
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "task_name": "Pre_trained",
    "use_gpu": True,
    "debug": True,
    # Mapping from tag identifiers to Chinese descriptions.
    "tag_rules": {
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from `args` and exhaust the iterable it returns,
# keeping the last yielded item in `a`.
pre_trainer = NERPreTrainer(**args)

pretraining_run = pre_trainer()
for step_result in pretraining_run:
    a = step_result

## Weibo的预训练
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments for the Weibo pre-training run (loader: lex_loader).
args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Dataset files (Weibo)
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "lex_loader",
    # Lexicon resources under ./data/tencent
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Weibo_Full_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Mapping from tag identifiers to Chinese descriptions.
    "tag_rules": {
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from `args` and exhaust the iterable it returns,
# keeping the last yielded item in `a`.
pre_trainer = NERPreTrainer(**args)

pretraining_run = pre_trainer()
for step_result in pretraining_run:
    a = step_result

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Base keyword arguments for an Ontonotes5 pre-training run (loader: lex_loader).
# The file paths below are Weibo defaults; the statements after this dictionary
# replace them before the pre-trainer is built.
args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Dataset files; eval and test both point at dev.json.
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "lex_loader",
    # Lexicon resources under ./data/tencent
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag_combine.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Ontonotes5_02_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Mapping from tag identifiers to Chinese descriptions.
    "tag_rules": {
        "PRODUCT": "产品",
        "FAC": "场地",
        "ORDINAL": "排名",
        "QUANTITY": "无单位数量",
        "CARDINAL": "有单位数量",
        "EVENT": "事件",
        "MONEY": "金额",
        "DATE": "日期",
        "LOC": "地名",
        "WORK_OF_ART": "作品",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "PERCENT": "百分数",
        "LANGUAGE": "语言",
        "GPE": "政体",
        "PERSON": "人名",
        "LAW": "法文",
        "TIME": "时间",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Point the configuration at the Ontonotes5_003_Pretrained task: checkpoint,
# dataset files, label file and task name.
args['pretrained_file_name'] = './save_pretrained/Ontonotes5_003_Pretrained/Bert_1290/pytorch_model.bin'
# Keep `max_scan_num` an int, the type used in the base configuration. It was
# previously overridden with the string '1000000', which cannot be compared or
# combined with integer counters.
args['max_scan_num'] = 1000000
args['train_file'] = './data/ontonotes5/train_003.json'
args['eval_file'] = './data/ontonotes5/dev.json'
args['test_file'] = './data/ontonotes5/test.json'
args['tag_file'] = './data/ontonotes5/ontonotes5_pretrained_labels.txt'
args['task_name'] = 'Ontonotes5_003_Pretrained'

# Build the pre-trainer from the updated configuration.
pre_trainer = NERPreTrainer(**args)

# Exhaust the iterable returned by the pre-trainer (called with lr=1e-5),
# keeping the last yielded item in `a`.
for step_result in pre_trainer(lr=1e-5):
    a = step_result


## Ontonotes5(Four Labels)的预训练
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Base keyword arguments for the four-label Ontonotes5 pre-training run
# (loader: lex_loader). The file paths below are Weibo defaults; the statements
# after this dictionary replace them before the pre-trainer is built.
args = {
    "num_epochs": 35,
    "num_gpus": [0, 1, 2, 3],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Dataset files; eval and test both point at dev.json.
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/dev.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "lex_loader",
    # Lexicon resources under ./data/tencent
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "pass_none_rule": True,
    "skip_single_matched_word": True,
    "do_shuffle": True,
    "task_name": "Ontonotes5_02_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Mapping from tag identifiers to Chinese descriptions.
    "tag_rules": {
        "LOC": "地名",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "GPE": "政体",
        "PERSON": "人名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Point the configuration at the Ontonotes5_s_02_Pretrained task: checkpoint,
# dataset files, label file and task name.
args['pretrained_file_name'] = 'save_pretrained/Ontonotes5_s_02_Pretrained/Bert_1960/pytorch_model.bin'
# Keep `max_scan_num` an int, the type used in the base configuration. It was
# previously overridden with the string '1000000', which cannot be compared or
# combined with integer counters.
args['max_scan_num'] = 1000000
args['train_file'] = './data/ontonotes5_s/train_02_1.json'
args['eval_file'] = './data/ontonotes5_s/dev.json'
args['test_file'] = './data/ontonotes5_s/test.json'
args['tag_file'] = './data/ontonotes5_s/pretrained_labels_ori.txt'
args['task_name'] = 'Ontonotes5_s_02_Pretrained'

# Build the pre-trainer from the updated configuration.
pre_trainer = NERPreTrainer(**args)

# Exhaust the iterable returned by the pre-trainer (called with lr=1e-5),
# keeping the last yielded item in `a`.
for step_result in pre_trainer(lr=1e-5):
    a = step_result

## Weibo的预训练(屏蔽无关实体)
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments for the Weibo pre-training run with a reduced tag-rule set
# (loader: lex_loader, pass_none_rule enabled).
args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Dataset files (Weibo)
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "lex_loader",
    # Lexicon resources under ./data/tencent
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "task_name": "Weibo_Full_Pretrained",
    "use_gpu": True,
    "debug": True,
    "pass_none_rule": True,
    # Mapping from tag identifiers to Chinese descriptions.
    "tag_rules": {
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "government": "政府",
    },
}

# Re-state the checkpoint, training file, label file and task name for the
# Weibo_Full_Pretrained task.
args['pretrained_file_name'] = './model/chinese_wwm_ext/pytorch_model.bin'
# Keep `max_scan_num` an int, the type used in the base configuration. It was
# previously overridden with the string '1000000', which cannot be compared or
# combined with integer counters.
args['max_scan_num'] = 1000000
args['train_file'] = './data/weibo/train.json'
args['tag_file'] = './data/weibo/pretrained_labels.txt'
args['task_name'] = 'Weibo_Full_Pretrained'

# Build the pre-trainer from the updated configuration.
pre_trainer = NERPreTrainer(**args)

# Exhaust the iterable returned by the pre-trainer, keeping the last yielded
# item in `a`.
for step_result in pre_trainer():
    a = step_result

## 适用于`CONLL`的预训练
loader: cnx_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Base keyword arguments for the cnx_loader pre-training run. No eval/test
# files are configured here; the statements after this dictionary replace the
# checkpoint, training file, label file and task name.
args = {
    "num_epochs": 30,
    "num_gpus": [0, 1],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    "train_file": "./data/weibonew/train.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "cnx_loader",
    "batch_size": 16,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,
    "model_name": "Bert",
    "task_name": "Weibo_CNX_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Mapping from tag identifiers to Chinese descriptions.
    "tag_rules": {
        "PRODUCT": "产品",
        "FAC": "场地",
        "ORDINAL": "排名",
        "QUANTITY": "无单位数量",
        "CARDINAL": "有单位数量",
        "EVENT": "事件",
        "MONEY": "金额",
        "DATE": "日期",
        "LOC": "地名",
        "WORK_OF_ART": "作品",
        "NORP": "政体,民族或宗教",
        "ORG": "机构",
        "PERCENT": "百分数",
        "LANGUAGE": "语言",
        "GPE": "政体",
        "PERSON": "人名",
        "LAW": "法文",
        "TIME": "时间",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Point the configuration at the Ontonotes5_02_Pretrained task: checkpoint,
# training file, label file and task name.
args.update(
    pretrained_file_name="./save_pretrained/Ontonotes5_02_Pretrained/Bert_2886/pytorch_model.bin",
    train_file="./data/ontonotes5/train_02_1.json",
    tag_file="./data/ontonotes5/ontonotes5_pretrained_labels.txt",
    task_name="Ontonotes5_02_Pretrained",
)

# Build the pre-trainer from the updated configuration.
pre_trainer = NERPreTrainer(**args)

# Exhaust the iterable returned by the pre-trainer (called with lr=1e-5),
# keeping the last yielded item in `a`.
pretraining_run = pre_trainer(lr=1e-5)
for step_result in pretraining_run:
    a = step_result

## Weibo 预训练x纠错

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments for the Weibo pre-training run that uses labellex_loader
# and the expanded training file train_5x.json.
args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Dataset files (Weibo)
    "train_file": "./data/weibo/train_5x.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "labellex_loader",
    # Lexicon resources under ./data/tencent
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "task_name": "Weibo_Full20xC_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Mapping from tag identifiers to Chinese descriptions.
    "tag_rules": {
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from `args` and exhaust the iterable it returns,
# keeping the last yielded item in `a`.
pre_trainer = NERPreTrainer(**args)

pretraining_run = pre_trainer()
for step_result in pretraining_run:
    a = step_result

## 数据集扩展

In [None]:
from CC.loaders import *

# Chain: construct a LabelLoader with automatic loading switched off, read the
# Weibo training set, process it with the argument 5, and write the result to
# ./train_5x.json. `loader` is bound to whatever `to_file` returns.
# NOTE(review): 5 is presumably the expansion factor (it matches
# `expansion_rate`) - confirm against CC.loaders.
loader = (
    LabelLoader(
        auto_loader=False,
        debug=True,
        file_name="./data/weibo/train.json",
        random_rate=1.0,
        expansion_rate=5,
    )
    .read_data_set("./data/weibo/train.json", 1.0)
    .process_data(5)
    .to_file("./train_5x.json")
)

#### Fine-tune Bert+LSTM+CRF(Baseline)

In [None]:
# Keyword arguments for the Bert baseline fine-tuning run (loader: cn_loader).
# The dataset paths default to IMCS-NER; the next cell overrides them.
args = {
    "model_name": "Bert",
    "loader_name": "cn_loader",
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    "hidden_dim": 300,
    # Files under ./model/chinese_wwm_ext
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Dataset files; eval and test both point at the dev file.
    "train_file": "./data/IMCS-NER/IMCS-V2_train_format.json",
    "eval_file": "./data/IMCS-NER/IMCS-V2_dev_format.json",
    "test_file": "./data/IMCS-NER/IMCS-V2_dev_format.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/IMCS-NER/IMCS-V2_labels.json",
    "batch_size": 64,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,
    "output_eval": True,
    "task_name": "IMCS-NER",
    "use_gpu": True,
    "debug": True,
}

In [None]:
# Switch the baseline configuration to the Medical dataset. Eval and test both
# point at dev.json, and `word_tag_split` is set to a tab character.
args.update(
    train_file="./data/Medical/train.json",
    eval_file="./data/Medical/dev.json",
    test_file="./data/Medical/dev.json",
    tag_file="./data/Medical/tags_list.txt",
    word_tag_split="\t",
    use_json=True,
    task_name="Medical",
)

In [None]:
from CC.trainer import NERTrainer
import torch
# Seed PyTorch's random number generators before the trainer is built:
# first the default generator, then every CUDA device.
seed = 2
for seed_rng in (torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed)

# Build the trainer from `args` and exhaust the iterable it returns, keeping
# the last yielded item in `a`.
trainer = NERTrainer(**args)

training_run = trainer()
for step_result in training_run:
    a = step_result

##### Bert+BiLSTM+CRF 预测示例代码

In [None]:
# Keyword arguments for the Bert predictor (loader: cn_loader). The two
# model-file entries start empty and are filled in by the next cell.
args = {
    "use_gpu": True,
    "loader_name": "cn_loader",
    "model_name": "Bert",
    "lstm_crf_model_file": "",
    "bert_model_file": "",
    "hidden_dim": 300,
    # Dataset files; eval and test both point at the dev file.
    "train_file": "./data/IMCS-NER/IMCS-V2_train_format.json",
    "eval_file": "./data/IMCS-NER/IMCS-V2_dev_format.json",
    "test_file": "./data/IMCS-NER/IMCS-V2_dev_format.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "use_json": True,
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "tag_file": "./data/IMCS-NER/IMCS-V2_labels.json",
    "max_seq_length": 150,
    "padding_length": 150,
    "num_gpus": [0, 1, 2, 3],
}


In [None]:
# Switch the predictor configuration to the Medical dataset and fill in the
# saved Bert and lstm_crf model files (both at step 6669).
args.update(
    train_file="./data/Medical/train.json",
    eval_file="./data/Medical/dev.json",
    test_file="./data/Medical/dev.json",
    tag_file="./data/Medical/tags_list.txt",
    word_tag_split="\t",
    use_json=True,
    bert_model_file="./save_model/Medical/Bert/Bert_6669.pth",
    lstm_crf_model_file="./save_model/Medical/lstm_crf/lstm_crf_6669.pth",
)

In [None]:
from CC.predicter import NERPredict
import torch

# Instantiate the predictor, passing every entry of `args` as a keyword argument.
predict = NERPredict(**args)

In [None]:
# Smoke test: run the predictor on a few short sentences. The call stays the
# cell's last expression so the notebook displays its result.
sample_sentences = [
    '你好,他有点感冒了',
    '你好,我叫什么名字',
    '我们看看吧',
    '你好',
]
predict(sample_sentences)

In [None]:
import json
from tqdm import tqdm

# Annotate every CHIP-STS dev pair with the predictor's output for `text1` and
# `text2`, then write one JSON object per line to the output file.

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open('./data/CHIP-STS/CHIP-STS_dev.json', encoding='utf-8') as f:
    ori_json = json.load(f)

batch_size = 512
# Ceiling division: the former `len // batch_size + 1` produced a trailing
# empty batch whenever the record count was an exact multiple of `batch_size`
# (including zero records), which sent an empty list to `predict`.
total_batch = (len(ori_json) + batch_size - 1) // batch_size

result_1 = []
result_2 = []
for batch_idx in tqdm(range(total_batch)):
    range_json = ori_json[batch_idx * batch_size:(batch_idx + 1) * batch_size]
    text_1_list = [pair['text1'] for pair in range_json]
    text_2_list = [pair['text2'] for pair in range_json]
    result_1 += predict(text_1_list)
    result_2 += predict(text_2_list)

# Each record needs exactly one prediction per text; fail with a clear message
# rather than an IndexError in the loops below.
if not (len(result_1) == len(result_2) == len(ori_json)):
    raise ValueError(
        'prediction count mismatch: {} records, {} text1 results, {} text2 results'.format(
            len(ori_json), len(result_1), len(result_2)
        )
    )

# Attach the 'text' and 'label' fields of each prediction to its source record.
for record, pred_1, pred_2 in tqdm(list(zip(ori_json, result_1, result_2))):
    record['text1_arr'] = pred_1['text']
    record['text1_label'] = pred_1['label']
    record['text2_arr'] = pred_2['text']
    record['text2_label'] = pred_2['label']

# `ensure_ascii=False` writes non-ASCII characters verbatim, so the file must
# be opened as UTF-8 explicitly.
with open('./data/CHIP-STS/CHIP-STS_dev_medical_ner.json', 'w', encoding='utf-8') as f:
    for record in tqdm(ori_json):
        f.write('{}\n'.format(json.dumps(record, ensure_ascii=False)))
