## Fine-tune

In [1]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

### Fine-tune 参数设置

适用于**le_loader**下的fine-tune任务

In [2]:
# %%
# Keyword arguments for the le_loader fine-tuning run; unpacked into
# NERTrainer(**args) further down. Key order is kept as originally written.
args = {
    # Schedule and devices.
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Base BERT configuration and the checkpoint to initialise from
    # (the checkpoint path is overridden in the tuning cell below).
    "bert_config_file_name": "./model/chinese_wwm_ext/bert_config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "hidden_dim": 300,
    "max_seq_length": 150,
    # NOTE(review): presumably caps how many lexicon entries are scanned
    # into the trie -- confirm against the loader implementation.
    "max_scan_num": 1000000,
    # Weibo dataset splits, BERT vocabulary and label list.
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/labels.txt",
    "output_eval": True,
    "loader_name": "le_loader",
    # Tencent word embedding and word vocabulary files.
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",
    # Batching.
    "batch_size": 16,
    "eval_batch_size": 64,
    "do_shuffle": True,
    # Model and task identifiers.
    "model_name": "LEBert",
    "task_name": "weibo",
}

### 参数微调

调整`Pretrained`和`task name`

In [3]:
# Per-run overrides for the fine-tune configuration defined above.
# Only the checkpoint actually changes here: train_file and task_name repeat
# the values already in `args`, kept explicit so this cell is the single place
# to switch dataset/task.
args.update({
    "pretrained_file_name": "./save_pretrained/Weibo_Full_Pretrained/Bert_1290/pytorch_model.bin",
    "train_file": "./data/weibo/train.json",
    "task_name": "weibo",
})

In [4]:
# Build the fine-tuning trainer from the `args` dict configured in the cells above.
trainer = NERTrainer(**args)

# Calling the trainer with lr2=1e-2 returns an iterable; this loop only
# exhausts it, leaving the last yielded item in `a`.
# NOTE(review): presumably each iteration advances training and yields the
# current results -- confirm against CC.trainer.NERTrainer.
for i in trainer(lr2=1e-2):
    a = i

kwargs parser: {
    "batch_size": 16,
    "eval_batch_size": 64,
    "test_batch_size": 16,
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "tag_file": "./data/weibo/labels.txt",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "output_eval": true,
    "max_scan_num": 1000000,
    "add_seq_vocab": false,
    "max_seq_length": 150,
    "max_word_num": 5,
    "default_tag": "O",
    "use_test": false,
    "do_shuffle": true,
    "do_predict": false,
    "task_name": "weibo"
}


load vocabs into trie: 100%|██████████| 1000000/1000000 [00:01<00:00, 672594.07it/s]
build trie: 100%|██████████| 1000000/1000000 [00:10<00:00, 94302.48it/s]
load dataset matched word: 100%|██████████| 1351/1351 [00:00<00:00, 4693.81it/s]
load dataset matched word: 100%|██████████| 271/271 [00:00<00:00, 4760.01it/s]
load dataset matched word: 100%|██████████| 271/271 [00:00<00:00, 4658.29it/s]
load vocab from list: 100%|██████████| 28457/28457 [00:00<00:00, 730577.14it/s]
load vocab from files: 100%|██████████| 28/28 [00:00<00:00, 93059.04it/s]
load vocab from list: 100%|██████████| 28/28 [00:00<00:00, 233016.89it/s]
load dataset from ./data/weibo/train.json: 100%|██████████| 1351/1351 [00:00<00:00, 1379.04it/s]
load dataset from ./data/weibo/dev.json: 100%|██████████| 271/271 [00:00<00:00, 1417.26it/s]
Some weights of the model checkpoint at ./save_pretrained/Weibo_Full_Pretrained/Bert_1290/pytorch_model.bin were not used when initializing LEBertModel: ['cls.predictions.transform.Laye

Load pretrained embedding from file.........
Accessing Resume PATH: ./save_model/weibo step: 5070 ...



Epoch: 1/30 Train: 100%|██████████| 1/1 [00:09<00:00,  9.30s/it, F1=0.19, train_acc=0.949, train_loss=15.7, train_precision=0.286, train_recall=0.143]
Eval Result: 100%|██████████| 1/1 [00:00<00:00,  3.72it/s, F1=0.235, eval_acc=0.946, eval_loss=6.91, eval_precision=0.4, eval_recall=0.167]
Epoch: 2/30 Train: 100%|██████████| 1/1 [00:00<00:00,  2.03it/s, F1=0.125, train_acc=0.957, train_loss=14.8, train_precision=0.5, train_recall=0.0714]
Eval Result: 100%|██████████| 1/1 [00:00<00:00,  3.97it/s, F1=0.235, eval_acc=0.946, eval_loss=6.91, eval_precision=0.4, eval_recall=0.167]
Epoch: 3/30 Train: 100%|██████████| 1/1 [00:00<00:00,  2.05it/s, F1=0.211, train_acc=0.954, train_loss=16.1, train_precision=0.4, train_recall=0.143]
Eval Result: 100%|██████████| 1/1 [00:00<00:00,  4.00it/s, F1=0.235, eval_acc=0.946, eval_loss=6.91, eval_precision=0.4, eval_recall=0.167]
Epoch: 4/30 Train: 100%|██████████| 1/1 [00:00<00:00,  2.08it/s, F1=0.25, train_acc=0.958, train_loss=17, train_precision=1, tra

## Super NER的预训练

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments for the SuperNER pre-training run (lex_loader); unpacked
# into NERPreTrainer(**args) below. Key order is kept as originally written.
args = {
    # Schedule and devices.
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Base BERT configuration and initial checkpoint.
    "bert_config_file_name": "./model/chinese_wwm_ext/bert_config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1500000,
    # SuperNER dataset splits, BERT vocabulary and tag list.
    "train_file": "./data/SuperNER/pre_train.json",
    "eval_file": "./data/SuperNER/pre_dev.json",
    "test_file": "./data/SuperNER/pre_test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/SuperNER/tags_list.txt",
    "loader_name": "lex_loader",
    # Tencent word embedding and word vocabulary files.
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching.
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    # Task identifier and runtime switches.
    "task_name": "Pre_trained",
    "use_gpu": True,
    "debug": True,
    # Tag name -> Chinese description.
    # NOTE(review): how the loader consumes these descriptions is not visible
    # here -- confirm against the lex_loader implementation.
    "tag_rules": {
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the SuperNER `args` dict defined just above.
pre_trainer = NERPreTrainer(**args)

# Calling the pre-trainer returns an iterable; this loop only exhausts it,
# leaving the last yielded item in `a`.
# NOTE(review): presumably each iteration advances pre-training -- confirm
# against CC.pre_trained.NERPreTrainer.
for i in pre_trainer():
    a = i

## Weibo的预训练
loader: lex_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments for the Weibo pre-training run (lex_loader); unpacked
# into NERPreTrainer(**args) below. Key order is kept as originally written.
args = {
    # Schedule and devices.
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Base BERT configuration and initial checkpoint.
    "bert_config_file_name": "./model/chinese_wwm_ext/bert_config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Weibo dataset splits, BERT vocabulary and pre-training label list.
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "lex_loader",
    # Tencent word embedding and word vocabulary files.
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching.
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    # Task identifier and runtime switches.
    "task_name": "Weibo_Full_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Tag name -> Chinese description.
    # NOTE(review): how the loader consumes these descriptions is not visible
    # here -- confirm against the lex_loader implementation.
    "tag_rules": {
        # Weibo-style tags (TYPE.NAM / TYPE.NOM).
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        # Same entries as the SuperNER configuration's tag_rules.
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the Weibo `args` dict defined just above.
pre_trainer = NERPreTrainer(**args)

# Calling the pre-trainer returns an iterable; this loop only exhausts it,
# leaving the last yielded item in `a`.
# NOTE(review): presumably each iteration advances pre-training -- confirm
# against CC.pre_trained.NERPreTrainer.
for i in pre_trainer():
    a = i

## 适用于`CONLL`的预训练
loader: cnx_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments for the cnx_loader pre-training run; unpacked into
# NERPreTrainer(**args) below. Unlike the lex_loader configurations this one
# sets no eval/test files and no word-embedding files. Key order is kept as
# originally written.
args = {
    # Schedule and devices.
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Base BERT configuration and initial checkpoint.
    "bert_config_file_name": "./model/chinese_wwm_ext/bert_config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Training data, BERT vocabulary and pre-training label list.
    "train_file": "./data/weibonew/train.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "cnx_loader",
    # Batching.
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,
    # Model/task identifiers and runtime switches.
    "model_name": "Bert",
    "task_name": "Weibo_CNX_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Tag name -> Chinese description.
    # NOTE(review): how the loader consumes these descriptions is not visible
    # here -- confirm against the cnx_loader implementation.
    "tag_rules": {
        # Weibo-style tags (TYPE.NAM / TYPE.NOM).
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        # Same entries as the SuperNER configuration's tag_rules.
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the cnx_loader `args` dict defined just above.
pre_trainer = NERPreTrainer(**args)

# Calling the pre-trainer returns an iterable; this loop only exhausts it,
# leaving the last yielded item in `a`.
# NOTE(review): presumably each iteration advances pre-training -- confirm
# against CC.pre_trained.NERPreTrainer.
for i in pre_trainer():
    a = i

## Weibo 预训练x纠错

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Keyword arguments for the Weibo 20x pre-training run (labellex_loader);
# unpacked into NERPreTrainer(**args) below. Key order is kept as originally
# written.
args = {
    # Schedule and devices.
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Base BERT configuration and initial checkpoint.
    "bert_config_file_name": "./model/chinese_wwm_ext/bert_config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # NOTE(review): train_20x.json is presumably the expanded set generated by
    # the dataset-expansion cell at the end of this notebook; it must exist at
    # this path before this cell is run.
    "train_file": "./data/weibo/train_20x.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "loader_name": "labellex_loader",
    # Tencent word embedding and word vocabulary files.
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    # Batching.
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    # Task identifier and runtime switches.
    "task_name": "Weibo_Full20xC_Pretrained",
    "use_gpu": True,
    "debug": True,
    # Tag name -> Chinese description.
    # NOTE(review): how the loader consumes these descriptions is not visible
    # here -- confirm against the labellex_loader implementation.
    "tag_rules": {
        # Weibo-style tags (TYPE.NAM / TYPE.NOM).
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        # Same entries as the SuperNER configuration's tag_rules.
        "ORG": "机构",
        "LOC": "地名",
        "PER": "人名",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "度量",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the Weibo 20x `args` dict defined just above.
pre_trainer = NERPreTrainer(**args)

# Calling the pre-trainer returns an iterable; this loop only exhausts it,
# leaving the last yielded item in `a`.
# NOTE(review): presumably each iteration advances pre-training -- confirm
# against CC.pre_trained.NERPreTrainer.
for i in pre_trainer():
    a = i

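## 数据集扩展

使用 `LabelLoader` 将 `./data/weibo/train.json` 按 20 倍扩充并写出 `train_20x.json`。上文「Weibo 预训练x纠错」读取的 `train_20x.json` 应由本节生成，因此请先运行本节。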

In [None]:
from CC.loaders import *

# Expand the Weibo training set and write it to the path the
# "Weibo 预训练x纠错" configuration reads as its `train_file`
# ('./data/weibo/train_20x.json'). The previous target, './train_20x.json',
# left that path missing on a fresh run.
#
# The pipeline is driven explicitly: read the source file, expand it, write it.
# The values passed to read_data_set (1.0) and process_data (20) repeat the
# constructor's random_rate and expansion_rate.
# NOTE(review): "auto_loader": False presumably stops the constructor from
# loading on its own -- confirm against CC.loaders.LabelLoader.
# `loader` ends up bound to whatever to_file() returns.
loader = (
    LabelLoader(**{
        "auto_loader": False,
        "debug": True,
        "file_name": "./data/weibo/train.json",
        "random_rate": 1.0,
        "expansion_rate": 20
    })
    .read_data_set("./data/weibo/train.json", 1.0)
    .process_data(20)
    .to_file("./data/weibo/train_20x.json")
)