## Fine-tune

In [1]:
import json
from CC.predicter import NERPredict
from CC.trainer import NERTrainer

### Fine-tune 参数设置

适用于**le_loader**下的fine-tune任务

In [2]:
# Base fine-tuning configuration for the le_loader / LEBert run on Weibo.
# The dict is unpacked into NERTrainer(**args) further down; the meaning of
# each key is defined by CC.trainer, which is not visible in this notebook.
args = {
    # Schedule and devices (num_gpus is presumably a list of GPU ids -- confirm).
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],

    # Base checkpoint: chinese_wwm_ext config and weights.
    "bert_config_file_name": "./model/chinese_wwm_ext/bert_config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",

    # Model / sequence sizes.
    "hidden_dim": 300,
    "max_seq_length": 150,
    "max_scan_num": 1000000,

    # Weibo data splits, BERT vocabulary and label list.
    "train_file": "./data/weibo/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/labels.txt",
    "output_eval": True,

    # Loader selection and the Tencent word-embedding resources it is given.
    "loader_name": "le_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "default_tag": "O",

    # Batching.
    "batch_size": 8,
    "eval_batch_size": 64,
    "do_shuffle": True,

    # Model and run identifiers.
    "model_name": "LEBert",
    "task_name": "weibo",
}

### 参数微调

调整 `pretrained_file_name`（预训练权重）、`train_file`（训练集）和 `task_name`（任务名）

In [3]:
# Per-run overrides applied on top of the base config: swap in the
# Weibo_x20_Pretrained checkpoint, the weibo_yfy training file and a new
# task name. All three keys already exist in `args`, so only values change.
args.update(
    pretrained_file_name="./save_pretrained/Weibo_x20_Pretrained/Bert_2024/pytorch_model.bin",
    train_file="./data/weibo_yfy/train_origin.json",
    task_name="weibo_new_02_yfy",
)

In [4]:
# Build the trainer from the config dict: every key in `args` is forwarded
# as a keyword argument.
trainer = NERTrainer(**args)

# Iterate whatever trainer() returns until it is exhausted. `a` only keeps the
# most recently yielded item (and stays undefined if nothing is yielded).
# NOTE(review): what each yielded item contains, and whether iterating is what
# actually drives training, is defined in CC.trainer -- confirm there.
for i in trainer():
    a = i

kwargs parser: {
    "batch_size": 8,
    "eval_batch_size": 64,
    "test_batch_size": 16,
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "train_file": "./data/weibo_yfy/train_origin.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "tag_file": "./data/weibo/labels.txt",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "output_eval": true,
    "max_scan_num": 1000000,
    "add_seq_vocab": false,
    "max_seq_length": 150,
    "max_word_num": 5,
    "default_tag": "O",
    "use_test": false,
    "do_shuffle": true,
    "do_predict": false,
    "task_name": "weibo_new_02_yfy"
}


load vocabs into trie: 100%|██████████| 1000000/1000000 [00:01<00:00, 657144.66it/s]
build trie: 100%|██████████| 1000000/1000000 [00:09<00:00, 105574.83it/s]
load dataset matched word: 100%|██████████| 270/270 [00:00<00:00, 4667.14it/s]
load dataset matched word: 100%|██████████| 271/271 [00:00<00:00, 4855.08it/s]
load dataset matched word: 100%|██████████| 271/271 [00:00<00:00, 4749.27it/s]
load vocab from list: 100%|██████████| 16453/16453 [00:00<00:00, 726845.41it/s]
load vocab from files: 100%|██████████| 28/28 [00:00<00:00, 93206.76it/s]
load vocab from list: 100%|██████████| 28/28 [00:00<00:00, 285049.79it/s]
load dataset from ./data/weibo_yfy/train_origin.json: 100%|██████████| 270/270 [00:00<00:00, 1379.70it/s]
load dataset from ./data/weibo/dev.json: 100%|██████████| 271/271 [00:00<00:00, 1389.37it/s]


Load pretrained embedding from file.........


Some weights of the model checkpoint at ./save_pretrained/Weibo_x20_Pretrained/Bert_2024/pytorch_model.bin were not used when initializing LEBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing LEBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LEBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LEBertModel were not initialized from the model checkpoint at ./save_pretrained/Weibo_x20_Pretrained/Bert_2024/p

## Super NER的预训练

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Pre-training configuration for the SuperNER data with lex_loader.
# The dict is unpacked into NERPreTrainer(**args) below; the meaning of each
# key is defined by CC.pre_trained, which is not visible in this notebook.
args = {
    # Schedule and devices (num_gpus is presumably a list of GPU ids -- confirm).
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],

    # Base checkpoint: chinese_wwm_ext config and weights.
    "bert_config_file_name": "./model/chinese_wwm_ext/bert_config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",

    # Sequence / scan limits.
    "max_seq_length": 512,
    "max_scan_num": 1500000,

    # SuperNER data splits, BERT vocabulary and tag list.
    "train_file": "./data/SuperNER/pre_train.json",
    "eval_file": "./data/SuperNER/pre_dev.json",
    "test_file": "./data/SuperNER/pre_test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/SuperNER/tags_list.txt",

    # Loader selection and the Tencent lexicon resources it is given.
    "loader_name": "lex_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",

    # Batching.
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,

    # Run identifier and runtime switches.
    "task_name": "Pre_trained",
    "use_gpu": True,
    "debug": True,

    # Tag name -> Chinese label string. How NERPreTrainer consumes this
    # mapping is not visible here.
    "tag_rules": {
        # Capitalised tag names.
        "ORG": "组织",
        "LOC": "地点",
        "PER": "人",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "测量单位",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        # Lower-case tag names.
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the config dict: every key in `args` is
# forwarded as a keyword argument.
pre_trainer = NERPreTrainer(**args)

# Iterate whatever pre_trainer() returns until it is exhausted. `a` only keeps
# the most recently yielded item (and stays undefined if nothing is yielded).
# NOTE(review): what each yielded item contains, and whether iterating is what
# actually drives pre-training, is defined in CC.pre_trained -- confirm there.
for i in pre_trainer():
    a = i

## Weibo的预训练
loader: lex_loader

In [1]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [2]:
# Pre-training configuration for the Weibo data with lex_loader.
# The dict is unpacked into NERPreTrainer(**args) below; the meaning of each
# key is defined by CC.pre_trained, which is not visible in this notebook.
args = {
    # Schedule and devices (num_gpus is presumably a list of GPU ids -- confirm).
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],

    # Base checkpoint: chinese_wwm_ext config and weights.
    "bert_config_file_name": "./model/chinese_wwm_ext/bert_config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",

    # Sequence / scan limits.
    "max_seq_length": 512,
    "max_scan_num": 1000000,

    # Training file comes from ./data/weibonew, dev/test from ./data/weibo.
    "train_file": "./data/weibonew/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",

    # Loader selection and the Tencent lexicon resources it is given.
    "loader_name": "lex_loader",
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",

    # Batching.
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,

    # Run identifier and runtime switches.
    "task_name": "Weibo_x20_Pretrained",
    "use_gpu": True,
    "debug": True,

    # Tag name -> Chinese label string. How NERPreTrainer consumes this
    # mapping is not visible here.
    "tag_rules": {
        # Tags carrying a .NAM / .NOM suffix.
        "PER.NOM": "人的象征",
        "LOC.NAM": "地点",
        "PER.NAM": "人",
        "GPE.NAM": "政治实体",
        "ORG.NAM": "组织",
        "ORG.NOM": "组织的象征",
        "LOC.NOM": "地点的象征",
        "GPE.NOM": "政治实体的象征",
        # Capitalised tag names without a suffix.
        "ORG": "组织",
        "LOC": "地点",
        "PER": "人",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "测量单位",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        # Lower-case tag names.
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the config dict: every key in `args` is
# forwarded as a keyword argument.
pre_trainer = NERPreTrainer(**args)

# Iterate whatever pre_trainer() returns until it is exhausted. `a` only keeps
# the most recently yielded item (and stays undefined if nothing is yielded).
# NOTE(review): what each yielded item contains, and whether iterating is what
# actually drives pre-training, is defined in CC.pre_trained -- confirm there.
for i in pre_trainer():
    a = i

kwargs parser: {
    "batch_size": 32,
    "eval_batch_size": 64,
    "test_batch_size": 16,
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "train_file": "./data/weibonew/train.json",
    "eval_file": "./data/weibo/dev.json",
    "test_file": "./data/weibo/test.json",
    "tag_file": "./data/weibo/pretrained_labels.txt",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "output_eval": true,
    "max_scan_num": 1000000,
    "add_seq_vocab": false,
    "max_seq_length": 512,
    "max_word_num": 5,
    "default_tag": "O",
    "use_test": false,
    "do_shuffle": true,
    "do_predict": false,
    "lexicon_tree_cache_path": null,
    "word_vacab_cache_path": null,
    "task_name": "Weibo_x20_Pretrained",
    "tag_rules": {
        "PER.NOM": "人的象征",
        "LOC.NAM": "地点",
        "PER.NAM": "人",
        "GPE.NAM": "政治实

load vocabs into trie: 100%|██████████| 1000000/1000000 [00:01<00:00, 657181.11it/s]
build trie: 100%|██████████| 1000000/1000000 [00:09<00:00, 101629.29it/s]
load dataset matched word: 100%|██████████| 2915/2915 [00:00<00:00, 4267.35it/s]
load dataset matched word: 100%|██████████| 271/271 [00:00<00:00, 4726.42it/s]
load dataset matched word: 100%|██████████| 271/271 [00:00<00:00, 4699.84it/s]
load vocab from files: 100%|██████████| 1000000/1000000 [00:07<00:00, 142470.87it/s]
load vocab from list: 100%|██████████| 1000000/1000000 [00:02<00:00, 371387.72it/s]
load vocab from list: 100%|██████████| 17670/17670 [00:00<00:00, 437610.94it/s]
load vocab from files: 100%|██████████| 65/65 [00:00<00:00, 258172.12it/s]
load vocab from list: 100%|██████████| 65/65 [00:00<00:00, 416865.08it/s]
load dataset from ./data/weibonew/train.json: 100%|██████████| 2915/2915 [00:02<00:00, 1004.80it/s]
load dataset from ./data/weibo/dev.json: 100%|██████████| 271/271 [00:00<00:00, 1180.15it/s]
Some weight

## 适用于`CONLL`的预训练
loader: cnx_loader

In [None]:
from CC.loaders import *
import pickle
from tqdm import *
from CC.loaders.utils import *
import json
from CC.pre_trained import NERPreTrainer

In [None]:
# Pre-training configuration for the cnx_loader variant.
# The dict is unpacked into NERPreTrainer(**args) below; the meaning of each
# key is defined by CC.pre_trained, which is not visible in this notebook.
# Unlike the lex_loader configs, no eval/test files or lexicon files are set.
args = {
    # Schedule and devices (num_gpus is presumably a list of GPU ids -- confirm).
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],

    # Base checkpoint: chinese_wwm_ext config and weights.
    "bert_config_file_name": "./model/chinese_wwm_ext/bert_config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",

    # Sequence / scan limits.
    "max_seq_length": 512,
    "max_scan_num": 1000000,

    # Training data, BERT vocabulary and tag list.
    "train_file": "./data/weibonew/train.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "./data/weibo/pretrained_labels.txt",

    # Loader selection and batching.
    "loader_name": "cnx_loader",
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,

    # Model / run identifiers and runtime switches.
    "model_name": "Bert",
    "task_name": "Weibo_CNX_Pretrained",
    "use_gpu": True,
    "debug": True,

    # Tag name -> Chinese label string. How NERPreTrainer consumes this
    # mapping is not visible here.
    "tag_rules": {
        # Tags carrying a .NAM / .NOM suffix.
        "PER.NOM": "人的象征",
        "LOC.NAM": "地点",
        "PER.NAM": "人",
        "GPE.NAM": "政治实体",
        "ORG.NAM": "组织",
        "ORG.NOM": "组织的象征",
        "LOC.NOM": "地点的象征",
        "GPE.NOM": "政治实体的象征",
        # Capitalised tag names without a suffix.
        "ORG": "组织",
        "LOC": "地点",
        "PER": "人",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "测量单位",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        # Lower-case tag names.
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

# Build the pre-trainer from the config dict: every key in `args` is
# forwarded as a keyword argument.
pre_trainer = NERPreTrainer(**args)

# Iterate whatever pre_trainer() returns until it is exhausted. `a` only keeps
# the most recently yielded item (and stays undefined if nothing is yielded).
# NOTE(review): what each yielded item contains, and whether iterating is what
# actually drives pre-training, is defined in CC.pre_trained -- confirm there.
for i in pre_trainer():
    a = i