### 小样本实验
数据规模：0.25k 0.5k 1k 1.35k  
抽样方式：随机抽样，设置统一的样本种子   
结果：每个模型训练3次，取最好的一次作为结果

#### 数据构造

In [None]:
# Create the root directory that holds every few-shot dataset copy.
import os

path = "data/few_shot"
# exist_ok=True keeps the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)
print("done!")

##### 微博数据

In [None]:
# Copy the Weibo dataset files into the few-shot workspace.
from shutil import copyfile
import os

file_names = ["train.json","dev.json","test.json","labels.txt"]

src_dir = "data/weibo"
dst_dir = "data/few_shot/weibo"

# exist_ok=True makes the cell idempotent without a separate existence check.
os.makedirs(dst_dir, exist_ok=True)
for file_name in file_names:
    copyfile(f"{src_dir}/{file_name}", f"{dst_dir}/{file_name}")

In [None]:
# Draw fixed-size random subsets of the Weibo training set.
import json
import random

from tqdm import tqdm

# Sizes of the few-shot training subsets to generate.
postfix = [250,500,1000,1350]

# Load the original training set (one JSON object per line).
train_file = "data/few_shot/weibo/train.json"
train_data = []

with open(train_file,"r",encoding="utf-8") as reader:
    for line in tqdm(reader,desc=f"load {train_file}"):
        line = line.strip()
        if not line:
            # A blank line would make json.loads raise; skip it instead.
            continue
        sample = json.loads(line)
        text,label = sample["text"],sample["label"]
        # text and label must have the same length.
        assert len(text)==len(label)
        if len(text)>0:
            train_data.append(sample)
        else:
            # Report and drop empty samples.
            print(text,label)

# Write one subset file per requested size.
for post in postfix:
    # Re-seed before every draw so each subset is reproducible on its own.
    random.seed(2021)
    # random.sample returns exactly `post` items or raises ValueError.
    subset = random.sample(train_data,post)
    # The with-block flushes and closes the file on exit.
    with open(f"data/few_shot/weibo/train_{post}.json","w",encoding="utf-8") as f:
        for sample in tqdm(subset,desc=f"{post}"):
            f.write(f"{json.dumps(sample,ensure_ascii=False)}\n")

##### OntoNotes4 数据

In [None]:
# Copy the OntoNotes4 dataset files into the few-shot workspace.
from shutil import copyfile
import os

file_names = ["train.json","dev.json","test.json","labels.txt"]

src_dir = "data/lebert/dataset/NER/note4"
dst_dir = "data/few_shot/note4"

# exist_ok=True makes the cell idempotent without a separate existence check.
os.makedirs(dst_dir, exist_ok=True)
for file_name in file_names:
    copyfile(f"{src_dir}/{file_name}", f"{dst_dir}/{file_name}")

In [None]:
# Draw fixed-size random subsets of the OntoNotes4 training set.
import json
import random

from tqdm import tqdm

# Sizes of the few-shot training subsets to generate.
postfix = [250,500,1000,1350]

# Load the original training set (one JSON object per line).
train_file = "data/few_shot/note4/train.json"
train_data = []

with open(train_file,"r",encoding="utf-8") as reader:
    for line in tqdm(reader,desc=f"load {train_file}"):
        line = line.strip()
        if not line:
            # A blank line would make json.loads raise; skip it instead.
            continue
        sample = json.loads(line)
        text,label = sample["text"],sample["label"]
        # text and label must have the same length.
        assert len(text)==len(label)
        if len(text)>0:
            train_data.append(sample)
        else:
            # Report and drop empty samples.
            print(text,label)

# Write one subset file per requested size.
for post in postfix:
    # Re-seed before every draw so each subset is reproducible on its own.
    random.seed(2021)
    # random.sample returns exactly `post` items or raises ValueError.
    subset = random.sample(train_data,post)
    # The with-block flushes and closes the file on exit.
    with open(f"data/few_shot/note4/train_{post}.json","w",encoding="utf-8") as f:
        for sample in tqdm(subset,desc=f"{post}"):
            f.write(f"{json.dumps(sample,ensure_ascii=False)}\n")

##### MSRA 数据

In [None]:
# Copy the MSRA dataset files into the few-shot workspace.
from shutil import copyfile
import os

file_names = ["train.json","dev.json","test.json","labels.txt"]

src_dir = "data/lebert/dataset/NER/msra"
dst_dir = "data/few_shot/msra"

# exist_ok=True makes the cell idempotent without a separate existence check.
os.makedirs(dst_dir, exist_ok=True)
for file_name in file_names:
    copyfile(f"{src_dir}/{file_name}", f"{dst_dir}/{file_name}")

In [None]:
# Draw fixed-size random subsets of the MSRA training set.
import json
import random

from tqdm import tqdm

# Sizes of the few-shot training subsets to generate.
postfix = [250,500,1000,1350]

# Load the original training set (one JSON object per line).
train_file = "data/few_shot/msra/train.json"
train_data = []

with open(train_file,"r",encoding="utf-8") as reader:
    for line in tqdm(reader,desc=f"load {train_file}"):
        line = line.strip()
        if not line:
            # A blank line would make json.loads raise; skip it instead.
            continue
        sample = json.loads(line)
        text,label = sample["text"],sample["label"]
        # text and label must have the same length.
        assert len(text)==len(label)
        if len(text)>0:
            train_data.append(sample)
        else:
            # Report and drop empty samples.
            print(text,label)

# Write one subset file per requested size.
for post in postfix:
    # Re-seed before every draw so each subset is reproducible on its own.
    random.seed(2021)
    # random.sample returns exactly `post` items or raises ValueError.
    subset = random.sample(train_data,post)
    # The with-block flushes and closes the file on exit.
    with open(f"data/few_shot/msra/train_{post}.json","w",encoding="utf-8") as f:
        for sample in tqdm(subset,desc=f"{post}"):
            f.write(f"{json.dumps(sample,ensure_ascii=False)}\n")

##### Resume数据

In [None]:
# Copy the Resume dataset files into the few-shot workspace.
from shutil import copyfile
import os

file_names = ["train.json","dev.json","test.json","labels.txt"]

src_dir = "data/lebert/dataset/NER/resume"
dst_dir = "data/few_shot/resume"

# exist_ok=True makes the cell idempotent without a separate existence check.
os.makedirs(dst_dir, exist_ok=True)
for file_name in file_names:
    copyfile(f"{src_dir}/{file_name}", f"{dst_dir}/{file_name}")

In [None]:
# Draw fixed-size random subsets of the Resume training set.
import json
import random

from tqdm import tqdm

# Sizes of the few-shot training subsets to generate.
postfix = [250,500,1000,1350]

# Load the original training set (one JSON object per line).
train_file = "data/few_shot/resume/train.json"
train_data = []

with open(train_file,"r",encoding="utf-8") as reader:
    for line in tqdm(reader,desc=f"load {train_file}"):
        line = line.strip()
        if not line:
            # A blank line would make json.loads raise; skip it instead.
            continue
        sample = json.loads(line)
        text,label = sample["text"],sample["label"]
        # text and label must have the same length.
        assert len(text)==len(label)
        if len(text)>0:
            train_data.append(sample)
        else:
            # Report and drop empty samples.
            print(text,label)

# Write one subset file per requested size.
for post in postfix:
    # Re-seed before every draw so each subset is reproducible on its own.
    random.seed(2021)
    # random.sample returns exactly `post` items or raises ValueError.
    subset = random.sample(train_data,post)
    # The with-block flushes and closes the file on exit.
    with open(f"data/few_shot/resume/train_{post}.json","w",encoding="utf-8") as f:
        for sample in tqdm(subset,desc=f"{post}"):
            f.write(f"{json.dumps(sample,ensure_ascii=False)}\n")

#### Bert+LSTM+CRF(Baseline)

In [None]:
# Base keyword arguments for the Bert baseline runs.
# The original literal listed "model_name" twice with the same value; the
# duplicate entry is dropped, which leaves the resulting dict unchanged.
model_args = {
    "model_name": "Bert",
    "loader": "cn_loader",
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    "hidden_dim": 512,
    # Pretrained BERT checkpoint and its configuration.
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Default dataset files (Weibo, 250 samples).
    "train_file": "data/few_shot/weibo/train_250.json",
    "eval_file": "data/few_shot/weibo/dev.json",
    "test_file": "data/few_shot/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "data/few_shot/weibo/labels.txt",
    "loader_name": "cn_loader",
    "batch_size": 8,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,
    "output_eval": True,
    "task_name": "weibo_bert_crf_250_x3",
    "use_gpu": True,
    "debug": True,
}

In [None]:
from CC.trainer import NERTrainer
import torch

# Number of training runs per (dataset, subset size) pair.
per_task_count = 3

datasets = ["weibo","note4","msra","resume"]

task_trainset = [250,500,1000,1350]

# Per-run overrides; positional fields are (dataset, subset size, run tag).
tasks_args = {
    "train_file": "data/few_shot/{}/train_{}.json",
    "eval_file": "data/few_shot/{}/dev.json",
    "test_file": "data/few_shot/{}/test.json",
    "tag_file": "data/few_shot/{}/labels.txt",
    "task_name": "{}_bert_crf_{}_{}",
}

for name in datasets:
    for trainset in task_trainset:
        for i in range(per_task_count):
            run_tag = f"x{i+1}"
            # str.format ignores surplus positional arguments, so templates
            # with fewer placeholders just use the leading ones.
            model_args.update({
                key: template.format(name, trainset, run_tag)
                for key, template in tasks_args.items()
            })
            trainer = NERTrainer(**model_args)

            # Exhaust the iterator returned by train(); its items are unused.
            for _ in trainer.train():
                pass

            torch.cuda.empty_cache()


#### LEBert+LSTM+CRF

In [None]:
# Base keyword arguments for the LEBert runs.
model_args = {
    "loader": "le_loader",
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    "hidden_dim": 512,
    # Pretrained BERT checkpoint and its configuration.
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Default dataset files (Weibo, 250 samples).
    "train_file": "data/few_shot/weibo/train_250.json",
    "eval_file": "data/few_shot/weibo/dev.json",
    "test_file": "data/few_shot/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    # Word-level resources.
    "word_embedding_file": "data/tencent/word_embedding.txt",
    "tag_file": "data/few_shot/weibo/labels.txt",
    "word_vocab_file": "data/tencent/tencent_vocab.txt",
    "loader_name": "le_loader",
    "batch_size": 8,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,
    "model_name": "LEBert",
    "output_eval": True,
    "task_name": "weibo_lebert_crf_250_x3",
    "use_gpu": True,
    "debug": True,
}

In [None]:
from CC.trainer import NERTrainer
import torch

# Number of training runs per (dataset, subset size) pair.
per_task_count = 3

datasets = ["weibo","note4","msra","resume"]

task_trainset = [250,500,1000,1350]

# Per-run overrides; positional fields are (dataset, subset size, run tag).
tasks_args = {
    "train_file": "data/few_shot/{}/train_{}.json",
    "eval_file": "data/few_shot/{}/dev.json",
    "test_file": "data/few_shot/{}/test.json",
    "tag_file": "data/few_shot/{}/labels.txt",
    "task_name": "{}_lebert_crf_{}_{}",
}

for name in datasets:
    for trainset in task_trainset:
        for i in range(per_task_count):
            run_tag = f"x{i+1}"
            # str.format ignores surplus positional arguments, so templates
            # with fewer placeholders just use the leading ones.
            model_args.update({
                key: template.format(name, trainset, run_tag)
                for key, template in tasks_args.items()
            })
            trainer = NERTrainer(**model_args)

            # Exhaust the iterator returned by train(); its items are unused.
            for _ in trainer.train():
                pass

            torch.cuda.empty_cache()



#### Prompts PreTrain+LeBert FineTune

##### PreTrainer

In [None]:
# weibo: run one pre-training job per few-shot subset size.
from CC.pre_trained import NERPreTrainer
import torch

pretrain_model_args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Pretrained BERT checkpoint and its configuration.
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Dataset files; train_file is replaced for every subset size below.
    "train_file": "data/few_shot/weibo/train_250.json",
    "eval_file": "data/few_shot/weibo/dev.json",
    "test_file": "data/few_shot/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "data/few_shot/weibo/labels.txt",
    "loader_name": "lex_loader",
    # Word-level resources.
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    # task_name is replaced for every subset size below.
    "task_name": "weibo_pretrain_lebert_250_pretraind_task",
    "use_gpu": True,
    "debug": True,
    # Tag name -> Chinese description.
    "tag_rules": {
        "O": "非实体",
        "PER.NOM": "指代人名",
        "LOC.NAM": "地名",
        "PER.NAM": "人名",
        "GPE.NAM": "政体",
        "ORG.NAM": "机构",
        "ORG.NOM": "指代机构",
        "LOC.NOM": "指代地名",
        "GPE.NOM": "指代政体",
        "ORG": "组织",
        "LOC": "地点",
        "PER": "人",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "测量单位",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

trainsets = [250,500,1000,1350]

for train in trainsets:
    pretrain_model_args.update(
        train_file=f"data/few_shot/weibo/train_{train}.json",
        task_name=f"weibo_pretrain_lebert_{train}_pretraind_task",
    )

    pre_trainer = NERPreTrainer(**pretrain_model_args)

    # Exhaust the iterator returned by pre_trainer(); `a` keeps the last item.
    for a in pre_trainer():
        pass

    torch.cuda.empty_cache()

In [None]:
# note4: run one pre-training job per few-shot subset size.
from CC.pre_trained import NERPreTrainer
import torch

pretrain_model_args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Pretrained BERT checkpoint and its configuration.
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Dataset files; train_file is replaced for every subset size below.
    "train_file": "data/few_shot/note4/train_250.json",
    "eval_file": "data/few_shot/note4/dev.json",
    "test_file": "data/few_shot/note4/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "data/few_shot/note4/labels.txt",
    "loader_name": "lex_loader",
    # Word-level resources.
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    # task_name is replaced for every subset size below.
    "task_name": "note4_pretrain_lebert_250_pretraind_task",
    "use_gpu": True,
    "debug": True,
    # Tag name -> Chinese description.
    "tag_rules": {
        "O": "非实体",
        "ORG": "组织",
        "LOC": "地点",
        "PER": "人",
        "GPE": "政治",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "测量单位",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

trainsets = [250,500,1000,1350]

for train in trainsets:
    pretrain_model_args.update(
        train_file=f"data/few_shot/note4/train_{train}.json",
        task_name=f"note4_pretrain_lebert_{train}_pretraind_task",
    )

    pre_trainer = NERPreTrainer(**pretrain_model_args)

    # Exhaust the iterator returned by pre_trainer(); `a` keeps the last item.
    for a in pre_trainer():
        pass

    torch.cuda.empty_cache()

In [None]:
# msra: run one pre-training job per few-shot subset size.
from CC.pre_trained import NERPreTrainer
import torch

pretrain_model_args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Pretrained BERT checkpoint and its configuration.
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Dataset files; train_file is replaced for every subset size below.
    "train_file": "data/few_shot/msra/train_250.json",
    "eval_file": "data/few_shot/msra/dev.json",
    "test_file": "data/few_shot/msra/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "data/few_shot/msra/labels.txt",
    "loader_name": "lex_loader",
    # Word-level resources.
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    # task_name is replaced for every subset size below.
    "task_name": "msra_pretrain_lebert_250_pretraind_task",
    "use_gpu": True,
    "debug": True,
    # Tag name -> Chinese description.
    "tag_rules": {
        "O": "非实体",
        "NS": "地名",
        "NR": "人名",
        "NT": "机构团体",
        "ORG": "组织",
        "LOC": "地点",
        "PER": "人",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "测量单位",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

trainsets = [250,500,1000,1350]

for train in trainsets:
    pretrain_model_args.update(
        train_file=f"data/few_shot/msra/train_{train}.json",
        task_name=f"msra_pretrain_lebert_{train}_pretraind_task",
    )

    pre_trainer = NERPreTrainer(**pretrain_model_args)

    # Exhaust the iterator returned by pre_trainer(); `a` keeps the last item.
    for a in pre_trainer():
        pass

    torch.cuda.empty_cache()

In [None]:
# resume: run one pre-training job per few-shot subset size.
from CC.pre_trained import NERPreTrainer
import torch

pretrain_model_args = {
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    # Pretrained BERT checkpoint and its configuration.
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 512,
    "max_scan_num": 1000000,
    # Dataset files; train_file is replaced for every subset size below.
    "train_file": "data/few_shot/resume/train_250.json",
    "eval_file": "data/few_shot/resume/dev.json",
    "test_file": "data/few_shot/resume/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "data/few_shot/resume/labels.txt",
    "loader_name": "lex_loader",
    # Word-level resources.
    "word_embedding_file": "./data/tencent/word_embedding.txt",
    "word_vocab_file": "./data/tencent/tencent_vocab.txt",
    "word_vocab_file_with_tag": "./data/tencent/tencent_vocab_with_tag.json",
    "default_tag": "O",
    "batch_size": 32,
    "eval_batch_size": 64,
    "do_shuffle": True,
    # task_name is replaced for every subset size below.
    "task_name": "resume_pretrain_lebert_250_pretraind_task",
    "use_gpu": True,
    "debug": True,
    # Tag name -> Chinese description.
    "tag_rules": {
        "O": "非实体",
        "NAME": "名字",
        "CONT": "国家",
        "RACE": "种族背景",
        "TITLE": "职位",
        "EDU": "教育机构",
        "PRO": "专业",
        "ORG": "组织",
        "LOC": "地点",
        "PER": "人",
        "Time": "时间",
        "Thing": "物品",
        "Metric": "测量单位",
        "Abstract": "作品",
        "Physical": "实体",
        "Term": "术语",
        "company": "企业",
        "name": "名字",
        "game": "游戏",
        "movie": "电影",
        "position": "职位",
        "address": "地址",
        "government": "政府",
        "scene": "景点",
        "book": "书名",
    },
}

trainsets = [250,500,1000,1350]

for train in trainsets:
    pretrain_model_args.update(
        train_file=f"data/few_shot/resume/train_{train}.json",
        task_name=f"resume_pretrain_lebert_{train}_pretraind_task",
    )

    pre_trainer = NERPreTrainer(**pretrain_model_args)

    # Exhaust the iterator returned by pre_trainer(); `a` keeps the last item.
    for a in pre_trainer():
        pass

    torch.cuda.empty_cache()

In [None]:
# Base keyword arguments for fine-tuning LEBert; the loop in the next cell
# overrides the per-run entries (including pretrained_file_name).
model_args = {
    "loader": "le_loader",
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    "hidden_dim": 512,
    # BERT configuration and default checkpoint.
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    # Default dataset files (Weibo, 250 samples).
    "train_file": "data/few_shot/weibo/train_250.json",
    "eval_file": "data/few_shot/weibo/dev.json",
    "test_file": "data/few_shot/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    # Word-level resources.
    "word_embedding_file": "data/tencent/word_embedding.txt",
    "tag_file": "data/few_shot/weibo/labels.txt",
    "word_vocab_file": "data/tencent/tencent_vocab.txt",
    "loader_name": "le_loader",
    "batch_size": 8,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,
    "model_name": "LEBert",
    "output_eval": True,
    "task_name": "weibo_lebert_crf_250_x3",
    "use_gpu": True,
    "debug": True,
}

In [None]:
from CC.trainer import NERTrainer
import torch

# Number of fine-tuning runs per (dataset, subset size) pair.
per_task_count = 3

datasets = ["weibo","note4","msra","resume"]

task_trainset = [250,500,1000,1350]

# Suffix of the "Bert_{3}" checkpoint directory paired with each subset size.
# NOTE(review): presumably the final pre-training step count; confirm.
pretrain = [240,480,960,1290]

# Template fields: {0}=dataset, {1}=subset size, {2}=run tag, {3}=checkpoint suffix.
tasks_args = {
    "pretrained_file_name": "save_pretrained/{0}_pretrain_lebert_{1}_pretraind_task/Bert_{3}/pytorch_model.bin",
    "train_file": "data/few_shot/{0}/train_{1}.json",
    "eval_file": "data/few_shot/{0}/dev.json",
    "test_file": "data/few_shot/{0}/test.json",
    "tag_file": "data/few_shot/{0}/labels.txt",
    "task_name": "{0}_pretrain_lebert_crf_{1}_{2}",
}

for name in datasets:
    for trainset,pre in zip(task_trainset,pretrain):
        for i in range(per_task_count):
            run_tag = f"x{i+1}"
            # Fill every template with this run's values before training.
            model_args.update({
                key: template.format(name, trainset, run_tag, pre)
                for key, template in tasks_args.items()
            })
            trainer = NERTrainer(**model_args)

            # Exhaust the iterator returned by train(); its items are unused.
            for _ in trainer.train():
                pass

            torch.cuda.empty_cache()

