### 小样本实验
数据规模：0.25k 0.5k 1k 1.35k  
抽样方式：随机抽样，每次抽样前设置统一的随机种子（seed=2021）  
结果：每个模型训练3次，取最好的一次作为结果

#### weibo小样本数据实验

In [None]:
# Create the root directory that will hold the few-shot data samples.
import os

path = "data/few_shot"
# exist_ok=True keeps the cell re-runnable and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(path, exist_ok=True)
print("done!")

In [None]:
# Copy the Weibo data files into the few-shot working directory.
from shutil import copyfile
import os

data = ["train.json", "dev.json", "test.json", "labels.txt"]

# exist_ok=True keeps the cell re-runnable without a separate existence check.
os.makedirs("data/few_shot/weibo", exist_ok=True)
for f in data:
    copyfile(f"data/weibo/{f}", f"data/few_shot/weibo/{f}")

In [None]:
# Draw fixed-size random subsets of the Weibo training set and write each one
# to data/few_shot/weibo/train_<size>.json (one JSON object per line).
import json
import random

from tqdm import tqdm

# Subset sizes to generate; each size is also used as the output file suffix.
postfix = [250, 500, 1000, 1350]

# Load the original training set.
train_file = "data/few_shot/weibo/train.json"
train_data = []

with open(train_file, "r", encoding="utf-8") as reader:
    for line_no, line in enumerate(tqdm(reader, desc=f"load {train_file}"), start=1):
        line = line.strip()
        if not line:
            # A blank line is not a sample and would make json.loads raise.
            continue
        data = json.loads(line)
        text, label = data["text"], data["label"]
        # Explicit raise instead of assert so the check survives `python -O`.
        if len(text) != len(label):
            raise ValueError(
                f"{train_file}:{line_no}: text has {len(text)} items "
                f"but label has {len(label)}"
            )
        if len(text) > 0:
            train_data.append(data)
        else:
            # Empty samples are reported and left out of the sampling pool.
            print(text, label)

# Generate the subsets.
for post in postfix:
    if post > len(train_data):
        # random.sample would raise ValueError as well; say why the pool is short.
        raise ValueError(
            f"cannot sample {post} items: only {len(train_data)} non-empty "
            f"samples were loaded from {train_file}"
        )
    # Re-seed before every draw so each subset is reproducible on its own.
    random.seed(2021)
    data = random.sample(train_data, post)
    # The with-block flushes and closes the file on exit.
    with open(f"data/few_shot/weibo/train_{post}.json", "w", encoding="utf-8") as f:
        for line in tqdm(data, desc=f"{post}"):
            f.write(f"{json.dumps(line,ensure_ascii=False)}\n")

In [None]:
# Use BERT-base as the baseline.

# TODO: combine BERT and CRF directly, and save the pretrained model with
# Transformers directly.
# Method 1: Bert+CRF
# loader: cnloader
# model: Bert
# data: weibo train.json, 250-sample subset
# name: x1 x2 x3
import torch

from CC.trainer import NERTrainer

# Keyword arguments forwarded to NERTrainer. Each key appears exactly once:
# a repeated key in a dict literal is silently overridden by its last value.
# NOTE(review): both "loader" and "loader_name" are passed with the same
# value; confirm which one NERTrainer actually reads.
model_args = {
    "model_name": "Bert",
    "loader": "cn_loader",
    "num_epochs": 30,
    "num_gpus": [0, 1, 2, 3],
    "hidden_dim": 512,
    "bert_config_file_name": "./model/chinese_wwm_ext/config.json",
    "pretrained_file_name": "./model/chinese_wwm_ext/pytorch_model.bin",
    "max_seq_length": 150,
    "max_scan_num": 1000000,
    "train_file": "data/few_shot/weibo/train_250.json",
    "eval_file": "data/few_shot/weibo/dev.json",
    "test_file": "data/few_shot/weibo/test.json",
    "bert_vocab_file": "./model/chinese_wwm_ext/vocab.txt",
    "tag_file": "data/few_shot/weibo/labels.txt",
    "loader_name": "cn_loader",
    "batch_size": 8,
    "eval_batch_size": 64,
    "do_shuffle": True,
    "use_json": True,
    "output_eval": True,
    "task_name": "weibo_bert_crf_250_x2",
    "use_gpu": True,
    "debug": True,
}

trainer = NERTrainer(**model_args)

# Iterate trainer.train() to completion; the yielded values are not used here.
for _ in trainer.train():
    pass

# Release cached GPU memory held by PyTorch before the next run.
torch.cuda.empty_cache()



In [None]:
# data: weibo train.json, 500-sample subset
# name: x1 x2 x3
# Reuses model_args and NERTrainer from the earlier 250-sample cell and only
# overrides the training file and the task name.
import torch

model_args.update(
    train_file="data/few_shot/weibo/train_500.json",
    task_name="weibo_bert_crf_500_x1",
)

trainer = NERTrainer(**model_args)

# Iterate trainer.train() to completion; the yielded values are discarded.
for _ in trainer.train():
    continue

# Release cached GPU memory held by PyTorch once training has finished.
torch.cuda.empty_cache()