# 处理标签稀少或无标签的情况

### 应对标注数据缺失的常用方式：
<img src="WaytoDealingWithFeworNoLabel.png" width=800 height=600>

### 构建一个GitHub Issues标签器

In [None]:
# Load the GitHub issues dataset; lines=True reads the file as one JSON record per line.
import pandas as pd

dataset_url = "./github-issues-transformers.json"
df_issue = pd.read_json(dataset_url, lines=True)

# Print the (rows, columns) shape as a quick sanity check.
print(f"DataFrame Shape: {df_issue.shape}")

In [None]:
# Slice the data: show the row labelled 2, restricted to the columns of interest,
# as a single-column frame.
cols = ["url","id","title","user","labels","state","created_at","body"]
df_issue.loc[2,cols].to_frame()

In [None]:
# For this task only the name of each label matters, so every label-metadata
# record is replaced by its 'name' field.
def _label_names(label_records):
    """Return the 'name' entry of every label record in ``label_records``."""
    return [record['name'] for record in label_records]

df_issue['labels'] = df_issue['labels'].apply(_label_names)

df_issue[['labels']].head()

In [None]:
# Distribution of the number of labels per issue (how many issues carry
# 0, 1, 2, ... labels), displayed as a single row.

labels_per_issue = df_issue['labels'].apply(len)
labels_per_issue.value_counts().to_frame().T

In [None]:
# Count how many issues carry each label (explode yields one row per issue/label pair).
df_counts = df_issue['labels'].explode().value_counts()
print(f"Number of labels : {len(df_counts)}")
# Show the counts of the first 8 labels returned by value_counts()
df_counts.to_frame().head(8).T  

In [None]:
# Keep only a small, hand-picked set of labels and map them to normalized
# names so the classification task is easier to handle.

label_map = {
    "Core: Tokenization":"tokenization",
    "New model":"new model",
    "Core: Modeling":"model training",
    "Usage":"usage",
    "Core: Pipeline":"pipeline",
    "TensorFlow":"tensorflow or tf",
    "Pytorch":"pytorch",
    "Examples":"examples",
    "Documentation":"documentation"
}

def filter_labels(x):
    """Return the mapped names of the labels in ``x`` that are keys of ``label_map``.

    Labels that are not in ``label_map`` are dropped; the input order is kept.
    """
    kept = []
    for label in x:
        if label in label_map:
            kept.append(label_map[label])
    return kept

# Replace the raw label lists with their filtered / renamed counterparts
df_issue['labels'] = df_issue['labels'].apply(filter_labels)
# Every target label name, in label_map order
all_labels = [*label_map.values()]

In [None]:
# Inspect the distribution of the remapped labels
df_counts = df_issue['labels'].explode().value_counts()
df_counts.to_frame().T

In [None]:
# Count how many issues have no label: every row starts as "unlabeled" and rows
# with at least one label are re-tagged.
# NOTE(review): "labeld" is misspelled, but the same string is used when the
# supervised subset is selected later, so it must be changed in both places at once.
df_issue['split'] = "unlabeled"
mask = df_issue['labels'].apply(lambda x:len(x)) > 0
df_issue.loc[mask,"split"] = "labeld"
df_issue['split'].value_counts().to_frame()

In [None]:
# Spot-check one sample (position 26): print at most the first 500 characters /
# items of its title, body and labels.
for col in ['title','body','labels']:
    print(f"{col}: {df_issue[col].iloc[26][:500]}\n")

In [None]:
# Build the model input by joining the title and the body with a blank line.
def _title_and_body(row):
    """Concatenate the 'title' and 'body' fields of ``row`` separated by a blank line."""
    return row['title'] + "\n\n" + row['body']

df_issue['text'] = df_issue.apply(_title_and_body, axis=1)

In [None]:
# Drop rows whose 'text' is duplicated and report the fraction of rows removed
len_before = len(df_issue)
df_issue = df_issue.drop_duplicates(subset="text")
print(f"Removed {(len_before - len(df_issue)) / len_before:.2%} duplicates")

In [None]:
# Histogram of the number of whitespace-separated words per issue
import numpy as np
import matplotlib.pyplot as plt

# 50 bin edges evenly spaced between 0 and 500 words
(df_issue['text'].str.split().apply(len).hist(bins=np.linspace(0,500,50),grid=False,edgecolor="C0"))
plt.title("Words per Issue")
plt.xlabel("Number of words")
plt.ylabel("Number of Issues")
plt.show()

### 创建训练集

In [None]:
# Use sklearn's MultiLabelBinarizer so that one issue can carry several labels
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
# Fit on a single sample that contains every label name so all classes are known
mlb.fit([all_labels])
# Quick demonstration on two label lists
mlb.transform([["tokenization","new model"],["pytorch"]])

In [None]:
# Install the scikit-multilearn package (provides the skmultilearn module)
!pip install scikit-multilearn

In [None]:
# Split a multi-label dataset with skmultilearn's iterative_train_test_split
from skmultilearn.model_selection import iterative_train_test_split

def balanced_split(df,test_size=0.5):
    """Split ``df`` into two parts using ``iterative_train_test_split``.

    Row positions (not the rows themselves) are split, with the binarized
    ``labels`` column as the target matrix; the intent is to keep the
    per-label counts of both parts as balanced as possible.

    Args:
        df: DataFrame with a ``labels`` column accepted by ``mlb.transform``.
        test_size: Fraction forwarded to ``iterative_train_test_split``.

    Returns:
        A ``(first_part, second_part)`` tuple of DataFrames selected by position.
    """
    row_positions = np.arange(len(df)).reshape(-1, 1)
    label_matrix = mlb.transform(df['labels'])
    split = iterative_train_test_split(row_positions,label_matrix,test_size=test_size)
    # The call returns (X_train, y_train, X_test, y_test); only the index parts are needed
    first_positions, second_positions = split[0], split[2]
    return df.iloc[first_positions[:,0]],df.iloc[second_positions[:,0]]

In [None]:
# Build the supervised and unsupervised datasets; the supervised part is divided
# into train (50%), validation (25%) and test (25%) with balanced_split.
from sklearn.model_selection import train_test_split

df_clean = df_issue[['text','labels','split']].reset_index(drop=True).copy()
df_unsup = df_clean.loc[df_clean['split'] == "unlabeled",['text','labels']]
df_sup = df_clean.loc[df_clean['split'] == "labeld",['text','labels']]

# Fix the random seed so the split is reproducible
np.random.seed(0)
df_train,df_tmp = balanced_split(df_sup,test_size=0.5)
# Validation and test must come from the held-out half (df_tmp), not from
# df_train, otherwise both evaluation sets would be subsets of the training data.
df_valid,df_test = balanced_split(df_tmp,test_size=0.5)

In [None]:
# Wrap the pandas splits in a DatasetDict using the Dataset / DatasetDict classes
from datasets import Dataset, DatasetDict

frames_by_split = {
    "train":df_train,
    "valid":df_valid,
    "test":df_test,
    "unsup":df_unsup,
}

# Dataset.from_pandas() loads each frame; the index is reset first so every
# split starts from a fresh 0..n-1 index.
ds = DatasetDict({
    split_name:Dataset.from_pandas(frame.reset_index(drop=True))
    for split_name,frame in frames_by_split.items()
})

### 创建一个训练切片

In [None]:
# Build nested training slices of increasing size from ds['train'].
np.random.seed(0)
# Column vector of row indices into ds['train']; these indices are what gets split
all_indices = np.expand_dims(list(range(len(ds['train']))),axis=1)
indices_pool = all_indices
labels = mlb.transform(ds['train']['labels'])
train_samples = [8,16,32,64,128]
train_slices,last_k = [],0

for i,k in enumerate(train_samples):
    # Split off the (k - last_k) samples needed to grow the previous slice to size k;
    # the remaining pool and its labels are carried over to the next iteration
    indices_pool,labels,new_slice,_ = iterative_train_test_split(indices_pool,labels,(k-last_k)/len(labels))
    last_k = k
    if i == 0:
        train_slices.append(new_slice)
    else:
        # Each slice extends the previous one, so smaller slices are subsets of larger ones
        train_slices.append(np.concatenate((train_slices[-1],new_slice)))

In [None]:
# Use the complete training set as the final (largest) slice
train_samples.append(len(ds['train']))
train_slices.append(all_indices)
# Drop the singleton axis so every slice is a flat array of row indices
train_slices = [np.squeeze(index_column) for index_column in train_slices]

In [None]:
# Compare the requested slice sizes with the sizes actually obtained
print("Target split sizes:",train_samples)
print("Actual split sizes:",[len(x) for x in train_slices])

### 实现一个朴素贝叶斯（Naive Bayes）基线模型

为什么要实现一个有效的Baseline（基线）模型？主要有以下两个原因：
- 基于正则表达式、人工规则或者非常简单的模型构建的baseline，可能已经能够很好地解决问题。在这种情况下，就没有理由搬出像Transformer这样的*大家伙*，否则反而会给生产环境的部署和维护带来很多麻烦。
- baseline模型可以在你探索更复杂的模型时提供快速的检验。举个例子：你也许训练了一个BERT-large模型，并且在验证集上拿到了将近80%的准确率。你也许会认为这是一个困难的数据集，然后到此为止。但如果我们告诉你，一个很简单的分类器（例如逻辑回归）就能达到95%的准确率呢？你是不是会开始怀疑，并重新调试你的模型？

In [None]:
# Create the label-id column for a batch of the dataset
def prepare_labels(batch):
    """Add a ``label_ids`` entry holding ``mlb.transform(batch["labels"])`` and return the batch."""
    batch["label_ids"] = mlb.transform(batch["labels"])
    return batch

In [None]:
# Apply prepare_labels to every split, one batch at a time
ds = ds.map(prepare_labels,batched=True)

为了更好地评估我们的分类器，我们将采用*微平均（micro）*与*宏平均（macro）*F1分数：前者主要反映模型在高频标签上的性能，而后者对所有标签一视同仁、不考虑其出现频率。

In [None]:
from collections import defaultdict

# Per-model lists of F1 scores; each training loop appends one score per training slice
macro_scores,micro_scores = defaultdict(list),defaultdict(list)

In [None]:
# Train a Naive Bayes baseline on every training slice and score it on the test split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_extraction.text import CountVectorizer

for train_slice in train_slices:
    # Select the training slice and collect the train / test label matrices
    ds_train_sample = ds['train'].select(train_slice)
    y_train = np.array(ds_train_sample['label_ids'])
    y_test = np.array(ds['test']['label_ids'])
    # Encode the texts as token counts; the vectorizer is fitted on the training slice only
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(ds_train_sample['text'])
    X_test_counts = count_vect.transform(ds['test']['text'])
    # Create and train the model (BinaryRelevance wrapping a MultinomialNB base classifier)
    classifier = BinaryRelevance(classifier=MultinomialNB())
    classifier.fit(X_train_counts,y_train)
    # Predict on the test split and build the classification report
    y_pred_test = classifier.predict(X_test_counts)
    clf_report = classification_report(y_test,y_pred_test,target_names=mlb.classes_,zero_division=0,output_dict=True)
    # Record the macro / micro F1 scores for this slice
    macro_scores['Naive Bayes'].append(clf_report['macro avg']['f1-score'])
    micro_scores['Naive Bayes'].append(clf_report['micro avg']['f1-score'])

In [None]:
# Plot the micro / macro F1 curves against the number of training samples
import matplotlib.pyplot as plt

def plot_metrics(micro_scores,macro_scores,sample_size,current_model):
    """Draw micro and macro F1 curves for every recorded run, side by side.

    Args:
        micro_scores: Mapping of run name to its list of micro F1 scores.
        macro_scores: Mapping of run name to its list of macro F1 scores.
        sample_size: Training-set sizes used as x values and tick labels.
        current_model: Run drawn with a thick solid line; all other runs are dashed.
    """
    fig,axes = plt.subplots(1,2,figsize=(10,4),sharey=True)
    micro_ax,macro_ax = axes

    for run in micro_scores:
        # Emphasize the current model, de-emphasize everything else
        line_style = {"linewidth":2} if run == current_model else {"linestyle":"--"}
        micro_ax.plot(sample_size,micro_scores[run],label=run,**line_style)
        macro_ax.plot(sample_size,macro_scores[run],label=run,**line_style)

    micro_ax.set_title("Micro F1 Score")
    macro_ax.set_title("Macro F1 Score")
    micro_ax.set_ylabel("Test set F1 Score")
    micro_ax.legend(loc="lower right")
    for axis in (micro_ax,macro_ax):
        axis.set_xlabel("Number of training samples")
        # Log-scaled x axis with one tick per sample size
        axis.set_xscale('log')
        axis.set_xticks(sample_size)
        axis.set_xticklabels(sample_size)
        axis.minorticks_off()
    plt.tight_layout()
    plt.show()

In [None]:
# Plot the F1 curves recorded so far, highlighting the "Naive Bayes" run
plot_metrics(micro_scores,macro_scores,train_samples,"Naive Bayes")

### 处理无标签

In [None]:
# Load a pretrained BERT model in a fill-mask pipeline to predict masked tokens
from transformers import pipeline

pipe = pipeline("fill-mask",model="bert-base-uncased")

In [None]:
# Sanity-check the pipeline: print each predicted token with its score
movie_desc = "The main characters of the movie madacascar are a lion,a zebra,a giraffe,and a hippo."
prompt = "The movie is about [MASK]."

ouptut = pipe(movie_desc + prompt)
for element in ouptut:
    # The score is a probability; format it as a percentage instead of appending
    # "%" to the raw fraction.
    print(f"Token {element['token_str']}:\t{element['score']:.3%}")

In [None]:
# Use the targets argument to restrict the prediction to the given candidate tokens
ouptut = pipe(movie_desc + prompt,targets=["animals","cars"])
for element in ouptut:
    # The score is a probability; format it as a percentage instead of appending
    # "%" to the raw fraction.
    print(f"Token {element['token_str']}:\t{element['score']:.3%}")

In [None]:
# Try a different description to check the model's prediction for "cars"
movie_desc = "In the movie transformers aliens can morph into a wide range of vehicles."

ouptut = pipe(movie_desc + prompt,targets=["animals","cars"])
for element in ouptut:
    # The score is a probability; format it as a percentage instead of appending
    # "%" to the raw fraction.
    print(f"Token {element['token_str']}:\t{element['score']:.3%}")

In [None]:
# Load an NLI (Natural Language Inference) model as a zero-shot classifier.
# No model name is given, so the pipeline's default checkpoint is used.
# NOTE(review): device=0 presumably selects the first GPU — confirm one is available.
pipe_nli = pipeline("zero-shot-classification",device=0)

In [None]:
# Zero-shot test on a single sample from the GitHub dataset
sample = ds['train'][0]
print(f"Labels: {sample['labels']}")
# multi_label=True is passed so the candidate labels are scored for a multi-label task
output = pipe_nli(sample['text'],all_labels,multi_label=True)
# Show the first 400 characters of the classified text
print(output['sequence'][:400])

print("\nPredictions:")

for label,score in zip(output['labels'],output['scores']):
    print(f"{label} : {score:.2%}")

In [None]:
# Zero-shot prediction helper, applied to the validation split
def zero_shot_pipeline(example):
    """Run the NLI pipeline on ``example['text']`` and store its output on the example.

    Adds ``predicted_labels`` (the pipeline's ``labels``) and ``scores`` (the
    pipeline's ``scores``) to ``example`` and returns it.
    """
    nli_result = pipe_nli(example['text'],all_labels,multi_label=True)
    for target_key,source_key in (('predicted_labels','labels'),('scores','scores')):
        example[target_key] = nli_result[source_key]
    return example

ds_zero_shot = ds['valid'].map(zero_shot_pipeline)

使用以下方式来控制每个样本中应该声明的标签：
- 定义一个阈值并选择高于该阈值的标签
- 使用$top_k$最高分数来获取$top_k$标签

In [None]:
# Turn zero-shot scores into a multi-hot prediction vector
def get_preds(example,threshold=None,topk=None):
    """Select predicted labels by score threshold or by top-k and binarize them.

    Args:
        example: Mapping with ``predicted_labels`` and ``scores`` of equal length.
            The top-k mode takes the first ``topk`` labels, so it relies on the
            labels being ordered by descending score (assumed to be how the
            zero-shot pipeline returns them — TODO confirm).
        threshold: Keep every label whose score is strictly greater than this
            value. Takes precedence over ``topk`` when both are given.
        topk: Keep the first ``topk`` labels (fewer if not enough labels exist).

    Returns:
        ``{"pred_label_ids": [...]}`` holding the ``mlb``-encoded selection.

    Raises:
        ValueError: If neither ``threshold`` nor ``topk`` is provided.
    """
    # Compare against None so that a threshold of 0 (or topk of 0) is honoured
    # instead of being treated as "not provided".
    if threshold is not None:
        preds = [
            label
            for label,score in zip(example['predicted_labels'],example['scores'])
            if score > threshold
        ]
    elif topk is not None:
        # Slicing (instead of indexing) tolerates topk larger than the number of labels
        preds = list(example['predicted_labels'][:max(topk,0)])
    else:
        raise ValueError("Please provide either a threshold or topk value")
    return {"pred_label_ids":list(np.squeeze(mlb.transform([preds])))}

In [None]:
# Classification-report helper
def get_clf_report(ds):
    """Return the classification report (as a dict) comparing ``label_ids`` with ``pred_label_ids``."""
    targets,predictions = (np.array(ds[column]) for column in ('label_ids','pred_label_ids'))
    return classification_report(
        targets,
        predictions,
        target_names=mlb.classes_,
        zero_division=0,
        output_dict=True,
    )

In [None]:
# Plot the F1 scores obtained when keeping the top-k predicted labels
macros,micros = [],[]
topks = [1,2,3,4]
for topk in topks:
    # Recompute pred_label_ids for this k, then score against label_ids
    ds_zero_shot = ds_zero_shot.map(get_preds,batched=False,fn_kwargs={"topk":topk})
    clf_report = get_clf_report(ds_zero_shot)
    macros.append(clf_report['macro avg']['f1-score'])
    micros.append(clf_report['micro avg']['f1-score'])

plt.plot(topks,macros,label="Macro F1 Score",linewidth=2)
plt.plot(topks,micros,label="Micro F1 Score",linewidth=2,linestyle="dashed")
plt.xlabel("Top-k")
plt.ylabel("F1 Score")
plt.legend(loc="best")
plt.show()

In [None]:
# Plot the F1 scores obtained when keeping labels whose score exceeds a threshold
macros,micros = [],[]
# 100 thresholds evenly spaced from 0.01 to 1
thresholds = np.linspace(0.01,1,100)
for threshold in thresholds:
    # Recompute pred_label_ids for this threshold, then score against label_ids
    ds_zero_shot = ds_zero_shot.map(get_preds,fn_kwargs={"threshold":threshold})
    clf_report = get_clf_report(ds_zero_shot)
    macros.append(clf_report['macro avg']['f1-score'])
    micros.append(clf_report['micro avg']['f1-score'])
plt.plot(thresholds,macros,label="Macro F1 Score",linewidth=2)
plt.plot(thresholds,micros,label="Micro F1 Score",linewidth=2,linestyle="dashed")
plt.xlabel("Threshold")
plt.ylabel("F1 Score")
plt.legend(loc="best")
plt.show()

In [None]:
# Report the threshold that maximizes each F1 variant
micro_best_index = np.argmax(micros)
best_t = thresholds[micro_best_index]
best_micro = np.max(micros)
print(f"Best threshold (micro): {best_t} with F1-score {best_micro:.2f}.")

macro_best_index = np.argmax(macros)
best_t = thresholds[macro_best_index]
best_macro = np.max(macros)
print(f"Best threshold (macro): {best_t} with F1-score {best_macro:.2f}.")

In [None]:
# Compare the zero-shot model (top-1 selection) with Naive Bayes on the test split
ds_zero_shot = ds['test'].map(zero_shot_pipeline)
ds_zero_shot = ds_zero_shot.map(get_preds,fn_kwargs={"topk":1})
clf_report = get_clf_report(ds_zero_shot)

# The zero-shot result does not depend on the training slice, so the same score is
# recorded once per slice to give a curve with as many points as the other runs.
for train_slice in train_slices:
    macro_scores['Zero-shot'].append(clf_report['macro avg']['f1-score'])
    micro_scores['Zero-shot'].append(clf_report['micro avg']['f1-score'])

plot_metrics(micro_scores,macro_scores,train_samples,"Zero-shot")

### 处理少数标签

常见的两个技巧：
- 回译（Back translation）：其核心思想是将源文本翻译成一种或多种其他语言，再翻译回源语言。这个技巧在高资源语言或不包含太多领域专有术语的语料库上效果最好。
- 词元扰动：从训练集中取出一段文本，对其随机施加简单的变换，例如“删除”、“替换”、“插入”、“交换”等。  
可使用的库：
- 使用NlpAug库以实现回译
- 使用TextAttack库以实现词元扰动

In [None]:
from transformers import set_seed
import nlpaug.augmenter.word as naw

set_seed(0) # fix the random seed
# Contextual word-embedding augmenter: substitutes words using distilbert-base-uncased, on CPU
aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased',device="cpu",action="substitute")

In [None]:
# Try the augmenter on a single sentence
text = "Transformers are the most popular toys"
print(f"Original: {text}")
# augment() is indexed with [0] to take the first augmented variant
print(f"Augmented: {aug.augment(text)[0]}")

In [None]:
# Data-augmentation function for batched Dataset.map
def augment_text(batch):
    """Return every text of ``batch`` followed by one augmented copy of it.

    Each original text and its augmented variant share the same ``label_ids``,
    so the returned batch is twice as long as the input batch.
    """
    texts,label_ids = [],[]
    for original,labels in zip(batch['text'],batch['label_ids']):
        texts.extend([original,aug.augment(original)[0]])
        label_ids.extend([labels,labels])
    return {"text":texts,"label_ids":label_ids}

In [None]:
# Apply the augmentation to the current training sample; the old columns are removed
# because augment_text returns a different number of rows than it receives, then shuffle.
ds_train_sample = ds_train_sample.map(augment_text,batched=True,remove_columns=ds_train_sample.column_names).shuffle(seed=42)

### 使用Embedding作为查询表

> 诸如GPT-3的大语言模型已经展示了其在有限数据下解决问题的能力。其中的原因是这些模型能够学习到有用的文本表示，并将文本在多个维度上进行编码，例如：情感(*Sentiment*)、话题(*Topic*)、文本结构(*Text Structure*)等。出于这些原因，这些大语言模型的嵌入可用于开发语义搜索引擎、寻找相似的文档或评论，甚至是文本分类。

接下来要做的事情：
- 利用语言模型（此处为GPT-2）来嵌入所有已标注的文本
- 对所有存储的嵌入使用邻近值搜索
- 聚合所有邻近的标签以获得一个预测

In [None]:
# Load a GPT-2 checkpoint and its tokenizer; they are used to compute text embeddings
import torch
from transformers import AutoTokenizer,AutoModel
model_ckpt = "miguelvictor/python-gpt2-large"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
# Mean-pooling helper
def mean_pooling(model_output,attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable whose first element holds the token embeddings.
        attention_mask: Mask with non-zero entries at the positions to keep; it is
            broadcast over the embedding dimension.

    Returns:
        The per-sequence sum of unmasked embeddings divided by the mask sum
        (clamped to at least 1e-9 to avoid dividing by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the mask to the shape of the embeddings
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_counts

In [None]:
# Text-embedding function for batched Dataset.map
def embed_text(examples):
    """Embed ``examples['text']`` with the model and mean-pool over the tokens.

    Texts are padded to a common length and truncated to 128 tokens; the forward
    pass runs without gradient tracking.

    Returns:
        ``{"embedding": ndarray}`` with one pooled vector per input text.
    """
    encoded = tokenizer(
        examples['text'],
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model(**encoded)
    sentence_vectors = mean_pooling(outputs,encoded['attention_mask'])
    return {"embedding":sentence_vectors.cpu().numpy()}

In [None]:
# Compute the embedding of every text in the train / valid / test splits.
# embed_text pads each batch, so a pad token must be defined; the EOS token is reused
# (presumably because this GPT-2 tokenizer ships without a pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token
embs_train = ds['train'].map(embed_text,batched=True,batch_size=16)
embs_valid = ds['valid'].map(embed_text,batched=True,batch_size=16)
embs_test = ds['test'].map(embed_text,batched=True,batch_size=16)

In [None]:
# Build a FAISS index over the "embedding" column of the training embeddings
embs_train.add_faiss_index("embedding")

In [None]:
# Search parameters
i,k = 0,3 # query with validation sample 0 and retrieve its 3 nearest neighbours
rn,nl = "\r\n\r\n","\n" # collapse double CRLF line breaks to a single newline when printing

# The query must come from the validation set (the one printed below), not from
# the indexed training set, otherwise the query would simply retrieve itself.
query = np.array(embs_valid[i]['embedding'],dtype=np.float32)
scores,samples = embs_train.get_nearest_examples("embedding",query,k=k)

print(f"QUERY LABELS: {embs_valid[i]['labels']}")
print(f"QUERY TEXT:\n{embs_valid[i]['text'][:200].replace(rn,nl)} [...]\n")
print("="*50)
print(f"Retrieved documents:")
# Iterate over the retrieved neighbours (`samples`), not over an unrelated `sample`
for score,label,text in zip(scores,samples['labels'],samples['text']):
    print("="*50)
    print(f"TEXT:\n{text[:200].replace(rn,nl)} [...]")
    print(f"SCORE: {score:.2f}")
    print(f"LABELS: {label}")

In [None]:
# Prediction from a set of retrieved neighbours
def get_sample_pred(sample,m):
    """Predict every label that occurs in at least ``m`` of the retrieved neighbours.

    Args:
        sample: Mapping whose ``label_ids`` entry holds one multi-hot row per neighbour.
        m: Minimum number of neighbours that must carry a label for it to be predicted.

    Returns:
        Integer array with 1 for each predicted label and 0 otherwise.
    """
    votes_per_label = np.sum(sample['label_ids'],axis=0)
    has_enough_votes = votes_per_label >= m
    return has_enough_votes.astype(int)

# Grid search over the number of neighbours k and the vote threshold m
def find_best_k_m(ds_train,valid_queries,valid_labels,max_k=17):
    """Evaluate every (k, m) combination with 1 <= m <= k < max_k.

    Args:
        ds_train: Dataset with a nearest-neighbour index on its "embedding" column.
        valid_queries: Query embeddings passed to the index.
        valid_labels: Ground-truth label matrix for the queries.
        max_k: Exclusive upper bound for k; capped at ``len(ds_train)``.

    Returns:
        ``(perf_micro, perf_macro)``: two ``(max_k, max_k)`` arrays whose entry
        ``[k, m]`` is the micro / macro F1 score; untested cells stay 0.
    """
    max_k = min(len(ds_train),max_k)
    perf_micro = np.zeros((max_k,max_k))
    perf_macro = np.zeros((max_k,max_k))
    for k in range(1,max_k):
        # The retrieved neighbours depend only on k, so search once per k rather
        # than once per (k, m) pair.
        _,samples = ds_train.get_nearest_examples_batch("embedding",valid_queries,k=k)
        for m in range(1,k+1):
            y_pred = np.array([get_sample_pred(sample,m) for sample in samples])
            clf_report = classification_report(valid_labels,y_pred,target_names=mlb.classes_,zero_division=0,output_dict=True)
            perf_micro[k,m] = clf_report['micro avg']['f1-score']
            perf_macro[k,m] = clf_report['macro avg']['f1-score']
    return perf_micro,perf_macro

In [None]:
# Collect the validation labels and query embeddings, then run the (k, m) grid search
valid_labels = np.array(embs_valid['label_ids'])
valid_queries = np.array(embs_valid['embedding'],dtype=np.float32)
perf_micro,perf_macro = find_best_k_m(embs_train,valid_queries,valid_labels)

# Show the micro / macro F1 scores as heat maps over (k, m)
fig,(ax0,ax1) = plt.subplots(1,2,figsize=(10,3.5),sharey=True)
ax0.imshow(perf_micro)
ax1.imshow(perf_macro)

ax0.set_title("micro scores")
ax0.set_ylabel("k")
ax1.set_title("macro scores")
for ax in [ax0,ax1]:
    # The limits start at 0.5 to hide the unused row/column 0; the hard-coded 17
    # matches the default max_k of find_best_k_m.
    ax.set_xlim([0.5,17 -0.5])
    ax.set_ylim([17 - 0.5,0.5])
    ax.set_xlabel("m")
plt.show()

通过上述图表我们可以发现一个规律：对于给定的$k$值，$m$选得过大或过小都会得到次优的结果。最佳性能大约出现在$\frac{m}{k} = \frac{1}{3}$附近。不过手动从图中读取最佳值比较麻烦，我们可以使用Numpy中的`unravel_index()`函数来直接获取最佳的$k$和$m$值。

In [None]:
# Convert the flat index of the best micro F1 score back into its (k, m) coordinates
k,m = np.unravel_index(perf_micro.argmax(),perf_micro.shape)
print(f"Best k: {k}, Best m: {m}")

In [None]:
# Repeat the embedding lookup for every training slice.
# The index on the full training set is dropped first; each slice gets its own index below.
embs_train.drop_index("embedding")
test_label = np.array(embs_test['label_ids'])
test_queries = np.array(embs_test['embedding'],dtype=np.float32)

for train_slice in train_slices:
    # Build a FAISS index from the current training slice
    embs_train_tmp = embs_train.select(train_slice)
    embs_train_tmp.add_faiss_index("embedding")
    # Pick the best k and m on the validation set (by micro F1)
    perf_micro,_ = find_best_k_m(embs_train_tmp,valid_queries,valid_labels)
    k,m = np.unravel_index(perf_micro.argmax(),perf_micro.shape)
    # Predict on the test set with the selected k and m
    _,samples = embs_train_tmp.get_nearest_examples_batch("embedding",test_queries,k=int(k))

    y_pred = np.array([get_sample_pred(sample,m) for sample in samples])
    clf_report = classification_report(test_label,y_pred,target_names=mlb.classes_,zero_division=0,output_dict=True)
    macro_scores['Embedding'].append(clf_report['macro avg']['f1-score'])
    micro_scores['Embedding'].append(clf_report['micro avg']['f1-score'])

In [None]:
# Plot the F1 curves recorded so far, highlighting the "Embedding" run
plot_metrics(micro_scores,macro_scores,train_samples,"Embedding")

### 为什么FAISS能够提供更快的相似度搜索？
> FAISS在处理这个过程上有几个取巧的地方：
> - 将向量数据库中的数据进行随机分区，这将显著减少搜索空间。
> - 在随机打乱后我们仍然无法确定哪个分区是应该被查询的，FAISS使用*K-Means*聚类算法来匹配从查询到向量数据库中最近的聚类分区。  

#### FAISS的实现原理
有了这样的分组之后，在$n$个向量中搜索就简单多了：我们首先在$k$个[质心](https://baike.baidu.com/item/%E8%B4%A8%E5%BF%83/2509882)中找出距离查询最近的那一个（$k$次比较），然后只在该质心所属的组内进行搜索（约$\frac{n}{k}$次比较）。这将把比较次数从$n$减少到$k + \frac{n}{k}$。可问题来了：最佳的$k$值是多少呢？如果$k$太小，每个组中的样本数量将会激增，组内搜索的比较次数随之变多；如果$k$太大，又会产生过多的质心，与质心比较的开销随之变大。实际上，最佳的$k$值就是使函数$f(k) = k + \frac{n}{k}$取得最小值的位置，即$k = \sqrt{n}$。

### 训练一个“香草味”的Transformer

In [None]:
# Load the tokenizer of a pretrained BERT checkpoint and tokenize the dataset
import torch
from transformers import (AutoTokenizer,AutoConfig,AutoModelForSequenceClassification)

model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize(batch):
    """Tokenize ``batch['text']``, truncating every text to at most 128 tokens."""
    return tokenizer(batch['text'],truncation=True,max_length=128)

# Tokenize every split, then drop the raw text / label-name columns
ds_enc = ds.map(tokenize,batched=True)
ds_enc = ds_enc.remove_columns(["text","labels"])

In [None]:
# Return PyTorch tensors from the dataset
ds_enc.set_format("torch")
# Cast the label ids to float so training does not fail on the label dtype; the
# cast goes through a temporary column that is then renamed back to "label_ids"
ds_enc = ds_enc.map(lambda x:{"label_ids_f":x['label_ids'].to(torch.float)},remove_columns=["label_ids"])
ds_enc = ds_enc.rename_column("label_ids_f","label_ids")

In [None]:
# Training arguments shared by the fine-tuning loops
from transformers import TrainingArguments

# Evaluation, checkpointing and logging all happen once per epoch;
# save_total_limit=1 limits how many checkpoints are kept on disk.
training_args_fine_tune = TrainingArguments(
    output_dir = "./results",
    num_train_epochs=20,
    learning_rate=3e-5,
    lr_scheduler_type="constant",
    per_device_train_batch_size=4,
    weight_decay=0.0,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    log_level="error")

In [None]:
from scipy.special import expit as sigmoid

# Metric function passed to the Trainer
def compute_metrics(pred):
    """Compute micro and macro F1 from the raw predictions of a Trainer.

    Args:
        pred: Object exposing ``label_ids`` (ground truth) and ``predictions``
            (logits), as returned by ``trainer.predict`` or passed during evaluation.

    Returns:
        Dict with the keys ``"micro f1"`` and ``"macro f1"``.
    """
    y_true = pred.label_ids
    # Map the logits to probabilities and binarize them at 0.5
    y_pred = sigmoid(pred.predictions)
    y_pred = (y_pred > 0.5).astype(float)
    # The label columns come from mlb.transform, so the class names must be taken
    # from mlb.classes_ (as in the other reports); all_labels follows the
    # label_map order, which would attach the wrong names to the per-class rows.
    clf_dict = classification_report(y_true,y_pred,target_names=mlb.classes_,zero_division=0,output_dict=True)
    return {
        "micro f1":clf_dict['micro avg']['f1-score'],
        "macro f1":clf_dict['macro avg']['f1-score']
    }

In [None]:
from transformers import Trainer
# Start from the checkpoint's configuration and set it up for multi-label
# classification with one output per label
config = AutoConfig.from_pretrained(model_ckpt)
config.num_labels = len(all_labels)
config.problem_type = "multi_label_classification"
        

In [None]:
# Fine-tune a fresh classifier on every training slice and score it on the test split
for train_slice in train_slices:
    # Reload the pretrained weights so every slice starts from the same checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,config=config)
    trainer = Trainer(
        model=model,tokenizer=tokenizer,
        args=training_args_fine_tune,
        compute_metrics=compute_metrics,
        train_dataset=ds_enc['train'].select(train_slice),
        eval_dataset=ds_enc['valid'])
    trainer.train()
    # Score the trained model on the test split and record both F1 variants
    pred = trainer.predict(ds_enc['test'])
    metrics = compute_metrics(pred)
    micro_scores['Fine-tune (vanilla)'].append(metrics['micro f1'])
    macro_scores['Fine-tune (vanilla)'].append(metrics['macro f1'])


In [None]:
# Plot the F1 curves recorded so far, highlighting the "Fine-tune (vanilla)" run
plot_metrics(micro_scores,macro_scores,train_samples,"Fine-tune (vanilla)")

### 将BERT微调成一个掩码语言模型

In [None]:
# Tokenization function that also returns the special-tokens mask.
# NOTE: this redefines the tokenize function declared for the classification step.
def tokenize(batch):
    """Tokenize ``batch['text']`` (max 128 tokens) and include ``special_tokens_mask``."""
    return tokenizer(batch['text'],truncation=True,max_length=128,return_special_tokens_mask=True)


# Tokenize every split and keep only the tokenizer outputs
ds_mlm = ds.map(tokenize,batched=True)
ds_mlm = ds_mlm.remove_columns(["text","labels","label_ids"])

In [None]:
# Create the data collator for masked language modeling
from transformers import DataCollatorForLanguageModeling,set_seed

# mlm_probability=0.15 is the masking probability handed to the collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm_probability=0.15)

In [None]:
# Fix the seed so the demonstration below is reproducible
set_seed(3)
# Use NumPy outputs for this demonstration
data_collator.return_tensors = "np"
inputs = tokenizer("Transformers are awesome!",return_tensors="np")

outputs = data_collator([{"input_ids":inputs['input_ids'][0]}])

# Show the original tokens next to the collator's output tokens and labels
pd.DataFrame(
    {
        "Original tokens":tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]),
        "Masked tokens":tokenizer.convert_ids_to_tokens(outputs['input_ids'][0]),
        "Labels":outputs['labels'][0]
    }
).T

In [None]:
# Switch the data collator back to returning PyTorch tensors
data_collator.return_tensors = "pt"

In [None]:
# Log in to the Hugging Face Hub from the notebook
# (presumably with a token that has write access, needed for push_to_hub — TODO confirm)
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Set up the masked-language-model training (domain adaptation)
from transformers import AutoModelForMaskedLM

# Log and evaluate once per epoch; save_strategy="no" means no intermediate
# checkpoints are written during training.
training_args = TrainingArguments(
    output_dir=f"{model_ckpt}-issue-128",
    per_device_train_batch_size=32,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="no",
    num_train_epochs=16,
    push_to_hub=True,
    log_level="error",
    report_to="none"
)

# Train on the unlabeled split and evaluate on the (labeled) training split
trainer = Trainer(
    model=AutoModelForMaskedLM.from_pretrained(model_ckpt),
    args=training_args,
    data_collator=data_collator,
    train_dataset=ds_mlm['unsup'],
    eval_dataset=ds_mlm['train']
)

In [None]:
# Run the masked-language-model training
trainer.train()
# The training arguments use save_strategy="no", so training alone writes no
# weights to output_dir. Save the final model there and upload it, because the
# classifier section reloads it from f"{model_ckpt}-issue-128".
trainer.push_to_hub("Training complete!")

In [None]:
# Plot the loss curves from the trainer's log history

df_log = pd.DataFrame(trainer.state.log_history)
# Rows with an 'eval_loss' value give the validation curve
(
    df_log.dropna(subset=['eval_loss']).reset_index()['eval_loss'].plot(label="Validation")
)
# Rows with a 'loss' value give the training curve
df_log.dropna(subset=['loss']).reset_index()['loss'].plot(label="Train",linestyle="--")

plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(loc="upper right")
plt.show()

### 微调一个BERT分类器

In [None]:
# Point model_ckpt at the domain-adapted checkpoint (the MLM output directory name)
# and configure it for multi-label classification
model_ckpt = f"{model_ckpt}-issue-128"
config = AutoConfig.from_pretrained(model_ckpt)
config.num_labels = len(all_labels)
config.problem_type = "multi_label_classification"

In [None]:
# Fine-tune a classifier from the domain-adapted checkpoint on every training slice
for train_slice in train_slices:
    # Reload the checkpoint so every slice starts from the same weights
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,config=config)
    trainer = Trainer(
        model=model,tokenizer=tokenizer,
        args=training_args_fine_tune,
        compute_metrics=compute_metrics,
        train_dataset=ds_enc['train'].select(train_slice),
        eval_dataset=ds_enc['valid'])
    trainer.train()
    # Score the trained model on the test split and record both F1 variants
    pred = trainer.predict(ds_enc['test'])
    metrics = compute_metrics(pred)
    micro_scores['Fine-tune (DA)'].append(metrics['micro f1'])
    macro_scores['Fine-tune (DA)'].append(metrics['macro f1'])

In [None]:
# Plot the F1 curves recorded so far, highlighting the "Fine-tune (DA)" run
plot_metrics(micro_scores,macro_scores,train_samples,"Fine-tune (DA)")