In [None]:
!pip install evaluate

In [1]:
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score, multilabel_confusion_matrix, roc_curve, auc
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle
import pandas as pd
import bitsandbytes, accelerate


2024-06-23 22:07:25.952996: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    d = {k: [dic[k] for dic in batch] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=float(pad_token_id)
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=float(pad_token_id)
    )
    d['labels'] = torch.stack(d['labels'])
    return d

In [3]:
# create custom trainer class to be able to pass label weights and calculate mutilabel loss
class CustomTrainer(Trainer):

    def __init__(self, label_weights, **kwargs):
        super().__init__(**kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=self.label_weights)
        return (loss, outputs) if return_outputs else loss

In [4]:
# disable progress bars when submitting
def is_interactive():
   return 'SHLVL' not in os.environ

if not is_interactive():
    def nop(it, *a, **k):
        return it

    tqdm = nop

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()
N_LABELS = 30 #主标签+辅助标签

# def preprocess(data):
#     '''
#     Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
#     '''
#     punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
#     def clean_special_chars(text, punct):
#         for p in punct:
#             text = text.replace(p, ' ')
#         return text

#     data = data.astype(str).apply(lambda x: clean_special_chars(x, punct))
#     return data

数据集构建

In [5]:
max_features = None

train = pd.read_csv('../dataset/train.csv')
# test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

# x_train = preprocess(train['comment_text'])
x_train = train['comment_text']
y_train = np.where(train['target'] >= 0.5, 1, 0)
# y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
y_aux_train = train[['target']+list(train.columns[3:32])]
y_aux_train = y_aux_train.applymap(lambda x: 1 if x > 0.5 else 0)


# 均衡化数据集
# 确定数量较少的类别
count_class_0 = np.sum(y_train == 0)
count_class_1 = np.sum(y_train == 1)

# 从数量较多的类别中随机抽样
index_class_0 = np.where(y_train == 0)[0]
index_class_1 = np.where(y_train == 1)[0]

# 从数量较多的类别中随机选择与数量较少的类别相同数量的样本
seed_everything()
selected_index_class_0 = np.random.choice(index_class_0, count_class_1, replace=False)

# 构建平衡的数据集
x_train = np.concatenate([x_train[selected_index_class_0], x_train[index_class_1]])
y_train = np.concatenate([y_train[selected_index_class_0], y_train[index_class_1]])
y_aux_train = np.concatenate([y_aux_train.iloc[selected_index_class_0], y_aux_train.iloc[index_class_1]])

# MAX_LEN = 220
# 划分训练集和测试集，按照9:1的比例
x_train, x_test, y_train, y_test, y_aux_train, y_aux_test = train_test_split(
    x_train, y_train, y_aux_train, test_size=0.1, random_state=1624)



In [6]:
# create hf dataset
ds = DatasetDict({
    'train': Dataset.from_dict({'text': x_train, 'labels': y_aux_train}),
    'val': Dataset.from_dict({'text': x_test, 'labels': y_aux_test})
})
label_weights = np.ones((N_LABELS))

In [7]:
# model name
model_name = 'bert-base-uncased'

# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
    tokenized_inputs = tokenizer(examples['text'])
    tokenized_inputs['labels'] = examples['labels']
    return tokenized_inputs

tokenizer = AutoTokenizer.from_pretrained('./BERT')
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')
pad_token_id = 0

# # quantization config
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,  # enable 4-bit quantization
#     bnb_4bit_quant_type='nf4',  # information theoretically optimal dtype for normally distributed weights
#     bnb_4bit_use_double_quant=True,  # quantize quantized weights //insert xzibit meme
#     bnb_4bit_compute_dtype=torch.bfloat16  # optimized fp format for ML
# )


Map:   0%|          | 0/259801 [00:00<?, ? examples/s]

Map:   0%|          | 0/28867 [00:00<?, ? examples/s]

In [26]:
# lora config
lora_config = LoraConfig(
    r=4,  # the dimension of the low-rank matrices
    inference_mode=False,
    lora_alpha=8,  # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules=[#'query', 
                    # 'key', 
                    'value',
                    'classifier.bias',
                    'classifier.weight'
                    'dense'
                   ],
    lora_dropout=0.1,  # dropout probability of the LoRA layers
    bias='none',  # whether to train bias weights, so 'none' for attention layers
    task_type='SEQ_CLS'
)

# load model
model = AutoModelForSequenceClassification.from_pretrained(
    './BERT',
    num_labels=N_LABELS
)
# print(model)
# model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
model.config.pad_token_id = pad_token_id

# define training args
training_args = TrainingArguments(
    output_dir = 'multilabel_classification',
    learning_rate = 5e-4,
    per_device_train_batch_size = 8, 
    per_device_eval_batch_size = 8,
    num_train_epochs = 2,
    weight_decay = 0.01,
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./BERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


trainable params: 96,798 || all params: 109,602,108 || trainable%: 0.0883


In [27]:
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = torch.sigmoid(torch.tensor(logits)).numpy() > 0.5
#     # labels = labels.numpy()
    
#     # # Calculate F1 scores
#     # f1_micro = f1_score(labels, predictions, average='micro')
#     # f1_macro = f1_score(labels, predictions, average='macro')
#     # f1_weighted = f1_score(labels, predictions, average='weighted')

#     # # Plot Confusion Matrix for each label
#     # conf_matrices = multilabel_confusion_matrix(labels, predictions)
#     # fig, ax = plt.subplots(1, len(conf_matrices), figsize=(15, 5))
#     # # if len(conf_matrices) > 1:
#     # #     for idx, cm in enumerate(conf_matrices):
#     # #         plot_confusion_matrix(cm, idx, ax[idx])
#     # # else:
#     # #     plot_confusion_matrix(conf_matrices[0], 0, ax)
#     # plt.tight_layout()
#     # plt.show()

#     # # Plot ROC Curves
#     # plot_multilabel_roc(labels, torch.sigmoid(torch.tensor(logits)).numpy(), num_classes=labels.shape[1])
    
#     return {}

from sklearn.metrics import accuracy_score, recall_score
def compute_metrics(pred):
    # 获取预测的概率值
    logits = pred.predictions
    probabilities = 1 / (1 + np.exp(-logits))  # sigmoid 转换

    # 获取真实的标签
    labels = pred.label_ids

    # 选择第一个类别的预测和真实标签
    class_index = 0  # 第一个类别的索引
    predictions_class = (probabilities[:, class_index] > 0.5).astype(int)
    labels_class = labels[:, class_index]

    # 计算第一个类别的准确度和召回率
    accuracy_class = accuracy_score(labels_class, predictions_class)
    recall_class = recall_score(labels_class, predictions_class)

    return {
        'accuracy_class_0': accuracy_class,
        'recall_class_0': recall_class,
    }

可以反复运行以下代码，起到增加epoch数量的作用

In [28]:
# train
trainer = CustomTrainer(
    model = model.cuda(),
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['val'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = compute_metrics,
    label_weights = torch.tensor(label_weights, device=model.device)
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy Class 0,Recall Class 0
1,0.0281,0.029688,0.840337,0.897926
2,0.0271,0.026909,0.854471,0.872455




TrainOutput(global_step=64952, training_loss=0.03149752416315888, metrics={'train_runtime': 3375.1759, 'train_samples_per_second': 153.948, 'train_steps_per_second': 19.244, 'total_flos': 4.438001751174108e+16, 'train_loss': 0.03149752416315888, 'epoch': 2.0})

In [29]:
# save model
peft_model_id = 'lora-bert-2epoch'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)



('lora-bert-2epoch/tokenizer_config.json',
 'lora-bert-2epoch/special_tokens_map.json',
 'lora-bert-2epoch/vocab.txt',
 'lora-bert-2epoch/added_tokens.json',
 'lora-bert-2epoch/tokenizer.json')

In [30]:
#再训练两轮
trainer.train()
# save model
peft_model_id = 'lora-bert-4epoch'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

Epoch,Training Loss,Validation Loss,Accuracy Class 0,Recall Class 0
1,0.0267,0.027255,0.854263,0.871319
2,0.0263,0.026224,0.857034,0.880788




('lora-bert-4epoch/tokenizer_config.json',
 'lora-bert-4epoch/special_tokens_map.json',
 'lora-bert-4epoch/vocab.txt',
 'lora-bert-4epoch/added_tokens.json',
 'lora-bert-4epoch/tokenizer.json')

In [None]:
在进行模型评测时，以下需要补充读出模型，输入训练集，进行预测的代码

这部分就交给你们啦（因为我单模型的precision和recall已经通过上面得到了）

我的训练集：标签均为0-1，共N_LABELS个（可以讨论只用toxicity或者identity的情况），模型的输出需要进一步sigmoid才会得到概率。在集成学习中，把得到的label综合起来