In [1]:
from datasets import load_dataset, load_from_disk
import os
from pathlib import Path
from typing import Dict
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    TrainingArguments,
    Seq2SeqTrainingArguments,
    Trainer,
    Seq2SeqTrainer,
    PreTrainedTokenizerFast,
)
from functools import partial
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM, Qwen2ForSequenceClassification
from peft import PrefixTuningConfig, get_peft_model, TaskType
from peft.peft_model import PeftModelForCausalLM, PeftModelForSequenceClassification
from rouge_score import rouge_scorer
import numpy as np
from transformers import GenerationConfig
import torch

In [2]:
# Directory the kernel is running in; the project paths used later are derived from it.
file_dir = Path.cwd()
file_dir

PosixPath('/root/llm_adv_qa/sft')

In [3]:
# Root of the local model cache. Set the MODEL_CACHE_DIR environment variable to
# point somewhere else; without it the original location is used unchanged.
cache_dir = os.environ.get("MODEL_CACHE_DIR", '/root/autodl-tmp')

In [4]:
# Location of the bert-base-chinese snapshot inside the ModelScope cache, as a plain string.
model_path = str(Path(cache_dir) / ".cache/modelscope/hub/tiansz/bert-base-chinese")
model_path

'/root/autodl-tmp/.cache/modelscope/hub/tiansz/bert-base-chinese'

In [5]:
# Both JSONL splits sit in <project root>/resources/dataset/classification.
classification_dir = file_dir.parent / "resources" / "dataset" / "classification"
train_file = classification_dir / "train.jsonl"
test_file = classification_dir / "test.jsonl"

dataset = load_dataset(
    "json",
    data_files={
        "train": train_file.as_posix(),
        "test": test_file.as_posix(),
    },
)

dataset

DatasetDict({
    train: Dataset({
        features: ['class', 'question', 'prompt'],
        num_rows: 1087
    })
    test: Dataset({
        features: ['class', 'question', 'prompt'],
        num_rows: 119
    })
})

In [6]:
# Shuffle with a fixed seed so the row order is reproducible from run to run.
dataset = dataset.shuffle(seed=1234)

In [7]:
# Load the tokenizer stored next to the checkpoint at model_path and show its concrete class.
tokenizer = AutoTokenizer.from_pretrained(model_path)
type(tokenizer)

transformers.models.bert.tokenization_bert_fast.BertTokenizerFast

In [8]:
# Raw column names of the train split; they are passed to remove_columns when the splits are tokenized.
original_cols = dataset["train"].column_names
original_cols

['class', 'question', 'prompt']

In [9]:
# Class letters in id order; the position of a letter is its integer class id.
id2label = dict(enumerate(["A", "B", "C", "D", "E", "F"]))

# Inverse lookup, derived so the two mappings can never disagree.
label2id = {letter: class_id for class_id, letter in id2label.items()}

In [10]:
def preprocess_training_data(
    example,
    tokenizer: PreTrainedTokenizerFast,
    max_source_len: int = 512,
    max_output_len: int = 1,
    ignore_pad_token_for_loss: bool = True,
) -> Dict:
    """Build one fixed-length causal-LM training example from a prompt/answer pair.

    Args:
        example: Mapping with the prompt under ``"question_prompt"`` and the
            target answer under ``"query"``, e.g.
            ``{"id": 0, "question_prompt": "xxxx", "question": "xxx", "query": "F"}``.
        tokenizer: Used to encode both texts; must define ``eos_token_id`` and
            ``pad_token_id``.
        max_source_len: Truncation length passed to the tokenizer for the prompt.
        max_output_len: Truncation length passed to the tokenizer for the answer.
        ignore_pad_token_for_loss: When True, prompt and padding positions of
            ``labels`` are set to -100; when False they hold ``pad_token_id``.

    Returns:
        ``{"input_ids": [...], "labels": [...]}``, both of length
        ``max_source_len + max_output_len + 1``. ``input_ids`` is laid out as
        ``X Y <eos> <pad>...`` and ``labels`` as ``<fill>...<fill> Y <eos> <fill>...``
        where ``<fill>`` is -100 or the pad id (see ``ignore_pad_token_for_loss``).

    Raises:
        ValueError: If the tokenizer has no EOS or PAD token id.
    """
    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    if eos_id is None or pad_id is None:
        raise ValueError(
            "tokenizer must define both eos_token_id and pad_token_id "
            f"(got eos_token_id={eos_id!r}, pad_token_id={pad_id!r})"
        )

    max_seq_len = max_source_len + max_output_len + 1

    x_ids = list(
        tokenizer.encode(
            text=example["question_prompt"],
            truncation=True,
            max_length=max_source_len,
        )
    )
    y_ids = list(
        tokenizer.encode(
            text=example["query"],
            truncation=True,
            max_length=max_output_len,
        )
    )

    target_ids = y_ids + [eos_id]
    pad_len = max_seq_len - len(x_ids) - len(target_ids)

    # Mask by POSITION, not by value: when the pad id equals the EOS id (or an
    # answer token id), a value-based mask would also hide those targets from the loss.
    fill = -100 if ignore_pad_token_for_loss else pad_id

    input_ids = x_ids + target_ids + [pad_id] * pad_len
    labels = [fill] * len(x_ids) + target_ids + [fill] * pad_len
    assert len(input_ids) == len(labels) == max_seq_len

    return {"input_ids": input_ids, "labels": labels}


In [11]:
def preprocess_training_data_for_cls(
    example,
    tokenizer: PreTrainedTokenizerFast,
    label2id: Dict,
) -> Dict:
    """Tokenize one classification row and attach its integer label.

    Args:
        example: Row with the input text under ``"prompt"`` and the class name
            under ``"class"``, e.g.
            ``{"prompt": "xxxx", "question": "xxx", "class": "F"}``.
        tokenizer: Called as ``tokenizer(text, truncation=True)``.
        label2id: Maps a class name to its integer id.

    Returns:
        The tokenizer output with an added ``"labels"`` entry, e.g.
        ``{"input_ids": [1,2,3,4,5], "attention_mask": [1,1,1,1,1], "labels": 0}``.
    """
    text, class_name = example["prompt"], example["class"]
    encoded = tokenizer(text, truncation=True)
    encoded["labels"] = label2id[class_name]
    return encoded


In [12]:
# Pre-bind the tokenizer and the label mapping so the result only needs the example argument.
preprocess_func = partial(preprocess_training_data_for_cls, tokenizer=tokenizer, label2id=label2id)

In [13]:
# preprocess_func already has the tokenizer bound, so it is handed to map() as is.
validation_set = dataset["test"].map(preprocess_func, remove_columns=original_cols)

In [14]:
# Inspect the tokenized validation split as a DataFrame.
validation_set.to_pandas()

Unnamed: 0,input_ids,token_type_ids,attention_mask,labels
0,"[101, 6435, 7309, 100, 7448, 2128, 4906, 2825,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,"[101, 6435, 7309, 100, 1071, 800, 2418, 3119, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5
2,"[101, 6435, 7309, 100, 9960, 2399, 5433, 2949,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[101, 6435, 7309, 100, 2123, 3797, 2548, 3208,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
4,"[101, 6435, 7309, 100, 8439, 2399, 2600, 6566,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4
...,...,...,...,...
114,"[101, 6435, 7309, 100, 6435, 2990, 897, 1298, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
115,"[101, 6435, 7309, 100, 2990, 897, 5650, 2533, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
116,"[101, 6435, 7309, 100, 704, 4906, 7032, 6568, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
117,"[101, 6435, 7309, 100, 7270, 3309, 5500, 3326,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5


In [15]:
# preprocess_func already has the tokenizer bound, so it is handed to map() as is.
train_set = dataset["train"].map(preprocess_func, remove_columns=original_cols)

In [16]:
# Inspect the tokenized training split as a DataFrame.
train_set.to_pandas()

Unnamed: 0,input_ids,token_type_ids,attention_mask,labels
0,"[101, 6435, 7309, 100, 2769, 2682, 4761, 6887,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
1,"[101, 6435, 7309, 100, 3800, 1085, 1765, 4157,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4
2,"[101, 6435, 7309, 100, 1762, 8439, 2399, 8024,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,"[101, 6435, 7309, 100, 1762, 6205, 2128, 3800,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4
4,"[101, 6435, 7309, 100, 1065, 2336, 7942, 3777,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
...,...,...,...,...
1082,"[101, 6435, 7309, 100, 945, 3345, 4294, 7415, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
1083,"[101, 6435, 7309, 100, 3343, 2336, 6237, 4636,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
1084,"[101, 6435, 7309, 100, 8439, 2399, 704, 6823, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1085,"[101, 6435, 7309, 100, 1762, 9960, 2399, 8024,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2


In [17]:
# The size of the classification head is taken from the label mapping, so the
# class count cannot drift away from id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)
type(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /root/autodl-tmp/.cache/modelscope/hub/tiansz/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


transformers.models.bert.modeling_bert.BertForSequenceClassification

In [18]:
# Display the loaded configuration.
model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/root/autodl-tmp/.cache/modelscope/hub/tiansz/bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "A",
    "1": "B",
    "2": "C",
    "3": "D",
    "4": "E",
    "5": "F"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_vers

In [24]:
# Training output goes to <project root>/resources/sft_models/classification (created on demand).
model_output_dir = file_dir.parent / "resources" / "sft_models" / "classification"
model_output_dir.mkdir(parents=True, exist_ok=True)
model_output_dir

PosixPath('/root/llm_adv_qa/resources/sft_models/classification')

In [25]:
# Sub-directory of model_output_dir that receives the final model after training.
best_model_output_dir = Path(model_output_dir, "best")

In [26]:
def compute_metrics(eval_pred, tokenizer: PreTrainedTokenizerFast):
    """Score generated token ids against reference token ids.

    Args:
        eval_pred: ``(predictions, labels)`` pair of integer token-id arrays;
            ``-100`` entries are replaced by the pad token id before decoding.
        tokenizer: Provides ``pad_token_id`` and ``batch_decode``.

    Returns:
        Dict with two values rounded to 4 decimals:
        ``accuracy`` - share of examples whose last decoded character equals the
        decoded label (case-insensitive, surrounding whitespace ignored);
        ``gen_len`` - mean number of non-pad tokens per prediction.
        Both are 0.0 when there is nothing to evaluate.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 is not a real token id, so swap it for the pad id before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    acc_total = len(predictions)
    if acc_total == 0:
        # Avoid a ZeroDivisionError / NaN mean on an empty evaluation set.
        return {"accuracy": 0.0, "gen_len": 0.0}

    acc_correct = 0
    for prediction, label in zip(decoded_preds, decoded_labels):
        prediction = prediction.strip()
        # An empty decode has no last character; it simply counts as a miss.
        if prediction and prediction[-1].upper() == label.strip().upper():
            acc_correct += 1

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    score_dict = {
        "accuracy": acc_correct / acc_total,
        "gen_len": np.mean(prediction_lens),
    }

    return {k: round(np.mean(v), 4) for k, v in score_dict.items()}


In [27]:
def compute_metrics_for_cls(eval_pred, tokenizer: PreTrainedTokenizerFast):
    """Compute top-1 accuracy from per-class scores.

    Args:
        eval_pred: ``(predictions, labels)`` pair; ``predictions`` holds one
            score vector per example and ``labels`` the integer class ids.
        tokenizer: Not used by this function; accepted so it can be bound the
            same way as ``compute_metrics``.

    Returns:
        ``{"accuracy": value}`` where value is the fraction of examples whose
        arg-max score index equals the label, or 0.0 for an empty evaluation set.
    """
    preds, labels = eval_pred
    acc_total = len(preds)
    if acc_total == 0:
        # Nothing to score; avoid dividing by zero.
        return {"accuracy": 0.0}

    predicted = np.argmax(np.asarray(preds), axis=-1)
    # ravel() keeps the comparison element-wise even if labels arrive as a column.
    acc_correct = int(np.count_nonzero(predicted == np.ravel(labels)))
    return {"accuracy": acc_correct / acc_total}

In [28]:
# Bind the tokenizer so the result takes only the eval_pred argument.
metrics_func = partial(compute_metrics_for_cls, tokenizer=tokenizer)

In [29]:
# NOTE: per the TrainingArguments documentation, step values below 1 are read as
# a fraction of the total number of training steps, so logging, evaluation and
# checkpointing below are all scheduled at the same 5% cadence.
args = TrainingArguments(
    output_dir=str(model_output_dir),
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=2e-5,
    warmup_ratio=0.05,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- logging / evaluation ---
    logging_steps=0.05,
    eval_strategy="steps",
    eval_steps=0.05,
    # --- checkpointing and best-model selection ---
    save_strategy="steps",
    save_steps=0.05,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
)

In [30]:
# Flip to True to shrink both splits to 8 rows for a quick end-to-end check.
smoke_run = False

if smoke_run:
    smoke_rows = range(8)
    validation_set = validation_set.select(smoke_rows)
    train_set = train_set.select(smoke_rows)

In [31]:
# `processing_class` is the replacement for Trainer's deprecated `tokenizer=`
# argument (passing `tokenizer=` emits a deprecation warning on construction).
trainer = Trainer(
    model=model,
    args=args,
    processing_class=tokenizer,
    train_dataset=train_set,
    eval_dataset=validation_set,
    # Pads each batch dynamically, since the examples were tokenized without padding.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=metrics_func,
)

  trainer = Trainer(


In [32]:
# Smoke mode: only run one evaluation pass to check that the pipeline is wired up.
if smoke_run:
    trainer.evaluate(eval_dataset=validation_set)

In [33]:
# Full run: fine-tune, then save the resulting model into best_model_output_dir
# and persist the trainer state.
if not smoke_run:
    trainer.train()
    trainer.save_model(output_dir=best_model_output_dir)
    trainer.save_state()

Step,Training Loss,Validation Loss,Accuracy
34,1.6521,1.376862,0.436975
68,1.223,1.040381,0.630252
102,0.617,0.252569,0.957983
136,0.2089,0.272385,0.941176
170,0.2224,0.202603,0.957983
204,0.2387,0.145921,0.97479
238,0.1563,0.011729,1.0
272,0.1095,0.074524,0.983193
306,0.0797,0.056931,0.991597
340,0.062,0.11768,0.966387


In [34]:
!pip install -qU modelscope

[0m

In [1]:
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv before reading the token.
load_dotenv()

# ModelScope access token, read from the environment so it is never hard-coded here.
# Raises KeyError if MODEL_SCOPE_AK is not set.
AK = os.environ['MODEL_SCOPE_AK']

In [None]:
from modelscope.hub.api import HubApi

# Log in to the ModelScope hub with the access token read above.
api = HubApi()
api.login(AK)

In [38]:
from modelscope.hub.constants import Licenses, ModelVisibility

username = 'Blackoutta'
model_name = 'bert-base-chinese-sft-intention'
model_id = username + "/" + model_name,
model_id[0]

'Blackoutta/bert-base-chinese-sft-intention'

In [None]:
api.push_model(
    model_id=model_id[0], # 如果model_id对应的模型库不存在，将会被自动创建
    model_dir=best_model_output_dir # 指定本地模型所在目录
)

2024-12-21 17:09:29,946 - modelscope - INFO - Creating new model [Blackoutta/bert-base-chinese-sft-intention]
2024-12-21 17:09:32,659 - modelscope - INFO - Pushing folder /root/llm_adv_qa/resources/sft_models/classification/best as model Blackoutta/bert-base-chinese-sft-intention.
2024-12-21 17:09:32,660 - modelscope - INFO - Total folder size 1.14 GB, this may take a while depending on actual pushing size...
2024-12-21 17:09:41,439 - modelscope - INFO - [master c14d01e] 'upload model'
 13 files changed, 42678 insertions(+), 47 deletions(-)
 delete mode 100644 README.md
 create mode 100644 config.json
 create mode 100644 configuration.json
 create mode 100644 model.safetensors
 create mode 100644 optimizer.pt
 create mode 100644 rng_state.pth
 create mode 100644 scheduler.pt
 create mode 100644 special_tokens_map.json
 create mode 100644 tokenizer.json
 create mode 100644 tokenizer_config.json
 create mode 100644 trainer_state.json
 create mode 100644 training_args.bin
 create mode 100