In [5]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# =========================
# 0) 配置
# =========================
CSV_PATH = "/root/autodl-tmp/CommitFit/dataset/Ghadhab/dataset.csv"
MODEL_NAME = "/root/autodl-tmp/models/codet5-base"

DIFF_COL = "diffs"     # 你的 diff 列
MESSAGE_COL = "msgs"   # 你的 commit message 列
LABEL_COL = "labels"   # 你的 label 列（可选，没有也行）

max_source_len = 512
max_target_len = 64
max_diff_chars = 1500

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# =========================
# 1) label -> control token
# =========================
def label_to_control(label):
    """
    支持 label 为:
      - int: 0/1/2
      - str: "0"/"1"/"2"
      - str: "LABEL_0"/"LABEL_1"/"LABEL_2"
      - str: "Adaptive"/"Perfective"/"Corrective"
    """
    if isinstance(label, (np.integer, int)):
        label = int(label)
        return {0: "[Adaptive]", 1: "[Perfective]", 2: "[Corrective]"}[label]

    s = str(label).strip()
    if s in {"0", "1", "2"}:
        return {"0": "[Adaptive]", "1": "[Perfective]", "2": "[Corrective]"}[s]
    if s in {"LABEL_0", "LABEL_1", "LABEL_2"}:
        return {"LABEL_0": "[Adaptive]", "LABEL_1": "[Perfective]", "LABEL_2": "[Corrective]"}[s]

    s_low = s.lower()
    if "adaptive" in s_low:
        return "[Adaptive]"
    if "perfective" in s_low:
        return "[Perfective]"
    if "corrective" in s_low:
        return "[Corrective]"

    raise ValueError(f"Unrecognized label: {label}")

# =========================
# 2) prompt 构造
# =========================
def build_prompt(diff, control=None, max_diff_chars=1500):
    diff = "" if diff is None else str(diff)
    base = (
        "Please generate a concise Git commit message (one sentence, imperative mood) "
        "that describes the following code changes:\n\n"
        f"{diff[:max_diff_chars]}\n\n"
        "Commit message:"
    )
    return (control + " " + base) if control else base

# =========================
# 3) 读 CSV -> Dataset -> split
# =========================
df = pd.read_csv(CSV_PATH)

# 可选：简单清洗（避免空值导致报错）
df[DIFF_COL] = df[DIFF_COL].fillna("")
df[MESSAGE_COL] = df[MESSAGE_COL].fillna("")

full_dataset = Dataset.from_pandas(df, preserve_index=False)

# 70 / 15 / 15
first_split = full_dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = first_split["train"]   # ~70%
tmp_dataset = first_split["test"]      # ~30%

second_split = tmp_dataset.train_test_split(test_size=0.5, seed=42)
test_dataset = second_split["train"]   # ~15%
valid_dataset = second_split["test"]   # ~15%

ds_splits = DatasetDict({
    "train": train_dataset,
    "valid": valid_dataset,
    "test":  test_dataset,
})

# =========================
# 4) 生成 input_text / target_text
# =========================
def add_text_fields(example):
    # label 列可选：没有 label 就不加 control token
    ctrl = None
    if LABEL_COL in example and example[LABEL_COL] is not None and str(example[LABEL_COL]).strip() != "" and str(example[LABEL_COL]).lower() != "nan":
        ctrl = label_to_control(example[LABEL_COL])

    return {
        "input_text": build_prompt(example[DIFF_COL], control=ctrl, max_diff_chars=max_diff_chars),
        "target_text": str(example[MESSAGE_COL]),
    }

ds_text = ds_splits.map(add_text_fields)

# =========================
# 5) tokenize（推荐用 text_target）
# =========================
def tokenize_fn(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        max_length=max_source_len,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        text_target=batch["target_text"],
        max_length=max_target_len,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

ds_tok = ds_text.map(
    tokenize_fn,
    batched=True,
    remove_columns=ds_text["train"].column_names,  # 去掉原始列 + input_text/target_text
)

ds_tok

Map:   0%|          | 0/1246 [00:00<?, ? examples/s]

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Map:   0%|          | 0/1246 [00:00<?, ? examples/s]

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1246
    })
    valid: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 268
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 267
    })
})

In [7]:
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="codet5-sft-commit",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,   # 等效 batch=16
    learning_rate=5e-5,
    num_train_epochs=1,              # 先跑 1 epoch 看效果
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    predict_with_generate=False,
    report_to="wandb",               # 你在用 wandb
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_tok['train'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

trainer.save_model("codet5-sft-commit")
tokenizer.save_pretrained("codet5-sft-commit")

  trainer = Seq2SeqTrainer(


Step,Training Loss
50,2.428


('codet5-sft-commit/tokenizer_config.json',
 'codet5-sft-commit/special_tokens_map.json',
 'codet5-sft-commit/vocab.json',
 'codet5-sft-commit/merges.txt',
 'codet5-sft-commit/added_tokens.json',
 'codet5-sft-commit/tokenizer.json')