In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# =========================
# 0) 配置
# =========================
CSV_PATH = "/root/autodl-tmp/CommitFit/dataset/Ghadhab/dataset.csv"
MODEL_NAME = "/root/autodl-tmp/models/codet5-base"

DIFF_COL = "diffs"     # diff 列
MESSAGE_COL = "msgs"   # commit message 列

max_source_len = 1024
max_target_len = 256
max_diff_chars = 1500

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# =========================
# 1) prompt 构造（无 control）
# =========================
def build_prompt(diff, max_diff_chars=1500):
    diff = "" if diff is None else str(diff)
    return (
        "Please generate a concise Git commit message (imperative mood)"
        "that describes the following code changes:\n\n"
        f"{diff[:max_diff_chars]}\n\n"
        "Commit message:"
    )

# =========================
# 2) 读 CSV -> Dataset -> split
# =========================
df = pd.read_csv(CSV_PATH)

# 简单清洗，避免 tokenizer 报错
df[DIFF_COL] = df[DIFF_COL].fillna("")
df[MESSAGE_COL] = df[MESSAGE_COL].fillna("")

full_dataset = Dataset.from_pandas(df, preserve_index=False)

# 70 / 15 / 15
first_split = full_dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = first_split["train"]
tmp_dataset = first_split["test"]

second_split = tmp_dataset.train_test_split(test_size=0.5, seed=42)
test_dataset = second_split["train"]
valid_dataset = second_split["test"]

ds_splits = DatasetDict({
    "train": train_dataset,
    "valid": valid_dataset,
    "test":  test_dataset,
})

# =========================
# 3) 生成 input_text / target_text
# =========================
def add_text_fields(example):
    return {
        "input_text": build_prompt(example[DIFF_COL], max_diff_chars=max_diff_chars),
        "target_text": str(example[MESSAGE_COL]),
    }

ds_text = ds_splits.map(add_text_fields)

# =========================
# 4) tokenize（CodeT5 推荐方式）
# =========================
def tokenize_fn(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        max_length=max_source_len,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        text_target=batch["target_text"],
        max_length=max_target_len,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

ds_tok = ds_text.map(
    tokenize_fn,
    batched=True,
    remove_columns=ds_text["train"].column_names,
)

ds_tok


Map:   0%|          | 0/1246 [00:00<?, ? examples/s]

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

Map:   0%|          | 0/1246 [00:00<?, ? examples/s]

Map:   0%|          | 0/268 [00:00<?, ? examples/s]

Map:   0%|          | 0/267 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1246
    })
    valid: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 268
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 267
    })
})

In [2]:
import torch
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="codet5-sft-commit",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,   # 等效 batch=16
    learning_rate=5e-5,
    num_train_epochs=10,              # 先跑 1 epoch 看效果
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    predict_with_generate=False,
    report_to="wandb",               # 你在用 wandb
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_tok['train'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

trainer.save_model("codet5-sft-commit")
tokenizer.save_pretrained("codet5-sft-commit")

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtongjiajun1992[0m ([33mtongjiajun1992-china-university-of-mining-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.7651
100,0.4843
150,0.4176
200,0.3734
250,0.3788
300,0.3424
350,0.3349
400,0.3259
450,0.3135
500,0.2936


('codet5-sft-commit/tokenizer_config.json',
 'codet5-sft-commit/special_tokens_map.json',
 'codet5-sft-commit/vocab.json',
 'codet5-sft-commit/merges.txt',
 'codet5-sft-commit/added_tokens.json',
 'codet5-sft-commit/tokenizer.json')

In [3]:
1

1