In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
# =========================
# 0) Configuration
# =========================
import os

# TensorFlow environment switches: oneDNN opts flag set to "0", C++ log level set to "2".
os.environ.update({"TF_ENABLE_ONEDNN_OPTS": "0", "TF_CPP_MIN_LOG_LEVEL": "2"})
CSV_PATH = "/root/autodl-tmp/CommitFit/dataset/Ghadhab/dataset.csv"

# =========================
# 2) Read CSV -> Dataset -> split
#    (this cell only loads the CSV and encodes the labels)
# =========================
label2id = {"Adaptive": 0, "Corrective": 1, "Perfective": 2}
# Load the commit table and swap class names in the "labels" column for their ids;
# values that are not keys of label2id are left untouched by replace().
df = pd.read_csv(CSV_PATH).replace({"labels": label2id})
df

Unnamed: 0,user,repo,commit,labels,msgs,diffs,feature
0,ponsonio,RxJava,0531b8bff5c14d9504beefb4ad47f473e3a22932,2,Change hasException to hasThrowable--,diff --git a/rxjava-core/src/main/java/rx/Noti...,"[1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,ponsonio,RxJava,0950c46beda335819928585f1262dfe1dca78a0b,0,Trying to extend the Scheduler interface accor...,diff --git a/rxjava-core/src/main/java/rx/Sche...,"[2, 44, 0, 0, 30, 0, 0, 1, 18, 0, 0, 0, 0, 0, ..."
2,ponsonio,RxJava,0f92fdd8e6422d5b79c610a7fd8409d222315a49,0,RunAsync method for outputting multiple values--,diff --git a/rxjava-contrib/rxjava-async-util/...,"[2, 53, 0, 0, 42, 0, 0, 1, 45, 1, 0, 0, 0, 0, ..."
3,ponsonio,RxJava,100f571c9a2835d5a30a55374b9be74c147e031f,1,forEach with Action1 but not Observer--I re-re...,diff --git a/language-adaptors/rxjava-groovy/s...,"[1, 5, 122, 9, 10, 9, 4, 1, 5, 18, 2, 0, 0, 0,..."
4,ponsonio,RxJava,191f023cf5253ea90647bc091dcaf55ccdce81cc,1,1.x: Fix Completable swallows- OnErrorNotImple...,diff --git a/src/main/java/rx/Completable.java...,"[1, 1, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 0, 0, 0,..."
...,...,...,...,...,...,...,...
1776,jenkinsci,clearcase-plugin,51e9da224f80254476a7dc446bca817b505381d8,2,Use a temporary file to decrease memory consum...,diff --git a/src/main/java/hudson/plugins/clea...,"[2, 12, 0, 4, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,..."
1777,jexp,batch-import,609d6c4b1eea2c33d9fb950fcbb9ba9dc1f80fc3,2,added a more memory efficient structure for st...,diff --git a/src/main/java/org/neo4j/batchimpo...,"[10, 159, 29, 35, 9, 2, 1, 5, 106, 0, 4, 8, 0,..."
1778,hdiv,hdiv,19b650c78a1c76f4fd90274d7f163f863c0d39e4,2,Memory and performance optimizations,diff --git a/hdiv-config/src/main/java/org/hdi...,"[31, 302, 131, 140, 170, 89, 53, 7, 88, 14, 17..."
1779,casidiablo,persistence,d7bf95159df37a3d338ca267dddd3d26b38ec37c,2,Now it is possible to specify the sqlite open ...,diff --git a/pom.xml b/pom.xml\nindex 394263b....,"[5, 57, 20, 9, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [2]:
import re

def compress_diff_minimal(
    diff_text: str,
    max_changed_lines: int = 120,   # maximum number of +/- lines to keep
    max_chars: int = 3500,          # hard character cap applied to the joined result
) -> str:
    """Shrink a git diff down to file headers, hunk headers and changed lines.

    Kept lines:
      * ``diff --git ...`` file headers,
      * ``@@ ... @@`` hunk headers,
      * added / removed lines (leading ``+`` or ``-``).

    Dropped lines: git metadata (``index``, mode, similarity and rename lines),
    the ``--- `` / ``+++ `` file-name lines, any line containing
    ``GIT binary patch``, and all unchanged context lines.

    Args:
        diff_text: Raw diff text. Anything that is not a non-empty ``str``
            (``None``, ``""``, or the float NaN pandas produces for an empty
            CSV cell) yields ``""``.
        max_changed_lines: Scanning stops once this many changed lines have
            been kept; a truncation marker line is appended at that point.
        max_chars: If the joined result is longer than this, it is cut to
            ``max_chars`` characters and a truncation marker line is appended
            (so the returned text can exceed ``max_chars`` by the marker).

    Returns:
        The compacted diff as a single newline-joined string.
    """
    # NaN is truthy, so a bare `not diff_text` would let it through and the
    # `.splitlines()` call below would raise AttributeError.
    if not isinstance(diff_text, str) or not diff_text:
        return ""

    lines = diff_text.splitlines()
    kept = []
    changed_cnt = 0

    for ln in lines:
        # File header
        if ln.startswith("diff --git "):
            kept.append(ln)
            continue

        # Skip metadata noise
        if ln.startswith(("index ", "new file mode", "deleted file mode", "similarity index", "rename from", "rename to")):
            continue
        # These two lines are usually long and only repeat the file name;
        # they are skipped to keep the output minimal.
        if ln.startswith(("--- ", "+++ ")):
            continue
        if "GIT binary patch" in ln:
            continue

        # Hunk header
        if ln.startswith("@@"):
            kept.append(ln)
            continue

        # Keep only changed lines (excluding +++ / ---).
        # NOTE: this also drops a changed line whose own content starts with
        # "++" or "--" (e.g. an added "++i;" appears in the diff as "+++i;").
        if (ln.startswith("+") and not ln.startswith("+++")) or (ln.startswith("-") and not ln.startswith("---")):
            kept.append(ln)
            changed_cnt += 1
            if changed_cnt >= max_changed_lines:
                kept.append("... (diff truncated: too many changed lines)")
                break
            continue

        # Remaining context lines are intentionally not kept (minimal version).
        # To keep a little context, retain 1-2 lines around each changed line here.

    out = "\n".join(kept).strip()
    if len(out) > max_chars:
        out = out[:max_chars] + "\n... (diff truncated: max_chars)"
    return out


def build_prompt(diff_compact: str) -> str:
    """Wrap a compacted diff in the commit-message generation prompt.

    The instruction sentence is followed directly by a ``<DIFF>`` ...
    ``</DIFF>`` section holding ``diff_compact`` and a trailing
    ``Commit message:`` cue.

    Args:
        diff_compact: Diff text to embed between the ``<DIFF>`` tags.

    Returns:
        The complete prompt string.
    """
    pieces = [
        "Please write a concise commit message that summarizes the following code changes:",
        "<DIFF>\n",
        f"{diff_compact}\n",
        "</DIFF>\n\n",
        "Commit message:",
    ]
    return "".join(pieces)



In [3]:
# Compact every raw diff once, then derive the prompt column from that same series.
compact_diffs = df["diffs"].map(compress_diff_minimal)
df["diff_compact"] = compact_diffs
df["gen_prompt"] = compact_diffs.map(build_prompt)   # prompt variant without the label

In [4]:
# Rename the commit message and prompt columns to target_text / source_text.
column_aliases = {"msgs": "target_text", "gen_prompt": "source_text"}
df = df.rename(columns=column_aliases)
df

Unnamed: 0,user,repo,commit,labels,target_text,diffs,feature,diff_compact,source_text
0,ponsonio,RxJava,0531b8bff5c14d9504beefb4ad47f473e3a22932,2,Change hasException to hasThrowable--,diff --git a/rxjava-core/src/main/java/rx/Noti...,"[1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",diff --git a/rxjava-core/src/main/java/rx/Noti...,Please write a concise commit message that sum...
1,ponsonio,RxJava,0950c46beda335819928585f1262dfe1dca78a0b,0,Trying to extend the Scheduler interface accor...,diff --git a/rxjava-core/src/main/java/rx/Sche...,"[2, 44, 0, 0, 30, 0, 0, 1, 18, 0, 0, 0, 0, 0, ...",diff --git a/rxjava-core/src/main/java/rx/Sche...,Please write a concise commit message that sum...
2,ponsonio,RxJava,0f92fdd8e6422d5b79c610a7fd8409d222315a49,0,RunAsync method for outputting multiple values--,diff --git a/rxjava-contrib/rxjava-async-util/...,"[2, 53, 0, 0, 42, 0, 0, 1, 45, 1, 0, 0, 0, 0, ...",diff --git a/rxjava-contrib/rxjava-async-util/...,Please write a concise commit message that sum...
3,ponsonio,RxJava,100f571c9a2835d5a30a55374b9be74c147e031f,1,forEach with Action1 but not Observer--I re-re...,diff --git a/language-adaptors/rxjava-groovy/s...,"[1, 5, 122, 9, 10, 9, 4, 1, 5, 18, 2, 0, 0, 0,...",diff --git a/language-adaptors/rxjava-groovy/s...,Please write a concise commit message that sum...
4,ponsonio,RxJava,191f023cf5253ea90647bc091dcaf55ccdce81cc,1,1.x: Fix Completable swallows- OnErrorNotImple...,diff --git a/src/main/java/rx/Completable.java...,"[1, 1, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 0, 0, 0,...",diff --git a/src/main/java/rx/Completable.java...,Please write a concise commit message that sum...
...,...,...,...,...,...,...,...,...,...
1776,jenkinsci,clearcase-plugin,51e9da224f80254476a7dc446bca817b505381d8,2,Use a temporary file to decrease memory consum...,diff --git a/src/main/java/hudson/plugins/clea...,"[2, 12, 0, 4, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...",diff --git a/src/main/java/hudson/plugins/clea...,Please write a concise commit message that sum...
1777,jexp,batch-import,609d6c4b1eea2c33d9fb950fcbb9ba9dc1f80fc3,2,added a more memory efficient structure for st...,diff --git a/src/main/java/org/neo4j/batchimpo...,"[10, 159, 29, 35, 9, 2, 1, 5, 106, 0, 4, 8, 0,...",diff --git a/src/main/java/org/neo4j/batchimpo...,Please write a concise commit message that sum...
1778,hdiv,hdiv,19b650c78a1c76f4fd90274d7f163f863c0d39e4,2,Memory and performance optimizations,diff --git a/hdiv-config/src/main/java/org/hdi...,"[31, 302, 131, 140, 170, 89, 53, 7, 88, 14, 17...",diff --git a/hdiv-config/src/main/java/org/hdi...,Please write a concise commit message that sum...
1779,casidiablo,persistence,d7bf95159df37a3d338ca267dddd3d26b38ec37c,2,Now it is possible to specify the sqlite open ...,diff --git a/pom.xml b/pom.xml\nindex 394263b....,"[5, 57, 20, 9, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","diff --git a/pom.xml b/pom.xml\n@@ -23,5 +23,5...",Please write a concise commit message that sum...


In [5]:
# Two-stage split: hold out 30% of df, then halve that hold-out into val and test
# (roughly 70% / 15% / 15% overall). Both calls use random_state=42.
# NOTE(review): presumably this must match the split used when the checkpoint was
# trained, otherwise test rows may have been seen in training — confirm.
train, temp_df = train_test_split(df, test_size=0.3, random_state=42)
val, test = train_test_split(temp_df, test_size=0.5, random_state=42)

In [6]:
# SimpleT5_TRL is imported from the simplet5_trl module, which is not part of this notebook.
from simplet5_trl import SimpleT5_TRL
# pandas is already imported as pd in the first cell; this re-import is redundant.
import pandas as pd
# Instantiate the model wrapper with its default arguments.
model = SimpleT5_TRL()

  if not hasattr(np, "object"):


In [7]:
# Empty prediction list; the prediction loop cell resets it again before filling it.
preds = []

In [8]:
# Preview the prompt column of the test split.
test['source_text']

342     Please write a concise commit message that sum...
124     Please write a concise commit message that sum...
1061    Please write a concise commit message that sum...
1543    Please write a concise commit message that sum...
398     Please write a concise commit message that sum...
                              ...                        
453     Please write a concise commit message that sum...
363     Please write a concise commit message that sum...
1316    Please write a concise commit message that sum...
479     Please write a concise commit message that sum...
534     Please write a concise commit message that sum...
Name: source_text, Length: 268, dtype: object

In [9]:
# Load the checkpoint stored under the relative path "outputs/checkpoint-780",
# requesting GPU use (presumably a fine-tuned checkpoint — TODO confirm).
model.load_model("outputs/checkpoint-780",use_gpu=True)

In [10]:
# Generate one prediction per test prompt, in the row order of test["source_text"].
preds = []
for item in test["source_text"]:
    # predict() output is indexed with [0]; presumably it returns a list of
    # candidate strings and only the first is kept — TODO confirm.
    res = model.predict(item)[0]
    print(res)
    preds.append(res)
# NOTE(review): the recorded run warned that a prompt tokenized to 1149 tokens
# against a 512-token maximum; confirm whether predict() truncates its input.

Token indices sequence length is longer than the specified maximum sequence length for this model (1149 > 512). Running this sequence through the model will result in indexing errors


YARN-1185. Fixed FileSystemRMStateStore to not leave partial files that prevent subsequent ResourceManager- recovery. Contributed by Omkar Vinit Joshi. svn merge --ignore-ancestry -c 158344../../trunk/--git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1516344 13f79535-47bb-0310-9956-ffa450edef68-
HBASE-10883 reverseDNS can't handle- DNS error in case of a NPE--git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@724929 13f79535-47bb-0310-9956-ffa450edef68-
libvaladoc: Add support for empty content

ARQGRA-470: Selenium's network traffic capture functionality has been removed--git-svn-id: https://svn.jboss.org/repos/asf/labs/labs/jbossrules/trunk@106263 c60d74c8-e8f6-0310-9e8f-d4a2fc68ab70-
HDFS-4347 Add support for user groups in web app--git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23@1209515 13f79535-47bb-0310-9956-ffa450edef68-
vapigen: Add support for type_arguments

YARN-3100. Made YARN authorization pluggable. (Jian 

In [11]:
# Store the predictions as a new "pred" column; preds was filled in the row order
# of test["source_text"], so the values line up with the rows positionally.
test["pred"] = preds

In [12]:
# Write the test split (with its "pred" column) to CSV, omitting the index column.
test.to_csv("/root/autodl-tmp/CommitFit/dataset/generated_commits.csv", index=False)