In [2]:
from pydriller import Repository
import pandas as pd
import re
from dotenv import load_dotenv
import os
from tqdm import tqdm

load_dotenv()

True

In [3]:
lst = []
for commit in Repository('manim').traverse_commits():
    if("bug" in commit.msg.lower()):
        lst.append(commit)

In [4]:
print(len(lst))

309


In [5]:
count = 0
for commit in tqdm(lst):
    count += len(commit.modified_files)
print("Total number of modified files: ", count)
print("Total number of commits containing 'bug': ", len(lst))

100%|██████████| 309/309 [00:18<00:00, 17.14it/s]

Total number of modified files:  1250
Total number of commits containing 'bug':  309





In [6]:
def get_file_diffs(commit):
    diffs = []
    for file in commit.modified_files:
        diffs.append(file.diff)
    return diffs

def parse_files(patch):
    accumulator = []
    for line in patch.splitlines():
        if line.startswith(("index", "diff")):
            continue
        if line.startswith("---"):
            filename_before = line.split(" ", 1)[1][1:]
            continue
        if line.startswith("+++"):
            filename_after = line.split(" ", 1)[1][1:]
            if filename_before == filename_after:
                accumulator.append(f"<ide><path>{filename_before}")
            else:
                accumulator.append(f"<add><path>{filename_after}")
                accumulator.append(f"<del><path>{filename_before}")
            continue
        line = re.sub("@@[^@@]*@@", "", line)
        if not line:
            continue
        if line.startswith("+"):
            line = line.replace("+", "<add>", 1)
        elif line.startswith("-"):
            line = line.replace("-", "<del>", 1)
        else:
            line = f"<ide>{line}"
        accumulator.append(line)
    return "\n".join(accumulator)


In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("mamiksik/CommitPredictorT5")
model = AutoModelForSeq2SeqLM.from_pretrained("mamiksik/CommitPredictorT5")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def get_commit_prediction(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
    # print(inputs)
    outputs = model.generate(**inputs, max_length=50, num_beams=10, early_stopping=True)
    predicted_commit_msg = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_commit_msg

In [11]:
total_files = sum(len(commit.modified_files) for commit in lst)
rows = []
with tqdm(total=total_files) as pbar:
    for commit in lst:
        og_message = commit.msg
        for file in commit.modified_files:
            try:
                diff = parse_files(file.diff)
            except Exception as e:
                pbar.update(1)
                continue
            llm_pred = get_commit_prediction(diff)
            rows.append({
                "hash": commit.hash,
                "date": commit.committer_date,
                "original_message": og_message,
                "file_name": file.filename,
                "prev_source_code": file.content_before,
                "cur_source_code": file.content,
                "diff": diff,
                "llm_commit_inference": llm_pred,
                "rectified_message": ""
            })
            pbar.update(1)
files_df = pd.DataFrame(rows)


100%|██████████| 1250/1250 [1:12:32<00:00,  3.48s/it]


In [12]:
files_df.to_csv("files_df.csv", index=False)

In [14]:
print(len(files_df))

1247


In [15]:
(files_df.loc[0:5, ["original_message", "llm_commit_inference"]])

Unnamed: 0,original_message,llm_commit_inference
0,quick rgb-should-be-numpy-array bug fix,fix mobject.fade_to for numpy arrays
1,"Slightly faster sort_points method, and bug fi...",update mobjects.py
2,Bug fix to bug fix on Mobject.fade method,fix typo in mobjects.py
3,Bug in Arrow buffer,update line.py
4,Small bug in how displayer had been refactored,fix bug in place_on_screen
5,Couple scene bug fixes,update scene.py


In [16]:
commit_df = pd.DataFrame(columns=["Hash", "Message", "Parent Hashes", "Is Merge Commit", "Modified Files"])
df_list = []
for commit in lst:
    df_list.append({
        "Hash": commit.hash,
        "Message": commit.msg,
        "Parent Hashes": [parent for parent in commit.parents],
        "Is Merge Commit": commit.merge,
        "Modified Files": [mod_file.filename for mod_file in commit.modified_files]
    })

commit_df = pd.DataFrame(df_list)
commit_df.to_csv("commit_df.csv", index=False)