Importing Libraries

In [2]:
from pydriller import Repository
import csv
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt

Setting the Repository URL

In [33]:
# URL of the Git repository whose commit history is mined in the cells below.
repo_url = 'https://github.com/pythonprofilers/memory_profiler.git'

# PyDriller handle on that repository; later cells iterate it with
# repo.traverse_commits().
# NOTE(review): traverse_commits() is called in two separate cells, and for a
# remote URL each call presumably re-fetches the repository — confirm, and
# consider pointing this at a local clone.
repo = Repository(repo_url)

Mining Bug-Fixing Commits

In [14]:
# Terms whose presence in a commit message marks the commit as a bug fix.
keywords = [
    'fix', 'fixes', 'fixed', 'bug', 'bugfix', 'defect', 'defects',
    'error', 'issue', 'patch', 'repair', 'fault', 'resolved', 'resolve',
]


def is_a_bug_fixing_commit(msg):
    """Tell whether a commit message mentions any bug-fix keyword.

    The comparison is case-insensitive and substring based, so a keyword
    embedded in a longer word also counts (e.g. "prefix" contains "fix").

    Args:
        msg: The commit message, or None.

    Returns:
        True if at least one entry of ``keywords`` occurs in the lower-cased
        message; False otherwise, including for a None or empty message.
    """
    if msg is None or msg == "":
        return False
    lowered = msg.lower()
    return any(term in lowered for term in keywords)

# Write one CSV row per bug-fixing commit: hash, flattened message, parent
# hashes, merge flag, and the paths of the files it touched.
with open("bug_fixing_commits.csv", mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Hash", "Message", "Hashes of Parents",
                         "Is a merge commit", "List of modified files"])

    for commit in repo.traverse_commits():
        # Skip everything that does not look like a bug fix.
        if not is_a_bug_fixing_commit(commit.msg.lower()):
            continue

        parent_hashes = commit.parents
        # A commit with more than one parent is a merge commit.
        merge_flag = "Yes" if len(parent_hashes) > 1 else "No"

        # Only files that still have a path after the commit are listed.
        touched_paths = [
            changed.new_path
            for changed in commit.modified_files
            if changed.new_path
        ]

        flat_message = commit.msg.strip().replace("\n", " ")
        csv_writer.writerow([commit.hash, flat_message,
                             parent_hashes, merge_flag, touched_paths])

# Narrow display settings so the wide commit table prints compactly.
for option_name, option_value in (("display.width", 200),
                                  ("display.max_colwidth", 20)):
    pd.set_option(option_name, option_value)

# read_csv already returns a DataFrame, so it is printed directly.
df = pd.read_csv("bug_fixing_commits.csv")
print(df)

                    Hash              Message    Hashes of Parents Is a merge commit List of modified files
0    8800fe9252802753...  Port to OSX and ...  ['76745998505624...                No  ['examples/plot_...  
1    c953ca4a9df3a332...  FIX: display las...  ['8af133fea44642...                No  ['memory_profile...  
2    ec9ee7032b01cf93...  Fix typos in REA...  ['992bb0a8c911e1...                No  ['README.rst', '...  
3    4214cfc13d033c43...  FIX: README.rst ...  ['ea5d79bd10b52b...                No  ['MANIFEST', 'me...  
4    303cab8df4b2274c...  Explicit error f...  ['7a9ec119e19086...                No  ['memory_profile...  
..                   ...                  ...                  ...               ...                  ...  
196  dd9ed705449638f4...  Merge pull reque...  ['4b81aee7064ba4...               Yes                   []  
197  940aa76f5da8b7cb...  Fix deprecated s...  ['57b643a84e1b7f...                No  ['test/test_exit...  
198  e079d3fa35188908...  Fi

Part e: Extracting Per-File Diffs of Bug-Fixing Commits

In [16]:
# Write one CSV row per (bug-fixing commit, modified file) pair, with the file
# contents before/after the commit and the diff. The two trailing columns are
# left empty here and filled in by later cells.
with open("diff_extraction.csv", mode='w', newline='', encoding='utf-8') as file2:
    write = csv.writer(file2)
    write.writerow([
        "Hash", "Message", "Filename",
        "Source Code (before)", "Source Code (current)",
        "Diff", "LLM Inference (fix type)", "Rectified Message"
    ])

    for commit in repo.traverse_commits():
        if not is_a_bug_fixing_commit(commit.msg):
            continue

        # Flatten the message once per commit instead of once per file.
        message = commit.msg.strip().replace("\n", " ")

        for mod in commit.modified_files:
            write.writerow([
                commit.hash,
                message,
                # new_path is empty when the file was deleted by this commit
                # (per PyDriller's ModifiedFile); fall back to old_path so the
                # Filename column is not left blank for deletions.
                mod.new_path or mod.old_path,
                # Missing contents/diff are stored as empty strings.
                mod.source_code_before or "",
                mod.source_code or "",
                mod.diff or "",
                "",
                "",
            ])

# Widen the display so the per-file diff table fits on one line per row.
for option_name, option_value in (("display.width", 320),
                                  ("display.max_colwidth", 40)):
    pd.set_option(option_name, option_value)

# read_csv already returns a DataFrame, so it is printed directly.
df = pd.read_csv("diff_extraction.csv")
print(df)


                                        Hash                                  Message                           Filename                     Source Code (before)                    Source Code (current)                                     Diff  LLM Inference (fix type)  Rectified Message
0    8800fe9252802753643a0e48de52c9d8d12f...              Port to OSX and other fixes            examples/plot_memory.py  """\nPlot memory usage of a numeric ...  """\nPlot memory usage of a numeric ...  @@ -1,38 +1,18 @@\n """\n Plot memor...                       NaN                NaN
1    8800fe9252802753643a0e48de52c9d8d12f...              Port to OSX and other fixes                 memory_profiler.py  """Get process information"""\n\n__v...  """Get process information"""\n\n__v...  @@ -6,17 +6,18 @@ _CMD_USAGE = "pyth...                       NaN                NaN
2    c953ca4a9df3a3321166803752608b70e2f2...              FIX: display last statement                 memory_profiler.py  """Get proc

Loading the LLM

In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and model are loaded from the same checkpoint name.
checkpoint_name = "mamiksik/CommitPredictorT5"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

  from .autonotebook import tqdm as notebook_tqdm


Using the LLM on the Diff

In [None]:
# Fill the 'LLM Inference (fix type)' column of diff_extraction.csv with the
# model's prediction for each row's diff, then write the table back to disk.
df = pd.read_csv("diff_extraction.csv")

# When the inference column is still empty it is read back as all-NaN floats;
# cast it to object so text predictions can be stored in it.
df['LLM Inference (fix type)'] = df['LLM Inference (fix type)'].astype(object)

for i, row in df.iterrows():
    # Empty CSV cells are read back as NaN, and str(NaN) is the non-empty text
    # "nan". Test for missing values *before* converting, so a row without a
    # diff is never sent to the model as the literal input "nan".
    raw_diff = row['Diff']
    diff_text = "" if pd.isna(raw_diff) else str(raw_diff).strip()

    raw_cur = row['LLM Inference (fix type)']
    cur = "" if pd.isna(raw_cur) else str(raw_cur).strip()

    # Keep predictions from a previous run so the cell can be resumed.
    if cur != "" and cur.lower() != "nan":
        continue

    if diff_text == "":
        # Nothing to infer from; store the blank placeholder.
        df.at[i, 'LLM Inference (fix type)'] = " "

    else:
        # The tokenizer truncates the diff to at most 512 tokens.
        inputs = tokenizer(diff_text, return_tensors="pt", truncation=True, max_length=512)
        outputs = model.generate(**inputs)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        df.at[i, 'LLM Inference (fix type)'] = prediction

df.to_csv("diff_extraction.csv", index=False)

# Widen the display so the table with predictions fits on one line per row.
for option_name, option_value in (("display.width", 350),
                                  ("display.max_colwidth", 40)):
    pd.set_option(option_name, option_value)

# read_csv already returns a DataFrame, so it is printed directly.
df = pd.read_csv("diff_extraction.csv")
print(df)


                                        Hash                                  Message                           Filename                     Source Code (before)                    Source Code (current)                                     Diff                LLM Inference (fix type)                       Rectified Message
0    8800fe9252802753643a0e48de52c9d8d12f...              Port to OSX and other fixes            examples/plot_memory.py  """\nPlot memory usage of a numeric ...  """\nPlot memory usage of a numeric ...  @@ -1,38 +1,18 @@\n """\n Plot memor...                   add memory usage plot                   add memory usage plot
1    8800fe9252802753643a0e48de52c9d8d12f...              Port to OSX and other fixes                 memory_profiler.py  """Get process information"""\n\n__v...  """Get process information"""\n\n__v...  @@ -6,17 +6,18 @@ _CMD_USAGE = "pyth...               add docs for memory_usage               add docs for memory_usage
2    c953ca4a9df3a3321166803

In [5]:
# Show the column labels of df.
# NOTE(review): the recorded output lists columns such as 'Semantic_Similarity'
# and 'Rectified' that no visible cell creates — presumably left over from a
# cell that is not shown or from earlier kernel state; confirm with
# Restart Kernel -> Run All.
df.columns

Index(['Hash', 'Message', 'Filename', 'Source Code (before)',
       'Source Code (current)', 'Diff', 'LLM Inference (fix type)',
       'Rectified Message', 'Semantic_Similarity', 'Token_Similarity',
       'Semantic_class', 'Token_class', 'Rectified'],
      dtype='object')

In [12]:
# pandas is already imported in the first cell; this repeats the import.
import pandas as pd

# Reload diff_extraction.csv from disk so the rectifier works on the saved table.
df = pd.read_csv("diff_extraction.csv")

def _clean_cell(value):
    """Return a table cell as stripped text, mapping missing values to ""."""
    # Empty CSV cells are read back as NaN, and str(NaN) is the truthy text
    # "nan", which would otherwise leak into the generated messages.
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return ""
    text = str(value).strip()
    return "" if text.lower() == "nan" else text


def rectifier(row):
    """
    Rectifier: Produces a file-specific, contextualized message.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with the keys "Message",
            "LLM Inference (fix type)", "Filename" and "Diff".

    Returns:
        A message string chosen by the first matching rule:
        1. Python file with "test" in its path -> generic test-fix message.
        2. Other ".py" / ".java" file -> language prefix plus the LLM message,
           or the developer message when the LLM message is missing.
        3. Diff mentions exceptions, imports, or typo/print -> themed message.
        4. Developer message mentions "fix" or "bug" -> that message.
        5. LLM message present -> that message.
        6. Otherwise a generic fallback.
        Missing (NaN/empty) cells are treated as empty text, and a missing
        filename is rendered as "unknown file".
    """
    original_msg = _clean_cell(row["Message"])
    llm_msg = _clean_cell(row["LLM Inference (fix type)"])
    filename = _clean_cell(row["Filename"]) or "unknown file"
    diff = _clean_cell(row["Diff"]).lower()

    # File-type rules take precedence over anything found in the diff.
    if filename.endswith(".py") and "test" in filename.lower():
        return f"Fix test in {filename}"
    if filename.endswith(".py"):
        return f"Fix in Python file {filename}: {llm_msg or original_msg}"
    if filename.endswith(".java"):
        return f"Bug fix in Java file {filename}: {llm_msg or original_msg}"

    # Keyword rules on the lower-cased diff text.
    if "exception" in diff or "try:" in diff:
        return f"Fix exception handling in {filename}"
    if "import" in diff:
        return f"Fix import/module issue in {filename}"
    if "typo" in diff or "print" in diff:
        return f"Fix typo/log in {filename}"

    lowered_msg = original_msg.lower()
    if "fix" in lowered_msg or "bug" in lowered_msg:
        return f"{original_msg} ({filename})"

    if llm_msg:
        return f"{llm_msg} ({filename})"

    return f"Rectified commit for {filename}"

# Compute the rectified message for every row, store it, and save the table.
rectified_messages = df.apply(rectifier, axis=1)
df["Rectified Message"] = rectified_messages
df.to_csv("diff_extraction.csv", index=False)


In [13]:
# Preview the first 10 rows of df.
# NOTE(review): the recorded output contains an 'Unnamed: 0' column and
# similarity/class columns that no visible cell creates — presumably produced
# by cells not shown here; confirm with Restart Kernel -> Run All.
df.head(10)

Unnamed: 0,Hash,Message,Filename,Source Code (before),Source Code (current),Diff,LLM Inference (fix type),Rectified Message,Semantic_Similarity,Token_Similarity,Semantic_class,Token_class
0,8800fe9252802753643a0e48de52c9d8d12f52f6,Port to OSX and other fixes,examples/plot_memory.py,"""""""\nPlot memory usage of a numeric computatio...","""""""\nPlot memory usage of a numeric computatio...","@@ -1,38 +1,18 @@\n """"""\n Plot memory usage of...",add memory usage plot,Fix in Python file examples/plot_memory.py: ad...,0.961138,0.104053,Minor,Major
1,8800fe9252802753643a0e48de52c9d8d12f52f6,Port to OSX and other fixes,memory_profiler.py,"""""""Get process information""""""\n\n__version__ =...","""""""Get process information""""""\n\n__version__ =...","@@ -6,17 +6,18 @@ _CMD_USAGE = ""python -m memo...",add docs for memory_usage,Fix in Python file memory_profiler.py: add doc...,0.999291,0.943228,Minor,Minor
2,c953ca4a9df3a3321166803752608b70e2f2632a,FIX: display last statement,memory_profiler.py,"""""""Get process information""""""\n\n__version__ =...","""""""Get process information""""""\n\n__version__ =...","@@ -224,7 +224,7 @@ def show_results(prof, str...",show memory usage in profiling results,Fix in Python file memory_profiler.py: show me...,1.0,0.997634,Minor,Minor
3,ec9ee7032b01cf9350628a57f9c5111a48093366,Fix typos in README and make a new release,README.rst,Memory Profiler\n---------------\nThis is a py...,Memory Profiler\n---------------\nThis is a py...,"@@ -9,7 +9,7 @@ Installation\n ------------\n ...",add missing documentation,Fix typos in README and make a new release (RE...,0.999989,0.975265,Minor,Minor
4,ec9ee7032b01cf9350628a57f9c5111a48093366,Fix typos in README and make a new release,memory_profiler.py,"""""""Get process information""""""\n\n__version__ =...","""""""Get process information""""""\n\n__version__ =...","@@ -1,6 +1,6 @@\n """"""Get process information""""...",add missing line,Fix in Python file memory_profiler.py: add mis...,0.999998,0.998307,Minor,Minor
5,4214cfc13d033c43044eb4c9293258609fc6934f,FIX: README.rst was not included in source dis...,MANIFEST,,memory_profiler.py\nsetup.py\nREADME.rst\n,"@@ -0,0 +1,3 @@\n+memory_profiler.py\n+setup.p...",add readme to generate_test.py,FIX: README.rst was not included in source dis...,0.804435,0.0,Minor,Major
6,4214cfc13d033c43044eb4c9293258609fc6934f,FIX: README.rst was not included in source dis...,memory_profiler.py,"""""""Get process information""""""\n\n__version__ =...","""""""Get process information""""""\n\n__version__ =...","@@ -1,6 +1,6 @@\n """"""Get process information""""...",update process info,Fix in Python file memory_profiler.py: update ...,0.999998,0.998348,Minor,Minor
7,303cab8df4b2274c5086cde80fc53ab49e5e80f4,Explicit error for non-Unix,memory_profiler.py,"""""""Get process information""""""\n\n__version__ =...","""""""Get process information""""""\n\n__version__ =...","@@ -25,7 +25,7 @@ if os.name == 'posix':\n els...",add missing exception,Fix in Python file memory_profiler.py: add mis...,0.999907,0.992705,Minor,Minor
8,ffe1fd28d66fca10b0f4aa0fd412e2ef496a0fbc,FIX: line display,memory_profiler.py,"""""""Get process information""""""\n\n__version__ =...","""""""Get process information""""""\n\n__version__ =...","@@ -174,8 +174,8 @@ class LineProfiler:\n ...",add more info to the profiler,Fix in Python file memory_profiler.py: add mor...,1.0,0.952307,Minor,Minor
9,da9817994b29e5f7a0c204236a401908d1318c70,Fixed windows support,memory_profiler.py,"""""""Get process information""""""\n\n__version__ =...","""""""Get process information""""""\n\n__version__ =...","@@ -12,7 +12,7 @@ try:\n \n def _get_m...",fix get_memory() error,Fix in Python file memory_profiler.py: fix get...,0.999759,0.990441,Minor,Minor


In [15]:
# Messages that carry no information on their own.
trivial = {
    "fix", "update", "typo", "bug",
    "minor change", "change",
}


def is_precise(msg):
    """Heuristically decide whether a message is specific enough.

    The message is stringified, stripped and lower-cased. It counts as
    precise when it has between 10 and 200 characters (inclusive) and is not
    one of the ``trivial`` phrases.

    Args:
        msg: The message; any value is accepted and converted with ``str``.

    Returns:
        True when the message passes all three checks, else False.
    """
    text = str(msg).strip().lower()
    too_short = len(text) < 10
    too_long = len(text) > 200
    return not (too_short or too_long or text in trivial)

# Flag, per row, whether each of the three message variants is precise.
precision_flags = {
    'Dev_Precise': 'Message',
    'LLM_Precise': 'LLM Inference (fix type)',
    'Rectified_Precise': 'Rectified Message',
}
for flag_column, text_column in precision_flags.items():
    df[flag_column] = df[text_column].apply(is_precise)

# Hit rate = share of rows flagged precise; len(df) counts table rows.
total_commits = len(df)
dev_hit_rate = df['Dev_Precise'].sum() / total_commits
llm_hit_rate = df['LLM_Precise'].sum() / total_commits
rectified_hit_rate = df['Rectified_Precise'].sum() / total_commits

print("Hit Rates (stricter evaluation):")
for rq_label, rate in (("RQ1 (Developer)", dev_hit_rate),
                       ("RQ2 (LLM)", llm_hit_rate),
                       ("RQ3 (Rectifier)", rectified_hit_rate)):
    print(f"{rq_label}: {rate:.2%}")


Hit Rates (stricter evaluation):
RQ1 (Developer): 89.53%
RQ2 (LLM): 99.42%
RQ3 (Rectifier): 99.42%


In [16]:
# Summarise the three hit rates as percentages in a small table.
rq_labels = ['RQ1 (Developer)', 'RQ2 (LLM)', 'RQ3 (Rectifier)']
rq_rates = [dev_hit_rate, llm_hit_rate, rectified_hit_rate]

hit_table = pd.DataFrame({
    'RQ': rq_labels,
    'Hit Rate (%)': [rate * 100 for rate in rq_rates],
})

print(hit_table)

                RQ  Hit Rate (%)
0  RQ1 (Developer)     89.534884
1        RQ2 (LLM)     99.418605
2  RQ3 (Rectifier)     99.418605
