In [1]:
import pandas as pd
import glob
import numpy as np

In [2]:
def measure_result_classification(df, type_check_failed=False, typecheck=True, sample_weight=1, beta=1):
    """Compute precision, recall and F-beta of bug-localization predictions.

    A row is a ground-truth positive when ``gt_error_loc > 0`` and a predicted
    positive when ``loc_pred > 0``. A prediction only counts as a true positive
    when ``loc_pred`` equals ``gt_error_loc``; every other predicted positive
    (including a buggy row flagged at the wrong location) is a false positive.

    Args:
        df: DataFrame with ``loc_pred`` and ``gt_error_loc`` columns, plus a
            ``type_check_failed`` column when ``typecheck`` is true. It is not
            modified.
        type_check_failed: Value of ``type_check_failed`` a row must have to be
            kept when ``typecheck`` is true.
        typecheck: Whether to filter the rows by ``type_check_failed`` first.
        sample_weight: Weight given to ground-truth positive rows; all other
            rows weigh 1.
        beta: Beta of the F-beta score.

    Returns:
        Tuple ``(precision, recall, fb)``. A metric whose denominator is zero
        is NaN.
    """
    # Keep only the rows whose type-check outcome matches the requested one.
    if typecheck:
        df = df[df['type_check_failed'] == type_check_failed]

    loc_pred = df["loc_pred"]
    gt_error_loc = df["gt_error_loc"]
    is_buggy = gt_error_loc > 0
    # Buggy rows weigh `sample_weight`, all remaining rows weigh 1.
    weights = sample_weight * is_buggy + (~is_buggy)

    def weighted_count(mask):
        return (mask * weights).sum()

    def safe_ratio(numerator, denominator):
        # An explicit NaN replaces the implicit numpy 0/0, which yields the
        # same value but also emits a RuntimeWarning.
        return numerator / denominator if denominator != 0 else float("nan")

    tp = weighted_count((loc_pred == gt_error_loc) & is_buggy)
    tn = weighted_count((loc_pred == 0) & (~is_buggy))
    # Every predicted positive that is not an exact hit is a false positive.
    fp = weighted_count(loc_pred > 0) - tp
    fn = weighted_count((loc_pred == 0) & is_buggy)

    print(f"tp: {tp}, tn: {tn}, fp: {fp}, fn: {fn}")

    precision = safe_ratio(tp, tp + fp)
    recall = safe_ratio(tp, tp + fn)
    fb = safe_ratio((1 + beta ** 2) * precision * recall, beta ** 2 * precision + recall)

    return precision, recall, fb


In [3]:
def reset_sample_ids(df, end_idx=1291):
    """Renumber the ``sample_id`` of bug-free rows, starting at ``end_idx``.

    Rows with ``gt_error_loc > 0`` are kept unchanged. Rows with
    ``gt_error_loc == 0`` get consecutive ids ``end_idx, end_idx + 1, ...`` in
    their original order and are appended after the buggy rows with a fresh
    0-based index. Rows matching neither condition are not part of the result.
    A frame without a ``sample_id`` column is returned as-is.
    """
    if "sample_id" not in df.columns:
        return df

    gt_error_loc = df["gt_error_loc"]
    buggy_rows = df[gt_error_loc > 0]
    # Overwriting an existing key keeps its position, so the column order of
    # the rebuilt frame matches the input.
    renumbered = [
        {**row, "sample_id": offset + end_idx}
        for offset, row in enumerate(df[gt_error_loc == 0].to_dict("records"))
    ]

    return pd.concat([buggy_rows, pd.DataFrame(renumbered)])

def precess_df(folder, filename, leak_max_idx=None, perform_test_leak=True, reset_sample_id=False):
    """Load one prediction file and join ground truth, leak and type-check data.

    Files read:
        * ``{folder}/{filename}``: predictions; must contain ``gt_error_loc``.
        * ``{folder}/ground_truth.csv``: replaces the ``gt_error_loc`` column
          of the prediction file.
        * ``{folder}/test_leak.csv``: only when ``perform_test_leak`` is true;
          its first column is used as the index.
        * ``{folder}/typechecking/*.csv``: concatenated in sorted path order.

    Args:
        folder: Directory holding the files listed above.
        filename: Name of the prediction CSV inside ``folder``.
        leak_max_idx: When truthy, only the first ``leak_max_idx`` rows of the
            leak file are used; it is also passed to ``reset_sample_ids`` as
            ``end_idx`` when ``reset_sample_id`` is true.
        perform_test_leak: Whether to drop rows whose ``leaked`` value is true.
        reset_sample_id: Whether to renumber ids with ``reset_sample_ids``
            before indexing.

    Returns:
        DataFrame indexed by ``sample_id`` / ``sample_ids`` when such a column
        exists, with all missing values replaced by ``False``.
    """
    # glob yields paths in arbitrary, filesystem-dependent order. The frames
    # are concatenated and re-indexed by position below, so that order decides
    # which type-check row is joined to which sample; sort for reproducibility.
    # NOTE(review): assumes lexicographic path order is the intended chunk
    # order - confirm against how the type-check files were written.
    type_files = sorted(glob.glob(f"{folder}/typechecking/*.csv"))

    type_check_df = pd.concat([pd.read_csv(file) for file in type_files])
    # Fresh 0..n-1 index for the join; the per-file index is kept as an
    # "index" column.
    type_check_df = type_check_df.reset_index()

    if perform_test_leak:
        test_leak = pd.read_csv(f"{folder}/test_leak.csv", index_col=0)
        if leak_max_idx:
            test_leak = test_leak.iloc[:leak_max_idx]

    ground_truth_df = pd.read_csv(f"{folder}/ground_truth.csv")

    df = pd.read_csv(f"{folder}/{filename}")
    if reset_sample_id:
        df = reset_sample_ids(df, end_idx=leak_max_idx)
    if "sample_id" in df.columns:
        df = df.set_index("sample_id")
    elif "sample_ids" in df.columns:
        df = df.set_index("sample_ids")

    # The ground-truth file is authoritative for the error location.
    df = df.drop(columns=["gt_error_loc"])
    df = df.join(ground_truth_df)

    if perform_test_leak:
        df = df.join(test_leak)
        # Rows without a leak entry count as not leaked.
        df = df.fillna(False)
        df = df[df["leaked"] == False]
    df = df.join(type_check_df)
    df = df.fillna(False)

    return df

In [4]:
def relative_performance(df, settings = ("codebert", "graphcodebert", "unixcoder", "ggnn", "great", "transformer")):
    """Express every non-baseline row as a percentage change over its baseline.

    Rows whose ``setup`` is one of ``settings`` are baselines (the last such
    row wins per name). Every other row is matched to the baseline named by the
    first space-separated token of its ``setup``; rows without a matching
    baseline are reported with a warning and skipped.

    Returns:
        DataFrame with ``setup`` plus, for every other column, the value
        ``(row - baseline) / baseline * 100``.
    """
    rows = df.to_dict("records")
    baseline_names = set(settings)
    baselines = {row["setup"]: row for row in rows if row["setup"] in baseline_names}

    relative_rows = []
    for row in rows:
        setup = row["setup"]
        if setup in baseline_names:
            continue

        name = setup.split(" ")[0]
        baseline = baselines.get(name)
        if baseline is None:
            print(f"Warning: missing base performance for {name}")
            continue

        relative = {"setup": setup}
        relative.update(
            (key, (value - baseline[key]) / baseline[key] * 100)
            for key, value in row.items()
            if key != "setup"
        )
        relative_rows.append(relative)

    return pd.DataFrame(relative_rows)

## Real datasets

In [5]:
# Result folders of the two real datasets; each is passed to precess_df as `folder`.
real1_folder = "results/real_1"
real2_folder = "results/real_2"

In [6]:
# Model name -> CSV file that precess_df loads from a result folder.
setting_map = {
    "codebert": "codebert.csv",
    "graphcodebert": "graphcodebert.csv",
    "unixcoder": "unixcoder.csv",
    "ggnn": "ggnn.csv",
    "great": "great.csv",
}

In [7]:
# Beta of the F-beta score reported for the real datasets.
beta = 1.5
# (full metric, type-checked metric, relative delta) column names, listed in
# the order measure_result_classification returns its metrics.
metric_columns = (
    ("precision", "type_check_precision", "precision_delta"),
    ("recall", "type_check_recall", "recall_delta"),
    ("fb", "type_check_fb", "fb_delta"),
)

results = []
for key, filename in setting_map.items():
    # 1292 faulty programs, correct programs are already filtered
    df_real1 = precess_df(real1_folder, filename, leak_max_idx=1292, reset_sample_id=True)
    df_real2 = precess_df(real2_folder, filename)
    df = pd.concat([df_real1, df_real2])

    # Metrics on all rows first, then on the rows selected by the type-check filter.
    full_metrics = measure_result_classification(df, typecheck=False, beta=beta)
    type_check_metric = measure_result_classification(df, typecheck=True, beta=beta)

    row = {"setting": key}
    for (full_col, checked_col, delta_col), full, checked in zip(metric_columns, full_metrics, type_check_metric):
        row[full_col] = full * 100
        row[checked_col] = checked * 100
        # Change of the filtered metric relative to the unfiltered one, in percent.
        row[delta_col] = (checked - full) / full * 100
    results.append(row)

tp: 654, tn: 32122, fp: 1682, fn: 1373
tp: 615, tn: 32122, fp: 1666, fn: 1316
tp: 668, tn: 32283, fp: 1498, fn: 1382
tp: 626, tn: 32283, fp: 1487, fn: 1323
tp: 707, tn: 32272, fp: 1513, fn: 1339
tp: 664, tn: 32272, fp: 1500, fn: 1283
tp: 312, tn: 30505, fp: 2656, fn: 1492
tp: 288, tn: 30505, fp: 2644, fn: 1420
tp: 442, tn: 29296, fp: 4013, fn: 1214
tp: 403, tn: 29296, fp: 3993, fn: 1165


In [8]:
# Tabulate the rows collected in `results`, rounded to two decimals.
pd.DataFrame(results).round(2)

Unnamed: 0,setting,precision,type_check_precision,precision_delta,recall,type_check_recall,recall_delta,fb,type_check_fb,fb_delta
0,codebert,28.0,26.96,-3.7,32.26,31.85,-1.29,30.82,30.17,-2.12
1,graphcodebert,30.84,29.63,-3.94,32.59,32.12,-1.43,32.03,31.31,-2.25
2,unixcoder,31.85,30.68,-3.65,34.56,34.1,-1.31,33.67,32.97,-2.08
3,ggnn,10.51,9.82,-6.56,17.29,16.86,-2.5,14.43,13.82,-4.26
4,great,9.92,9.17,-7.6,26.69,25.7,-3.71,17.56,16.53,-5.87


## Synthetic dataset

In [9]:
# Result folder of the synthetic dataset; passed to precess_df as `folder`.
synthetic_folder = "results/synthetic_1"

In [10]:
# Model name -> CSV file that precess_df loads from the synthetic result folder.
setting_map = {
    "codebert": "codebert.csv",
    "graphcodebert": "graphcodebert.csv",
    "unixcoder": "unixcoder.csv",
    "ggnn": "ggnn.csv",
    "great": "great.csv",
}

In [11]:
# Beta of the F-beta score reported for the synthetic dataset.
beta = 2
# (full metric, type-checked metric, relative delta) column names, listed in
# the order measure_result_classification returns its metrics.
metric_columns = (
    ("precision", "type_check_precision", "precision_delta"),
    ("recall", "type_check_recall", "recall_delta"),
    ("fb", "type_check_fb", "fb_delta"),
)

results = []
for key, filename in setting_map.items():
    # No leak filtering is applied to the synthetic results.
    df = precess_df(synthetic_folder, filename, perform_test_leak=False)

    # Metrics on all rows first, then on the rows selected by the type-check filter.
    full_metrics = measure_result_classification(df, typecheck=False, beta=beta)
    type_check_metric = measure_result_classification(df, typecheck=True, beta=beta)

    row = {"setting": key}
    for (full_col, checked_col, delta_col), full, checked in zip(metric_columns, full_metrics, type_check_metric):
        row[full_col] = full * 100
        row[checked_col] = checked * 100
        # Change of the filtered metric relative to the unfiltered one, in percent.
        row[delta_col] = (checked - full) / full * 100
    results.append(row)

tp: 433901, tn: 464562, fp: 30681, fn: 39448
tp: 344203, tn: 464562, fp: 30303, fn: 35011
tp: 437579, tn: 466034, fp: 27909, fn: 37070
tp: 347608, tn: 466034, fp: 27652, fn: 32785
tp: 444176, tn: 465928, fp: 28003, fn: 30485
tp: 352936, tn: 465928, fp: 27788, fn: 27427
tp: 365237, tn: 442262, fp: 57572, fn: 86303
tp: 278803, tn: 442262, fp: 56601, fn: 81826
tp: 402183, tn: 430804, fp: 72085, fn: 46302
tp: 312835, tn: 430804, fp: 71304, fn: 44549


In [12]:
# Tabulate the rows collected in `results`, rounded to two decimals.
pd.DataFrame(results).round(2)

Unnamed: 0,setting,precision,type_check_precision,precision_delta,recall,type_check_recall,recall_delta,fb,type_check_fb,fb_delta
0,codebert,93.4,91.91,-1.59,91.67,90.77,-0.98,92.01,90.99,-1.1
1,graphcodebert,94.0,92.63,-1.46,92.19,91.38,-0.88,92.55,91.63,-0.99
2,unixcoder,94.07,92.7,-1.45,93.58,92.79,-0.84,93.68,92.77,-0.96
3,ggnn,86.38,83.12,-3.77,80.89,77.31,-4.42,81.93,78.41,-4.3
4,great,84.8,81.44,-3.97,89.68,87.53,-2.39,88.66,86.24,-2.72


In [13]:
# Emit the same rounded table as LaTeX source, without the index column.
print(pd.DataFrame(results).round(2).to_latex(index=False))

\begin{tabular}{lrrrrrrrrr}
\toprule
       setting &  precision &  type\_check\_precision &  precision\_delta &  recall &  type\_check\_recall &  recall\_delta &     fb &  type\_check\_fb &  fb\_delta \\
\midrule
      codebert &      93.40 &                 91.91 &            -1.59 &   91.67 &              90.77 &         -0.98 &  92.01 &          90.99 &     -1.10 \\
 graphcodebert &      94.00 &                 92.63 &            -1.46 &   92.19 &              91.38 &         -0.88 &  92.55 &          91.63 &     -0.99 \\
     unixcoder &      94.07 &                 92.70 &            -1.45 &   93.58 &              92.79 &         -0.84 &  93.68 &          92.77 &     -0.96 \\
          ggnn &      86.38 &                 83.12 &            -3.77 &   80.89 &              77.31 &         -4.42 &  81.93 &          78.41 &     -4.30 \\
         great &      84.80 &                 81.44 &            -3.97 &   89.68 &              87.53 &         -2.39 &  88.66 &          86.24 &     

## Manual Annotation

In [14]:
from utils import get_performance
from  tqdm import tqdm
import pandas as pd

In [15]:
def ratio_change(a, b):
    """Return the change from ``a`` to ``b`` as a fraction of ``a``."""
    difference = b - a
    return difference / a

def print_ratio_change(original_performance, filtered_performance):
    """Pair original and filtered precision/recall with their relative change.

    Despite its name, this function prints nothing; it only returns values.

    Args:
        original_performance: Sequence of exactly three items; the first two
            are used as precision and recall.
        filtered_performance: Sequence of exactly four items; the first two
            are used as precision and recall.

    Returns:
        Tuple ``(original_precision, filtered_precision, precision_change,
        original_recall, filtered_recall, recall_change)`` where each change is
        ``ratio_change(original, filtered)``.
    """
    # Only precision and recall are used; the remaining items are ignored.
    original_precision, original_recall, _original_f1 = original_performance
    filtered_precision, filtered_recall, _filtered_f1, _ = filtered_performance

    return (
        original_precision,
        filtered_precision,
        ratio_change(original_precision, filtered_precision),
        original_recall,
        filtered_recall,
        ratio_change(original_recall, filtered_recall),
    )

In [16]:
models = ["codebert", "graphcodebert", "unixcoder", "ggnn", "great"]
# Column names, in the order print_ratio_change returns its values.
ratio_columns = (
    "original_precision",
    "filtered_precision",
    "precision_ratio_change",
    "original_recall",
    "filtered_recall",
    "recall_ratio_change",
)
results = []

for model in tqdm(models):
    result = get_performance(model)
    values = print_ratio_change(result["original_performance"], result["filtered_performance"])

    # Every value is reported as a percentage.
    row = {"model": model}
    row.update((column, value * 100) for column, value in zip(ratio_columns, values))
    results.append(row)

100%|██████████| 5/5 [00:49<00:00,  9.84s/it]


In [17]:
# Tabulate the per-model rows collected in `results`, rounded to two decimals.
pd.DataFrame(results).round(2)

Unnamed: 0,model,original_precision,filtered_precision,precision_ratio_change,original_recall,filtered_recall,recall_ratio_change
0,codebert,74.49,69.51,-6.68,39.25,36.54,-6.9
1,graphcodebert,72.0,66.67,-7.41,39.78,37.09,-6.77
2,unixcoder,76.24,73.81,-3.18,41.62,39.49,-5.12
3,ggnn,46.58,41.38,-11.16,19.88,16.44,-17.32
4,great,39.6,34.52,-12.83,24.24,20.57,-15.16
