In [1]:
import pandas as pd
import glob
import numpy as np

In [2]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def measure_result_classification(df, type_check_failed=False, typecheck=True, sample_weight=1, beta=1):
    """Compute precision, recall and F-beta for localization predictions.

    A row counts as a positive example when ``gt_error_loc > 0``. The
    (weighted) confusion counts are defined as:

    * tp: positive rows whose ``loc_pred`` equals ``gt_error_loc``
    * tn: negative rows whose ``loc_pred`` is 0
    * fp: rows with ``loc_pred > 0`` that are not true positives (this
      includes positive rows predicted at the wrong location)
    * fn: positive rows whose ``loc_pred`` is 0

    The four counts are printed as a side effect.

    Args:
        df: DataFrame with ``gt_error_loc`` and ``loc_pred`` columns, plus
            ``type_check_failed`` when ``typecheck`` is true.
        type_check_failed: value of ``type_check_failed`` a row must have to
            be kept when ``typecheck`` is true.
        typecheck: whether to filter rows on ``type_check_failed``.
        sample_weight: weight given to positive rows; negative rows weigh 1.
        beta: beta of the F-beta score.

    Returns:
        Tuple ``(precision, recall, fb)``. A ratio whose denominator is zero
        is reported as 0.0 instead of NaN.
    """
    # Original note: remove special cases for finetuned models: the program
    # is longer than context size.
    if typecheck:
        df = df[df['type_check_failed'] == type_check_failed]

    has_bug = df["gt_error_loc"] > 0
    predicted_clean = df["loc_pred"] == 0
    # Positive rows get `sample_weight`, negative rows get weight 1.
    weights = sample_weight * has_bug + (~has_bug)

    tp = (((df["loc_pred"] == df["gt_error_loc"]) & has_bug) * weights).sum()
    tn = ((predicted_clean & ~has_bug) * weights).sum()
    predicted_positive = ((df["loc_pred"] > 0) * weights).sum()
    # Every flagged row that is not an exact hit counts as a false positive.
    fp = predicted_positive - tp
    fn = ((predicted_clean & has_bug) * weights).sum()

    print(f"tp: {tp}, tn: {tn}, fp: {fp}, fn: {fn}")

    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    fb = _safe_ratio((1 + beta ** 2) * precision * recall, beta ** 2 * precision + recall)

    return precision, recall, fb


In [3]:
def reset_sample_ids(df, end_idx=1291):
    """Renumber the ``sample_id`` of rows without a bug, starting at ``end_idx``.

    Rows with ``gt_error_loc > 0`` are passed through untouched. Rows with
    ``gt_error_loc == 0`` are rebuilt from records and receive consecutive
    ids ``end_idx, end_idx + 1, ...``. Frames without a ``sample_id`` column
    are returned as-is.

    Note: this notebook defines ``reset_sample_ids`` a second time in a later
    cell with the same logic; after a full run the later definition is the
    one in effect.
    """
    if "sample_id" not in df.columns:
        return df

    is_buggy = df["gt_error_loc"] > 0
    is_clean = df["gt_error_loc"] == 0

    renumbered = [
        {**row, "sample_id": offset + end_idx}
        for offset, row in enumerate(df[is_clean].to_dict("records"))
    ]

    return pd.concat([df[is_buggy], pd.DataFrame(renumbered)])

def precess_df(folder, filename, leak_max_idx=None, perform_test_leak=True, reset_sample_id=False):
    """Load one prediction CSV and join it with the folder's side tables.

    Files read from ``folder``:

    * ``typechecking/*.csv`` - concatenated in sorted filename order
    * ``test_leak.csv`` - only when ``perform_test_leak`` is true
    * ``ground_truth.csv``
    * ``filename`` - the predictions; must contain ``gt_error_loc``

    Args:
        folder: directory holding the files listed above.
        filename: name of the prediction CSV inside ``folder``.
        leak_max_idx: if truthy, only the first ``leak_max_idx`` rows of the
            leak table are used; it is also the first id handed to
            ``reset_sample_ids`` when ``reset_sample_id`` is true.
        perform_test_leak: whether to drop rows whose ``leaked`` flag is set.
        reset_sample_id: whether to renumber non-buggy rows through
            ``reset_sample_ids`` before indexing.

    Returns:
        DataFrame indexed by ``sample_id`` / ``sample_ids`` (when present),
        carrying the ground-truth, leak and type-check columns, with every
        missing value replaced by ``False``.
    """
    # glob returns files in a filesystem-dependent order, and the type-check
    # rows are matched to samples purely by position below, so sort for a
    # reproducible result.
    # NOTE(review): assumes lexicographic filename order is the intended
    # shard order - confirm against how the shards were written.
    type_files = sorted(glob.glob(f"{folder}/typechecking/*.csv"))

    type_check_df = pd.concat([pd.read_csv(file) for file in type_files])
    # Gives the concatenated shards a fresh 0..N-1 index (the join key); the
    # per-file row numbers are kept in an "index" column.
    type_check_df = type_check_df.reset_index()

    if perform_test_leak:
        test_leak = pd.read_csv(f"{folder}/test_leak.csv", index_col=0)
        if leak_max_idx:
            test_leak = test_leak.iloc[:leak_max_idx]

    ground_truth_df = pd.read_csv(f"{folder}/ground_truth.csv")

    df = pd.read_csv(f"{folder}/{filename}")

    if reset_sample_id:
        if leak_max_idx is None:
            # Forwarding None would override the helper's default start id
            # and fail on `i + None`; let the helper use its own default.
            df = reset_sample_ids(df)
        else:
            df = reset_sample_ids(df, end_idx=leak_max_idx)

    if "sample_id" in df.columns:
        df = df.set_index("sample_id")
    elif "sample_ids" in df.columns:
        df = df.set_index("sample_ids")

    # The prediction file's own labels are replaced by ground_truth.csv.
    df = df.drop(columns=["gt_error_loc"])
    df = df.join(ground_truth_df)

    if perform_test_leak:
        df = df.join(test_leak)
        # Samples without a leak record are treated as not leaked.
        df = df.fillna(False)
        df = df[df["leaked"] == False]
    df = df.join(type_check_df)
    df = df.fillna(False)

    return df

In [4]:
def relative_performance(df, settings = ("codebert", "graphcodebert", "unixcoder", "ggnn", "great", "transformer")):
    """Return each variant's metrics minus those of its baseline setup.

    Rows whose ``setup`` is listed in ``settings`` are baselines. Every other
    row is matched to the baseline named by the first space-separated word of
    its ``setup``; rows without such a baseline are skipped with a printed
    warning. The result has one row per matched variant, with ``setup``
    followed by the per-column differences (variant - baseline).
    """
    rows = df.to_dict("records")
    baseline_names = set(settings)
    baselines = {row["setup"]: row for row in rows if row["setup"] in baseline_names}

    deltas = []
    for row in rows:
        setup = row["setup"]
        if setup in baseline_names:
            continue

        model = setup.split(" ")[0]
        reference = baselines.get(model)
        if reference is None:
            print(f"Warning: missing base performance for {model}")
            continue

        delta = {"setup": setup}
        delta.update(
            (metric, value - reference[metric])
            for metric, value in row.items()
            if metric != "setup"
        )
        deltas.append(delta)

    return pd.DataFrame(deltas)

In [5]:
def relative_performance_change(df, settings = ("codebert", "graphcodebert", "unixcoder", "ggnn", "great", "transformer")):
    """Render ``"baseline -> variant"`` strings for every metric of each variant.

    Rows whose ``setup`` is listed in ``settings`` are baselines. Every other
    row is paired with the baseline named by the first space-separated word
    of its ``setup``; rows without such a baseline are skipped with a printed
    warning. Both numbers are formatted with two decimals.
    """
    baseline_names = set(settings)

    # Split the rows once: baselines keyed by name, everything else in order.
    baselines = {}
    variants = []
    for row in df.to_dict("records"):
        if row["setup"] in baseline_names:
            baselines[row["setup"]] = row
        else:
            variants.append(row)

    table = []
    for row in variants:
        model = row["setup"].split(" ")[0]
        if model not in baselines:
            print(f"Warning: missing base performance for {model}")
            continue

        reference = baselines[model]
        entry = {"setup": row["setup"]}
        for column, value in row.items():
            if column != "setup":
                entry[column] = f"{reference[column]:.2f} -> {value:.2f}"
        table.append(entry)

    return pd.DataFrame(table)

def reset_sample_ids(df, end_idx=1291):
    """Renumber the ``sample_id`` of rows without a bug, starting at ``end_idx``.

    Rows with ``gt_error_loc > 0`` are passed through untouched. Rows with
    ``gt_error_loc == 0`` are rebuilt from records and receive consecutive
    ids ``end_idx, end_idx + 1, ...``. Frames without a ``sample_id`` column
    are returned as-is.

    Note: this re-defines the ``reset_sample_ids`` from an earlier cell of
    this notebook with the same logic and shadows it once this cell has run.
    """
    if "sample_id" not in df.columns:
        return df

    is_buggy = df["gt_error_loc"] > 0
    is_clean = df["gt_error_loc"] == 0

    renumbered = [
        {**row, "sample_id": offset + end_idx}
        for offset, row in enumerate(df[is_clean].to_dict("records"))
    ]

    return pd.concat([df[is_buggy], pd.DataFrame(renumbered)])

## Real Dataset

In [6]:
# Result folders for the two parts of the real dataset (relative paths).
real1_folder = "results/real_1"
real2_folder = "results/real_2"

In [7]:
# Setup label -> prediction CSV file name inside a results folder.
# The first word of each label is the baseline's label; relative_performance
# and relative_performance_change use it to pair a variant with its baseline.
setting_map = {
    "codebert": "codebert.csv",
    "codebert typecheck oversample": "codebert_typecheck_oversample.csv",
    "graphcodebert": "graphcodebert.csv",
    "graphcodebert typecheck oversample": "graphcodebert_typecheck_oversample.csv",
    "unixcoder": "unixcoder.csv",
    "unixcoder typecheck oversample": "unixcoder_typecheck_oversample.csv",
    "ggnn": "ggnn.csv",
    "ggnn typecheck oversample": "ggnn_typecheck_oversample.csv",
    "great": "great.csv",
    "great typecheck oversample": "great_typecheck_oversample.csv",
}

In [11]:
# Evaluate every setup on the two real-dataset folders combined and collect
# precision / recall / F-beta as percentages.
beta = 1.5
results = []
for key, filename in setting_map.items():
    # 1292 faulty programs, correct programs are already filtered
    df_real1 = precess_df(real1_folder, filename, leak_max_idx=1292, reset_sample_id=True)
    df_real2 = precess_df(real2_folder, filename)
    df = pd.concat([df_real1, df_real2]).reset_index()
    metrics = measure_result_classification(df, typecheck=True, beta=beta)

    row = {"setup": key}
    for metric_name, value in zip(("precision", "recall", "fb"), metrics):
        row[metric_name] = value * 100
    results.append(row)


tp: 615, tn: 32122, fp: 1666, fn: 1316
tp: 608, tn: 32021, fp: 1778, fn: 1312
tp: 626, tn: 32283, fp: 1487, fn: 1323
tp: 679, tn: 32130, fp: 1642, fn: 1268
tp: 664, tn: 32272, fp: 1500, fn: 1283
tp: 709, tn: 32087, fp: 1695, fn: 1228
tp: 288, tn: 30505, fp: 2644, fn: 1420
tp: 292, tn: 30349, fp: 2807, fn: 1409
tp: 403, tn: 29296, fp: 3993, fn: 1165
tp: 431, tn: 29448, fp: 3824, fn: 1154


In [12]:
# One row per setup with its precision / recall / fb percentages.
result_df = pd.DataFrame(results)

In [13]:
# Emit the "baseline -> variant" table as LaTeX.
# NOTE(review): .round(2) looks like a no-op here because the cells are
# already formatted strings - confirm before relying on it.
print(relative_performance_change(result_df).round(2).to_latex(index=False))

\begin{tabular}{llll}
\toprule
                              setup &       precision &          recall &              fb \\
\midrule
      codebert typecheck oversample &  26.96 -> 25.48 &  31.85 -> 31.67 &  30.17 -> 29.47 \\
 graphcodebert typecheck oversample &  29.63 -> 29.25 &  32.12 -> 34.87 &  31.31 -> 32.93 \\
     unixcoder typecheck oversample &  30.68 -> 29.49 &  34.10 -> 36.60 &  32.97 -> 34.08 \\
          ggnn typecheck oversample &    9.82 -> 9.42 &  16.86 -> 17.17 &  13.82 -> 13.70 \\
         great typecheck oversample &   9.17 -> 10.13 &  25.70 -> 27.19 &  16.53 -> 17.91 \\
\bottomrule
\end{tabular}



In [14]:
# Difference (variant - baseline) for every metric column.
relative_performance(result_df)

Unnamed: 0,setup,precision,recall,fb
0,codebert typecheck oversample,-1.479881,-0.182116,-0.700247
1,graphcodebert typecheck oversample,-0.371492,2.75513,1.619537
2,unixcoder typecheck oversample,-1.191406,2.499245,1.102203
3,ggnn typecheck oversample,-0.400252,0.304546,-0.114
4,great typecheck oversample,0.961835,1.490898,1.380642


## Synthetic Dataset

In [15]:
# Result folder for the synthetic dataset (relative path).
synthetic_folder = "results/synthetic_1"

In [16]:
# Setup label -> prediction CSV file name inside a results folder.
# Re-defines the mapping from the real-dataset section with the same entries.
setting_map = {
    "codebert": "codebert.csv",
    "codebert typecheck oversample": "codebert_typecheck_oversample.csv",
    "graphcodebert": "graphcodebert.csv",
    "graphcodebert typecheck oversample": "graphcodebert_typecheck_oversample.csv",
    "unixcoder": "unixcoder.csv",
    "unixcoder typecheck oversample": "unixcoder_typecheck_oversample.csv",
    "ggnn": "ggnn.csv",
    "ggnn typecheck oversample": "ggnn_typecheck_oversample.csv",
    "great": "great.csv",
    "great typecheck oversample": "great_typecheck_oversample.csv",
}

In [17]:
# Evaluate every setup on the synthetic folder (no leak filtering) and
# collect precision / recall / F-beta as percentages.
beta = 1.5
results = []
for key, filename in setting_map.items():
    df = precess_df(synthetic_folder, filename, perform_test_leak=False)
    metrics = measure_result_classification(df, typecheck=True, beta=beta)

    row = {"setup": key}
    for metric_name, value in zip(("precision", "recall", "fb"), metrics):
        row[metric_name] = value * 100
    results.append(row)

tp: 344203, tn: 464562, fp: 30303, fn: 35011
tp: 345406, tn: 462642, fp: 32579, fn: 33452
tp: 347608, tn: 466034, fp: 27652, fn: 32785
tp: 348307, tn: 464249, fp: 29779, fn: 31744
tp: 352936, tn: 465928, fp: 27788, fn: 27427
tp: 354013, tn: 463969, fp: 29860, fn: 26237
tp: 278803, tn: 442262, fp: 56601, fn: 81826
tp: 281328, tn: 441134, fp: 58529, fn: 78501
tp: 312835, tn: 430804, fp: 71304, fn: 44549
tp: 314259, tn: 433207, fp: 68051, fn: 43975


In [18]:
# One row per setup with its precision / recall / fb percentages.
result_df = pd.DataFrame(results)

In [19]:
# "baseline -> variant" strings (two decimals) for every metric column.
relative_performance_change(result_df)

Unnamed: 0,setup,precision,recall,fb
0,codebert typecheck oversample,91.91 -> 91.38,90.77 -> 91.17,91.12 -> 91.23
1,graphcodebert typecheck oversample,92.63 -> 92.12,91.38 -> 91.65,91.76 -> 91.79
2,unixcoder typecheck oversample,92.70 -> 92.22,92.79 -> 93.10,92.76 -> 92.83
3,ggnn typecheck oversample,83.12 -> 82.78,77.31 -> 78.18,79.01 -> 79.54
4,great typecheck oversample,81.44 -> 82.20,87.53 -> 87.72,85.56 -> 85.95


In [20]:
# Difference (variant - baseline) for every metric column.
relative_performance(result_df)

Unnamed: 0,setup,precision,recall,fb
0,codebert typecheck oversample,-0.527666,0.402824,0.119445
1,graphcodebert typecheck oversample,-0.507493,0.266155,0.031192
2,unixcoder typecheck oversample,-0.479889,0.310809,0.065757
3,ggnn typecheck oversample,-0.346185,0.873604,0.531557
4,great typecheck oversample,0.762078,0.189808,0.38344


In [21]:
# Emit the "baseline -> variant" table as LaTeX.
# NOTE(review): .round(2) looks like a no-op here because the cells are
# already formatted strings - confirm before relying on it.
print(relative_performance_change(result_df).round(2).to_latex(index=False))

\begin{tabular}{llll}
\toprule
                              setup &       precision &          recall &              fb \\
\midrule
      codebert typecheck oversample &  91.91 -> 91.38 &  90.77 -> 91.17 &  91.12 -> 91.23 \\
 graphcodebert typecheck oversample &  92.63 -> 92.12 &  91.38 -> 91.65 &  91.76 -> 91.79 \\
     unixcoder typecheck oversample &  92.70 -> 92.22 &  92.79 -> 93.10 &  92.76 -> 92.83 \\
          ggnn typecheck oversample &  83.12 -> 82.78 &  77.31 -> 78.18 &  79.01 -> 79.54 \\
         great typecheck oversample &  81.44 -> 82.20 &  87.53 -> 87.72 &  85.56 -> 85.95 \\
\bottomrule
\end{tabular}

