In [1]:
import nltk
from nltk.tokenize import word_tokenize
import re
import pandas as pd
from ast import literal_eval
import krippendorff
from sklearn.metrics import cohen_kappa_score
import numpy as np
import pickle
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
import csv 
import copy

def calculate_precision(true_labels, predicted_labels):
    """Return the precision of ``predicted_labels`` measured against ``true_labels``.

    Thin wrapper around ``sklearn.metrics.precision_score`` using
    ``average='binary'``.
    """
    precision = precision_score(true_labels, predicted_labels, average='binary')
    return precision

def calculate_recall(true_labels, predicted_labels):
    """Return the recall of ``predicted_labels`` measured against ``true_labels``.

    Thin wrapper around ``sklearn.metrics.recall_score`` using
    ``average='binary'``.
    """
    recall = recall_score(true_labels, predicted_labels, average='binary')
    return recall

def calculate_f1(true_labels, predicted_labels):
    """Return the F1 score of ``predicted_labels`` measured against ``true_labels``.

    Thin wrapper around ``sklearn.metrics.f1_score`` using ``average='binary'``.
    """
    f1 = f1_score(true_labels, predicted_labels, average='binary')
    return f1

def calculate_cohens_kappa(rater1, rater2):
    """Return Cohen's kappa between the label sequences of two raters.

    Thin wrapper around ``sklearn.metrics.cohen_kappa_score``.
    """
    kappa = cohen_kappa_score(rater1, rater2)
    return kappa

def find_substring_bounds(s, sub):
    """Locate the first occurrence of ``sub`` inside ``s``.

    Returns:
        ``(start, end)`` with ``end`` exclusive, or ``(-1, -1)`` when ``sub``
        does not occur in ``s``.
    """
    begin = s.find(sub)
    if begin == -1:
        return (-1, -1)
    return (begin, begin + len(sub))

def get_model_dict(attention_dict, threshold, ground_truth_dict):
    """Pick, for every prompt, the spans of its ``threshold`` highest-scoring tokens.

    Args:
        attention_dict: Only its keys (the prompts) are used.
        threshold: Maximum number of entries kept per prompt.
        ground_truth_dict: Maps each prompt to entries indexable as
            ``(token, score, span)``.

    Returns:
        Dict mapping each prompt to the list of ``span`` values (index 2) of the
        entries with the largest scores (index 1), best first.
    """
    def score_of(entry):
        return entry[1]

    return {
        prompt: [
            entry[2]
            for entry in sorted(
                copy.deepcopy(ground_truth_dict[prompt]),
                key=score_of,
                reverse=True,
            )[:threshold]
        ]
        for prompt in attention_dict
    }

def compare_index_ranges(ground_truth_dict, comparison_dict):
    """Flag which ground-truth spans overlap any span of the comparison set.

    Args:
        ground_truth_dict: Maps prompt -> entries whose index 2 is a
            ``(start, end)`` span.
        comparison_dict: Maps prompt -> iterable of ``(start, end)`` spans.

    Returns:
        Dict mapping each prompt present in both inputs to a list of 0/1 flags,
        one per ground-truth entry; 1 means the entry's span strictly overlaps
        at least one comparison span. Prompts missing from ``comparison_dict``
        are left out of the result.
    """
    overlap_results = {}
    for prompt, entries in ground_truth_dict.items():
        if prompt not in comparison_dict:
            continue
        candidates = comparison_dict[prompt]
        flags = []
        for entry in entries:
            span = entry[2]
            hit = any(
                span[1] > candidate[0] and span[0] < candidate[1]
                for candidate in candidates
            )
            flags.append(1 if hit else 0)
        overlap_results[prompt] = flags
    return overlap_results

def concat_dict_values(dict_to_concat):
    """Flatten all value iterables of ``dict_to_concat`` into one list, in dict order."""
    return [
        element
        for value_list in dict_to_concat.values()
        for element in value_list
    ]

def tokenize_with_indices(attention_dict):
    """Attach character spans to the scored tokens of every prompt.

    Args:
        attention_dict: Maps a prompt string to entries whose first two items
            are ``(token, score)``.

    Returns:
        Dict mapping each prompt to a list of ``(token, score, (start, end))``
        tuples, ``end`` exclusive. Tokens are searched left to right starting
        after the previously matched token; tokens that cannot be found from
        that position onward are dropped.
    """
    spans_by_prompt = {}
    for prompt, scored_tokens in attention_dict.items():
        located = []
        cursor = 0  # search resumes right after the last matched token
        for entry in scored_tokens:
            token = entry[0]
            found_at = prompt.find(token, cursor)
            if found_at == -1:
                continue
            stop = found_at + len(token)
            located.append((token, entry[1], (found_at, stop)))
            cursor = stop
        spans_by_prompt[prompt] = located
    return spans_by_prompt

def tokenize_with_indices_incoder(attention_dict):
    """Attach character spans to scored tokens that were produced without spaces.

    Tokens are matched against the prompt with every ``" "`` removed, and the
    match positions are then translated back to indices in the original prompt.

    Fixes over the previous version:
      * the index map compared the whole prompt (not the current character)
        with ``" "``, so it degenerated to the identity and spans drifted left
        by the number of preceding spaces;
      * the end index was looked up one position past the token, which raised
        ``KeyError`` for a token ending at the end of the prompt; it is now
        derived from the token's last character.

    Args:
        attention_dict: Maps a prompt string to entries whose first two items
            are ``(token, score)``.

    Returns:
        Dict mapping each prompt to a list of ``(token, score, (start, end))``
        tuples with ``end`` exclusive and both indices referring to the
        original prompt. Tokens are searched left to right; tokens not found
        after the previous match are dropped.
    """
    result = {}
    for prompt, scored_tokens in attention_dict.items():
        # original_index[k] is the position in `prompt` of the k-th non-space character.
        original_index = [pos for pos, char in enumerate(prompt) if char != " "]
        stripped = prompt.replace(" ", "")  # hoisted: same for every token

        spans = []
        cursor = 0  # position in `stripped` right after the last matched token
        for entry in scored_tokens:
            token = entry[0]
            start = stripped.find(token, cursor)
            if start == -1:
                continue
            end = start + len(token)
            if token:
                span_start = original_index[start]
                # Exclusive end: one past the token's last character, so
                # spaces that follow the token are not pulled into the span.
                span_end = original_index[end - 1] + 1
            else:
                # Empty token: zero-width span at the current position.
                span_start = original_index[start] if start < len(original_index) else len(prompt)
                span_end = span_start
            spans.append((token, entry[1], (span_start, span_end)))
            cursor = end
        result[prompt] = spans
    return result

def save_to_csv(filename, data):
    """Write each row of ``data`` to ``filename`` as CSV, overwriting the file."""
    with open(filename, mode='w', newline='') as csv_file:
        csv.writer(csv_file).writerows(data)

def overlap_length(range1, range2):
    """Return the length of the intersection of two ``(start, end)`` ranges.

    The result is 0 when the ranges are disjoint or merely touch.
    """
    first_start, first_end = range1
    second_start, second_end = range2

    # The intersection runs from the later start to the earlier end; a
    # non-positive width means the ranges do not overlap.
    shared_width = min(first_end, second_end) - max(first_start, second_start)
    return max(0, shared_width)
    
def calculate_new_overall(attention_dict, threshold, incoder=False,
                          human_dict_path="gradient_based/dataset.pkl"):
    """Score a model's top-``threshold`` tokens against human-annotated spans.

    Args:
        attention_dict: Maps a prompt string to entries whose first two items
            are ``(token, score)``. Tokens are located left to right in the
            prompt, so entries should be passed in prompt order. NOTE: every
            value list is re-sorted in place by descending score (callers in
            this file pass a deep copy).
        threshold: Number of highest-scoring tokens kept per prompt.
        incoder: When true, spans are computed with
            ``tokenize_with_indices_incoder`` instead of
            ``tokenize_with_indices``.
        human_dict_path: Pickle file with the human annotations; it must map
            every prompt of ``attention_dict`` to a collection of hashable
            ``(start, end)`` spans. Defaults to the previously hard-coded path.

    Returns:
        ``(statistics, raw)`` where ``raw`` is
        ``[precision, recall, f1, krippendorff_alpha]`` and ``statistics``
        holds the same four values as formatted strings (percentages with one
        decimal for the first three, two decimals for alpha).

    Raises:
        KeyError: If a prompt of ``attention_dict`` is missing from the human
            annotations.
    """
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at trusted files. The context manager closes the handle promptly.
    with open(human_dict_path, "rb") as handle:
        human_dict = pickle.load(handle)

    # Spans are computed before the sort below because the tokenizers walk the
    # prompt left to right.
    if incoder:
        ground_truth_dict = tokenize_with_indices_incoder(attention_dict)
    else:
        ground_truth_dict = tokenize_with_indices(attention_dict)

    # Kept for backward compatibility: the input lists are re-ordered in place.
    # The result does not depend on it (get_model_dict ranks on its own).
    for real_key in attention_dict:
        attention_dict[real_key] = sorted(attention_dict[real_key], key=lambda item: item[1], reverse=True)

    model_dict = get_model_dict(attention_dict, threshold, ground_truth_dict)

    # Per-token 0/1 flags ("selected by humans" vs "selected by the model"),
    # concatenated over all prompts, are the two raters for the agreement score.
    concat_human = concat_dict_values(compare_index_ranges(ground_truth_dict, human_dict))
    concat_model = concat_dict_values(compare_index_ranges(ground_truth_dict, model_dict))
    krippendorf_alpha = krippendorff.alpha(
        reliability_data=[concat_human, concat_model],
        level_of_measurement='nominal',
    )

    def calculate_sm_precision(h, m):
        # Sum over (human, model) span pairs of overlap / model span length.
        model_spans = set(m)
        summation = 0
        for human_keyword in set(h):
            for model_keyword in model_spans:
                model_length = model_keyword[1] - model_keyword[0]
                if model_length != 0:  # zero-width spans contribute nothing
                    summation += overlap_length(human_keyword, model_keyword) / model_length
        return summation

    def calculate_sm_recall(h, m):
        # Sum over (human, model) span pairs of overlap / human span length.
        model_spans = set(m)
        summation = 0
        for human_keyword in set(h):
            human_length = human_keyword[1] - human_keyword[0]
            if human_length == 0:  # zero-width spans contribute nothing
                continue
            for model_keyword in model_spans:
                summation += overlap_length(human_keyword, model_keyword) / human_length
        return summation

    def calculate_sm_f1(precision, recall):
        if precision + recall == 0:
            return 0
        return 2 * (precision * recall) / (precision + recall)

    precision_sum = 0
    recall_sum = 0
    human_count = 0
    model_count = 0
    for prompt in model_dict:
        h = human_dict[prompt]
        m = model_dict[prompt]
        human_count += len(h)
        model_count += len(m)
        precision_sum += calculate_sm_precision(h, m)
        recall_sum += calculate_sm_recall(h, m)

    # Guard against empty inputs instead of raising ZeroDivisionError.
    precision = precision_sum / model_count if model_count else 0.0
    recall = recall_sum / human_count if human_count else 0.0
    f1 = calculate_sm_f1(precision, recall)
    statistics = [
        f"{precision:.1%}",   # precision as a percentage with one decimal
        f"{recall:.1%}",      # recall as a percentage with one decimal
        f"{f1:.1%}",          # F1 score as a percentage with one decimal
        f"{krippendorf_alpha:.2f}"  # Krippendorff's alpha with two decimals
    ]
    return statistics, [precision, recall, f1, krippendorf_alpha]

def calculate_final(model, calculate, method):
    """Load the pickled scores for ``method``/``model`` and evaluate them at k = 5, 10, 20.

    Args:
        model: Model name interpolated into the pickle file name.
        calculate: Callable invoked as ``calculate(attention, k)``; the three
            return values are combined with ``+`` and appended to the result.
        method: Method name. It selects the file location and layout by
            substring: ``"self_attention"`` (a dict of per-key attention
            dicts, each evaluated separately), ``"bert"`` or ``"shap"`` (one
            attention dict).

    Returns:
        List with the concatenated results of every evaluation.

    Raises:
        ValueError: If ``method`` contains none of the known substrings
            (previously this surfaced as an ``UnboundLocalError``).
    """
    def load_pickle(path):
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    def sort_by_score(per_prompt):
        # Re-order each prompt's entries in place, highest score first.
        for real_key in per_prompt:
            per_prompt[real_key] = sorted(per_prompt[real_key], key=lambda item: item[1], reverse=True)

    def evaluate(per_prompt):
        five = calculate(per_prompt, 5)
        ten = calculate(per_prompt, 10)
        twenty = calculate(per_prompt, 20)
        return five + ten + twenty

    result = []
    if "self_attention" in method:
        attention_dict = load_pickle(f"/home/bonan/fse2024/{method}code/{method}{model}.pkl")
        for key in attention_dict:
            sort_by_score(attention_dict[key])
            result += evaluate(attention_dict[key])
    elif "bert" in method or "shap" in method:
        # Both layouts are a single attention dict; only the directory differs.
        if "bert" in method:
            path = f"/home/bonan/fse2024/{method}code/{method}{model}.pkl"
        else:
            path = f"/home/bonan/fse2024/shap/{method}{model}.pkl"
        attention_dict = load_pickle(path)
        sort_by_score(attention_dict)
        result += evaluate(attention_dict)
    else:
        raise ValueError(f"Unsupported method: {method!r}")
    return result


# Table 2 A

In [4]:
# Builds perturbation.csv: one row per model, holding the formatted
# precision/recall/F1/alpha at k = 5, 10, 20 for each perturbation method.
models = ['incoder', 'codegen', 'codeparrot', 'gptj', 'polycoder', 'gpt4']
# models = ['incoder']
methods = ["bert_perturbation", "shap_perturbation"]
# NOTE(review): the other cells read from "../experiment_results" - confirm
# that this different prefix is intentional.
prefix = "experiment_result"
files = []

table_content = []

for model in models:
    each_model_result = []
    for method in methods:
        # Use a context manager so the pickle file is closed right away.
        with open(f"{prefix}/{method}_{model}.pkl", "rb") as pickle_file:
            attention = pickle.load(pickle_file)
        for threshold in (5, 10, 20):
            # calculate_new_overall re-sorts its input in place, hence the copy.
            formatted, _ = calculate_new_overall(copy.deepcopy(attention), threshold)
            each_model_result += formatted
    table_content.append(each_model_result)
    print(model, each_model_result)

save_to_csv("perturbation.csv", table_content)

FileNotFoundError: [Errno 2] No such file or directory: '/home/bonan/fse2024/human_eval/dataset.pkl'

# Table 2 B

In [107]:
# Builds last_day/gradient.csv: for each gradient-based method pair, one row
# per model with the formatted metrics at k = 5, 10, 20.
models = ['incoder', 'codegen', 'codeparrot', 'gptj', 'polycoder']

prefix = "../experiment_results"

files = []
table_content = []

method_groups = [
    ["inputxgradient_reading", "inputxgradient_coding"],
    ["saliency_reading", "saliency_coding"],
]

for methods in method_groups:
    for model in models:
        # All gradient-based methods use the space-insensitive tokenizer for
        # incoder. The saliency loop previously omitted this flag, unlike the
        # Table 3 and Table 2 Color cells, which set it for saliency as well.
        use_incoder = model == "incoder"
        each_model_result = []
        for method in methods:
            # Use a context manager so the pickle file is closed right away.
            with open(f"{prefix}/{method}_{model}.pkl", "rb") as pickle_file:
                attention = pickle.load(pickle_file)
            for threshold in (5, 10, 20):
                # calculate_new_overall re-sorts its input in place, hence the copy.
                formatted, _ = calculate_new_overall(copy.deepcopy(attention), threshold, use_incoder)
                each_model_result += formatted
        table_content.append(each_model_result)
save_to_csv("last_day/gradient.csv", table_content)

# Table 2 C

In [None]:
# Builds self-attention.csv: for each self-attention method pair, one row per
# model with the formatted metrics at k = 5, 10, 20.
models = ['incoder', 'codegen', 'codeparrot', 'gptj', 'polycoder']

prefix = "../experiment_results"

files = []
table_content = []

# The three previously copy-pasted loops differed only in this method pair.
method_groups = [
    ["reading_first", "coding_first"],
    ["reading_last", "coding_last"],
    ["reading_all", "coding_all"],
]

for methods in method_groups:
    for model in models:
        each_model_result = []
        for method in methods:
            # Use a context manager so the pickle file is closed right away.
            with open(f"{prefix}/{method}_{model}.pkl", "rb") as pickle_file:
                attention = pickle.load(pickle_file)
            for threshold in (5, 10, 20):
                # calculate_new_overall re-sorts its input in place, hence the copy.
                formatted, _ = calculate_new_overall(copy.deepcopy(attention), threshold)
                each_model_result += formatted
        table_content.append(each_model_result)

save_to_csv("self-attention.csv", table_content)

# Table 3

In [119]:
# Builds comparison.csv: one row per method with F1 and Krippendorff's alpha
# averaged over all models, at k = 5, 10, 20.
models = ['incoder', 'codegen', 'codeparrot', 'gptj', 'polycoder']
methods = ["bert_perturbation", 
           "shap_perturbation",
           "inputxgradient_reading", 
           "inputxgradient_coding",
           "saliency_reading", 
           "saliency_coding",
           "reading_first", 
           "reading_last", 
           "reading_all", 
           "coding_first", 
           "coding_last", 
           "coding_all"]
prefix = "../experiment_results"

gradient_based = [
                "inputxgradient_reading", 
                "inputxgradient_coding",
                "saliency_reading", 
                "saliency_coding"
                ]

import numpy as np

all_methods_models = []
for method in methods:
    each_method = []
    for model in models:
        # Gradient-based scores for incoder need the space-insensitive tokenizer.
        incoder = method in gradient_based and model == "incoder"
        # Use a context manager so the pickle file is closed right away.
        with open(f"{prefix}/{method}_{model}.pkl", "rb") as pickle_file:
            attention = pickle.load(pickle_file)
        each_model_stats = []
        for threshold in (5, 10, 20):
            # calculate_new_overall re-sorts its input in place, hence the copy.
            _, raw_metrics = calculate_new_overall(copy.deepcopy(attention), threshold, incoder)
            # raw_metrics is [precision, recall, f1, alpha]; keep f1 and alpha.
            each_model_stats += raw_metrics[2:]
        each_method.append(each_model_stats)
    each_method_raw_average = np.mean(np.array(each_method), axis=0)
    # Columns alternate F1 (percentage) and alpha (two decimals) per threshold.
    each_method_string = [
        f"{value:.1%}" if column % 2 == 0 else f"{value:.2f}"
        for column, value in enumerate(each_method_raw_average[:6])
    ]

    all_methods_models.append(each_method_string)
save_to_csv("comparison.csv", all_methods_models)

# Table 2 Color

In [None]:
# Prints, for every model and k, which method achieves the best value of each
# metric (used to highlight cells in Table 2).
models = ['incoder', 'codegen', 'codeparrot', 'gptj', 'polycoder']
methods = ["bert_perturbation", 
           "shap_perturbation",
           "inputxgradient_reading", 
           "inputxgradient_coding",
           "saliency_reading", 
           "saliency_coding",
           "reading_first", 
           "reading_last", 
           "reading_all", 
           "coding_first", 
           "coding_last", 
           "coding_all"]
prefix = "../experiment_results"

gradient_based = [
                "inputxgradient_reading", 
                "inputxgradient_coding",
                "saliency_reading", 
                "saliency_coding"
                ]

import numpy as np
all_methods_models = []
for model in models:
    for k in [5, 10, 20]:
        precision = []
        recall = []
        f1 = []
        ka = []
        for method in methods:
            # Use a context manager so the pickle file is closed right away.
            with open(f"{prefix}/{method}_{model}.pkl", "rb") as pickle_file:
                attention = pickle.load(pickle_file)
            # Gradient-based scores for incoder need the space-insensitive tokenizer.
            use_incoder = method in gradient_based and model == "incoder"
            # calculate_new_overall re-sorts its input in place, hence the copy.
            _, metrics = calculate_new_overall(copy.deepcopy(attention), k, use_incoder)
            precision.append(metrics[0])
            recall.append(metrics[1])
            f1.append(metrics[2])
            ka.append(metrics[3])

        # Report the first method reaching the maximum of each metric.
        for metric_name, values in (("precision", precision), ("recall", recall), ("f1", f1), ("ka", ka)):
            best_value = max(values)
            best_method = methods[values.index(best_value)]
            print(f"For model {model} at k={k}, max {metric_name}: {best_method}, number is {best_value}")
        print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")