In [1]:
import os
import json

original_dataset_path = "original/" # directory walked (recursively) for the original dataset's json files
cleaned_dataset_path = "cleaned/" # directory walked (recursively) for the cleaned dataset's json files

def get_json_files(path):
    """
    Recursively collect the paths of all ``.json`` files below ``path``.

    Every entry is the directory it was found in joined with the file name,
    listed in the order ``os.walk`` reports them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith(".json")
    ]

# Collect the json file paths found under each dataset directory.
cleaned_json_files = get_json_files(cleaned_dataset_path)
original_json_files = get_json_files(original_dataset_path)
# Last expression of the cell: shows (cleaned file count, original file count).
len(cleaned_json_files), len(original_json_files)

(21, 23)

In [2]:
# Display the json paths discovered under the original dataset directory.
original_json_files

['original/reconstructed_group1_hash_renamed.json',
 'original/reconstructed_group3_helper_as_part_of_groundtruth_original.json',
 'original/reconstructed_group3_helper_as_part_of_groundtruth_usual_renamed.json',
 'original/reconstructed_group3_helper_as_other_candidates_original.json',
 'original/reconstructed_group1_asm_short.json',
 'original/reconstructed_group3_asm_short.json',
 'original/reconstructed_group2_hash_renamed.json',
 'original/reconstructed_group3_asm_long.json',
 'original/reconstructed_group2_wasm.json',
 'original/reconstructed_group2_asm_short.json',
 'original/reconstructed_group2_asm_long.json',
 'original/reconstructed_group2_usual_renamed.json',
 'original/reconstructed_group3_asm_long_saved.json',
 'original/reconstructed_group3_helper_as_part_of_groundtruth_hash_renamed.json',
 'original/reconstructed_group3_asm_long_updated.json',
 'original/reconstructed_group1_asm_long.json',
 'original/reconstructed_group1_usual_renamed.json',
 'original/reconstructed_gr

In [3]:
# The dataset files the statistics in this notebook are computed from.
json_files_used = [
    'original/reconstructed_group1_original.json',
    'original/reconstructed_group1_asm_long.json',
    'original/reconstructed_group1_wasm.json',
    'original/reconstructed_group2_original.json',
    'original/reconstructed_group2_asm_long.json',
    'original/reconstructed_group2_wasm.json',
    'original/reconstructed_group3_helper_as_part_of_groundtruth_original.json',
    'original/reconstructed_group3_asm_long.json',
    'original/reconstructed_group3_helper_as_part_of_groundtruth_wasm.json',
]

# Report every listed file that was not discovered under the original dataset directory.
discovered_originals = set(original_json_files)
for json_file in json_files_used:
    if json_file in discovered_originals:
        continue
    print(f"{json_file} is not in original json files")

#### We will calculate the following statistics for the dataset.

##### General metrics
1. number of pairs

##### For Query
1. Token counts

##### For Code
1. Token counts
2. Line of code
3. Complexity

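Here, we define the token count to be the number of tokens the OASIS tokenizer (`Kwaipilot/OASIS-code-1.5B`) produces for the code/query string after stripping surrounding whitespace, including any special tokens the tokenizer adds.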

For Lines of Code, we strip leading and trailing whitespace from the code and then split the text by '\n'.

We collect these statistics for each group, and also for the entire dataset.

In [4]:
# Accumulates every statistic computed below, split into "general", "query" and "code" sections.
final_result = {"general": {}, "query": {}, "code": {}}

#### General Metrics

##### 1. Number of pairs

In [5]:
def get_num_pairs(json_file):
    """
    Return how many entries the ``'query'`` collection of a loaded dataset holds.

    ``json_file`` is the already-parsed JSON content (it is indexed with
    ``'query'``), not a path on disk.
    """
    queries = json_file['query']
    return len(queries)

In [6]:
group_one_original_path = "original/reconstructed_group1_original.json"
group_two_original_path = "original/reconstructed_group2_original.json"
group_three_original_path = "original/reconstructed_group3_helper_as_part_of_groundtruth_original.json"

# JSON text is UTF-8; the encoding is passed explicitly so that loading does not
# depend on the platform's default locale encoding.
with open(group_one_original_path, 'r', encoding='utf-8') as f:
    group_one_original = json.load(f)

with open(group_two_original_path, 'r', encoding='utf-8') as f:
    group_two_original = json.load(f)

with open(group_three_original_path, 'r', encoding='utf-8') as f:
    group_three_original = json.load(f)

# Number of entries under 'query' in each group's original dataset file.
group_one_pair_count = get_num_pairs(group_one_original)
group_two_pair_count = get_num_pairs(group_two_original)
group_three_pair_count = get_num_pairs(group_three_original)


In [7]:
# Record the per-group pair counts in the "general" section of the results.
final_result['general']['Group 1 original pair count'] = group_one_pair_count
final_result['general']['Group 2 original pair count'] = group_two_pair_count
final_result['general']['Group 3 original pair count'] = group_three_pair_count
# Echo the same three values on one line.
print("Group 1 original pair count: ", group_one_pair_count, "Group 2 original pair count: ", group_two_pair_count, "Group 3 original pair count: ", group_three_pair_count)

Group 1 original pair count:  526 Group 2 original pair count:  469 Group 3 original pair count:  44


#### Query Metrics

##### 1. Token counts

In [8]:
# Use the tokenizer of OASIS
# NOTE(review): AutoModel is imported here but not referenced in the visible cells.
from transformers import AutoModel, AutoTokenizer
# Module-level tokenizer object that the token-counting helpers call.
tokenizer = AutoTokenizer.from_pretrained("Kwaipilot/OASIS-code-1.5B")

In [9]:
# Count the number of tokens in the given query.

def count_query_tokens(query):
    """
    Count the number of tokens in the given query.

    The query is stripped of surrounding whitespace and encoded with the
    module-level ``tokenizer``. Special tokens added by the tokenizer are
    included in the count, and truncation is disabled.

    Args:
        query: The query text.

    Returns:
        The length of the encoded ``input_ids`` sequence.
    """
    query = query.strip()
    # Only the length is needed, so the ids are requested as a plain Python list
    # instead of a tensor; this avoids building a tensor (and needing torch)
    # just to measure it. ``max_length`` is kept from the original call; with
    # truncation disabled it is not used to shorten the input.
    encoded = tokenizer(query, add_special_tokens=True, truncation=False, max_length=1024 * 1024 * 128)
    return len(encoded['input_ids'])

In [10]:
group_one_original_path = "original/reconstructed_group1_original.json"
group_two_original_path = "original/reconstructed_group2_original.json"
group_three_original_path = "original/reconstructed_group3_helper_as_part_of_groundtruth_original.json"

# JSON text is UTF-8; the encoding is passed explicitly so that loading does not
# depend on the platform's default locale encoding.
with open(group_one_original_path, 'r', encoding='utf-8') as f:
    group_one_original = json.load(f)
group_one_query = group_one_original['query']

with open(group_two_original_path, 'r', encoding='utf-8') as f:
    group_two_original = json.load(f)
group_two_query = group_two_original['query']

with open(group_three_original_path, 'r', encoding='utf-8') as f:
    group_three_original = json.load(f)
group_three_query = group_three_original['query']

# Token count of every query, per group. Comprehensions keep the loop variable
# local, so the builtin `id` is no longer shadowed at notebook level.
group_one_query_tokens = [count_query_tokens(query_text) for query_text in group_one_query.values()]
group_two_query_tokens = [count_query_tokens(query_text) for query_text in group_two_query.values()]
group_three_query_tokens = [count_query_tokens(query_text) for query_text in group_three_query.values()]

# calculate the average number of tokens in the query (formatted to one decimal place)
group_one_query_tokens_count = "{:.1f}".format(sum(group_one_query_tokens) / len(group_one_query_tokens))
group_two_query_tokens_count = "{:.1f}".format(sum(group_two_query_tokens) / len(group_two_query_tokens))
group_three_query_tokens_count = "{:.1f}".format(sum(group_three_query_tokens) / len(group_three_query_tokens))

In [11]:
# Record the per-group average query token counts (already formatted as strings).
final_result['query']['Group 1 average query tokens'] = group_one_query_tokens_count
final_result['query']['Group 2 average query tokens'] = group_two_query_tokens_count
final_result['query']['Group 3 average query tokens'] = group_three_query_tokens_count

# Echo the same three values on one line.
print("Group 1 average query tokens: ", group_one_query_tokens_count, "Group 2 average query tokens: ", group_two_query_tokens_count, "Group 3 average query tokens: ", group_three_query_tokens_count)

Group 1 average query tokens:  88.3 Group 2 average query tokens:  84.7 Group 3 average query tokens:  88.2


In [12]:
# Average query token count over all three groups combined:
# total tokens across the groups divided by the total number of queries.
query_token_groups = (group_one_query_tokens, group_two_query_tokens, group_three_query_tokens)
query_token_total = sum(sum(group) for group in query_token_groups)
query_total = sum(len(group) for group in query_token_groups)
group_entire_query_token_count = "{:.1f}".format(query_token_total / query_total)

final_result['query']['Entire Group average query token count'] = group_entire_query_token_count
print("Entire Group average query token count: ", group_entire_query_token_count)

Entire Group average query token count:  86.7


#### Code Metrics

##### 1. Token counts

In [13]:
# Count the number of tokens in the given code.
# Use the tokenizer of OASIS
# NOTE(review): this repeats the import and the tokenizer load of the earlier
# query-token cell with the same checkpoint name, rebinding `tokenizer`.
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Kwaipilot/OASIS-code-1.5B")

def count_code_tokens(code):
    """
    Count the number of tokens in the given code.

    The code is stripped of surrounding whitespace and encoded with the
    module-level ``tokenizer``. Special tokens added by the tokenizer are
    included in the count, and truncation is disabled.

    Args:
        code: The code text.

    Returns:
        The length of the encoded ``input_ids`` sequence.
    """
    code = code.strip()
    # Only the length is needed, so the ids are requested as a plain Python list
    # instead of a tensor; this avoids building a tensor (and needing torch)
    # just to measure it. ``max_length`` is kept from the original call; with
    # truncation disabled it is not used to shorten the input.
    encoded = tokenizer(code, add_special_tokens=True, truncation=False, max_length=1024 * 1024 * 128)
    return len(encoded['input_ids'])

In [14]:

group_one_original_path = "original/reconstructed_group1_original.json"
group_one_asm_long_path = "original/reconstructed_group1_asm_long.json"
group_one_wasm_path = "original/reconstructed_group1_wasm.json"
group_two_original_path = "original/reconstructed_group2_original.json"
group_two_asm_long_path = "original/reconstructed_group2_asm_long.json"
group_two_wasm_path = "original/reconstructed_group2_wasm.json"
group_three_original_path = "original/reconstructed_group3_helper_as_part_of_groundtruth_original.json"
group_three_asm_long_path = "original/reconstructed_group3_asm_long.json"
group_three_wasm_path = "original/reconstructed_group3_helper_as_part_of_groundtruth_wasm.json"

# make sure all of the json files are in the json files used
assert group_one_original_path in json_files_used and group_one_asm_long_path in json_files_used and group_one_wasm_path in json_files_used
assert group_two_original_path in json_files_used and group_two_asm_long_path in json_files_used and group_two_wasm_path in json_files_used
assert group_three_original_path in json_files_used and group_three_asm_long_path in json_files_used and group_three_wasm_path in json_files_used


def _corpus_token_counts(corpus, id_must_contain=None):
    """
    Return the token count of every entry of ``corpus`` (a mapping of id -> code).

    When ``id_must_contain`` is given, entries whose id does not contain that
    substring are skipped.
    """
    return [
        count_code_tokens(code_text)
        for entry_id, code_text in corpus.items()
        if id_must_contain is None or id_must_contain in entry_id
    ]


# Each file is read with an explicit UTF-8 encoding (JSON text is UTF-8) and is
# closed again before the tokenization work starts.
with open(group_one_original_path, 'r', encoding='utf-8') as f:
    group_one_original = json.load(f)
group_one_corpus = group_one_original['corpus']
group_one_corpus_tokens = _corpus_token_counts(group_one_corpus)

with open(group_one_asm_long_path, 'r', encoding='utf-8') as f:
    group_one_asm_long = json.load(f)
group_one_asm_long_corpus = group_one_asm_long['corpus']
group_one_asm_long_corpus_tokens = _corpus_token_counts(group_one_asm_long_corpus)

with open(group_one_wasm_path, 'r', encoding='utf-8') as f:
    group_one_wasm = json.load(f)
group_one_wasm_corpus = group_one_wasm['corpus']
group_one_wasm_corpus_tokens = _corpus_token_counts(group_one_wasm_corpus)

with open(group_two_original_path, 'r', encoding='utf-8') as f:
    group_two_original = json.load(f)
group_two_corpus = group_two_original['corpus']
group_two_corpus_tokens = _corpus_token_counts(group_two_corpus)

with open(group_two_asm_long_path, 'r', encoding='utf-8') as f:
    group_two_asm_long = json.load(f)
group_two_asm_long_corpus = group_two_asm_long['corpus']
group_two_asm_long_corpus_tokens = _corpus_token_counts(group_two_asm_long_corpus)

with open(group_two_wasm_path, 'r', encoding='utf-8') as f:
    group_two_wasm = json.load(f)
group_two_wasm_corpus = group_two_wasm['corpus']
group_two_wasm_corpus_tokens = _corpus_token_counts(group_two_wasm_corpus)

# For group 3 only the corpus entries whose id contains "c_group_3_id" are counted.
with open(group_three_original_path, 'r', encoding='utf-8') as f:
    group_three_original = json.load(f)
group_three_corpus = group_three_original['corpus']
group_three_corpus_tokens = _corpus_token_counts(group_three_corpus, "c_group_3_id")

with open(group_three_asm_long_path, 'r', encoding='utf-8') as f:
    group_three_asm_long = json.load(f)
group_three_asm_long_corpus = group_three_asm_long['corpus']
group_three_asm_long_corpus_tokens = _corpus_token_counts(group_three_asm_long_corpus, "c_group_3_id")

with open(group_three_wasm_path, 'r', encoding='utf-8') as f:
    group_three_wasm = json.load(f)
group_three_wasm_corpus = group_three_wasm['corpus']
group_three_wasm_corpus_tokens = _corpus_token_counts(group_three_wasm_corpus, "c_group_3_id")

# Per-file averages, formatted to one decimal place.
group_one_original_corpus_tokens_count = "{:.1f}".format(sum(group_one_corpus_tokens) / len(group_one_corpus_tokens))
group_one_asm_long_corpus_tokens_count = "{:.1f}".format(sum(group_one_asm_long_corpus_tokens) / len(group_one_asm_long_corpus_tokens))
group_one_wasm_corpus_tokens_count = "{:.1f}".format(sum(group_one_wasm_corpus_tokens) / len(group_one_wasm_corpus_tokens))
group_two_original_corpus_tokens_count = "{:.1f}".format(sum(group_two_corpus_tokens) / len(group_two_corpus_tokens))
group_two_asm_long_corpus_tokens_count = "{:.1f}".format(sum(group_two_asm_long_corpus_tokens) / len(group_two_asm_long_corpus_tokens))
group_two_wasm_corpus_tokens_count = "{:.1f}".format(sum(group_two_wasm_corpus_tokens) / len(group_two_wasm_corpus_tokens))
group_three_original_corpus_tokens_count = "{:.1f}".format(sum(group_three_corpus_tokens) / len(group_three_corpus_tokens))
group_three_asm_long_corpus_tokens_count = "{:.1f}".format(sum(group_three_asm_long_corpus_tokens) / len(group_three_asm_long_corpus_tokens))
group_three_wasm_corpus_tokens_count = "{:.1f}".format(sum(group_three_wasm_corpus_tokens) / len(group_three_wasm_corpus_tokens))

final_result['code']['Group 1 original average code tokens'] = group_one_original_corpus_tokens_count
final_result['code']['Group 1 asm long average code tokens'] = group_one_asm_long_corpus_tokens_count
final_result['code']['Group 1 wasm average code tokens'] = group_one_wasm_corpus_tokens_count
final_result['code']['Group 2 original average code tokens'] = group_two_original_corpus_tokens_count
final_result['code']['Group 2 asm long average code tokens'] = group_two_asm_long_corpus_tokens_count
final_result['code']['Group 2 wasm average code tokens'] = group_two_wasm_corpus_tokens_count
final_result['code']['Group 3 original average code tokens'] = group_three_original_corpus_tokens_count
final_result['code']['Group 3 asm long average code tokens'] = group_three_asm_long_corpus_tokens_count
final_result['code']['Group 3 wasm average code tokens'] = group_three_wasm_corpus_tokens_count
print("Group 1 original average code tokens: ", group_one_original_corpus_tokens_count, "Group 1 asm long average code tokens: ", group_one_asm_long_corpus_tokens_count, "Group 1 wasm average code tokens: ", group_one_wasm_corpus_tokens_count)
print("Group 2 original average code tokens: ", group_two_original_corpus_tokens_count, "Group 2 asm long average code tokens: ", group_two_asm_long_corpus_tokens_count, "Group 2 wasm average code tokens: ", group_two_wasm_corpus_tokens_count)
print("Group 3 original average code tokens: ", group_three_original_corpus_tokens_count, "Group 3 asm long average code tokens: ", group_three_asm_long_corpus_tokens_count, "Group 3 wasm average code tokens: ", group_three_wasm_corpus_tokens_count)


Group 1 original average code tokens:  119.2 Group 1 asm long average code tokens:  753.7 Group 1 wasm average code tokens:  665.5
Group 2 original average code tokens:  137.7 Group 2 asm long average code tokens:  831.3 Group 2 wasm average code tokens:  947.1
Group 3 original average code tokens:  163.3 Group 3 asm long average code tokens:  926.9 Group 3 wasm average code tokens:  672.6


In [15]:
# Average code token count over all three groups combined, once per code
# representation: total tokens across the groups divided by the total number of entries.
original_token_groups = (group_one_corpus_tokens, group_two_corpus_tokens, group_three_corpus_tokens)
asm_long_token_groups = (group_one_asm_long_corpus_tokens, group_two_asm_long_corpus_tokens, group_three_asm_long_corpus_tokens)
wasm_token_groups = (group_one_wasm_corpus_tokens, group_two_wasm_corpus_tokens, group_three_wasm_corpus_tokens)

original_entire_corpus_tokens_count = "{:.1f}".format(
    sum(sum(group) for group in original_token_groups) / sum(len(group) for group in original_token_groups)
)
asm_long_entire_corpus_tokens_count = "{:.1f}".format(
    sum(sum(group) for group in asm_long_token_groups) / sum(len(group) for group in asm_long_token_groups)
)
wasm_entire_corpus_tokens_count = "{:.1f}".format(
    sum(sum(group) for group in wasm_token_groups) / sum(len(group) for group in wasm_token_groups)
)

final_result['code']['Entire Group average original code tokens'] = original_entire_corpus_tokens_count
final_result['code']['Entire Group average asm long code tokens'] = asm_long_entire_corpus_tokens_count
final_result['code']['Entire Group average wasm code tokens'] = wasm_entire_corpus_tokens_count
print("Entire Group average original code tokens: ", original_entire_corpus_tokens_count, "Entire Group average asm long code tokens: ", asm_long_entire_corpus_tokens_count, "Entire Group average wasm code tokens: ", wasm_entire_corpus_tokens_count)

Entire Group average original code tokens:  129.4 Entire Group average asm long code tokens:  795.5 Entire Group average wasm code tokens:  787.1


In [27]:

# Number of corpus entries counted for each compiled representation, and how
# many fewer that is than the total number of original pairs.
asm_total_points = len(group_one_asm_long_corpus_tokens) + len(group_two_asm_long_corpus_tokens) + len(group_three_asm_long_corpus_tokens)
wasm_total_points = len(group_one_wasm_corpus_tokens) + len(group_two_wasm_corpus_tokens) + len(group_three_wasm_corpus_tokens)
original_pair_total = group_one_pair_count + group_two_pair_count + group_three_pair_count

# The labels previously read "totoal"; the typo is fixed here.
print("Asm total points: ", asm_total_points)
print("Wasm total points: ", wasm_total_points)
print("Asm discard: ", original_pair_total - asm_total_points)
print("Wasm discard: ", original_pair_total - wasm_total_points)

Asm totoal points:  1015
Wasm totoal points:  940
Asm discard:  24
Wasm discard:  99


##### 2. Line of Code

In [16]:
def get_line_count(code):
    """
    Get the number of lines in the given code.

    Leading and trailing whitespace (including blank lines) is stripped first;
    the remaining text is then counted by its newline separators, so blank
    lines in the middle of the code still count.

    Args:
        code: The code text.

    Returns:
        0 if nothing but whitespace was given, otherwise the number of
        newline characters left after stripping, plus one.
    """
    code = code.strip()
    # An empty snippet has no lines; splitting '' would otherwise report one.
    if not code:
        return 0
    return code.count('\n') + 1

In [17]:
group_one_original_path = "original/reconstructed_group1_original.json"
group_one_asm_long_path = "original/reconstructed_group1_asm_long.json"
group_one_wasm_path = "original/reconstructed_group1_wasm.json"
group_two_original_path = "original/reconstructed_group2_original.json"
group_two_asm_long_path = "original/reconstructed_group2_asm_long.json"
group_two_wasm_path = "original/reconstructed_group2_wasm.json"
group_three_original_path = "original/reconstructed_group3_helper_as_part_of_groundtruth_original.json"
group_three_asm_long_path = "original/reconstructed_group3_asm_long.json"
group_three_wasm_path = "original/reconstructed_group3_helper_as_part_of_groundtruth_wasm.json"

# make sure all of the json files are in the json files used
assert group_one_original_path in json_files_used and group_one_asm_long_path in json_files_used and group_one_wasm_path in json_files_used
assert group_two_original_path in json_files_used and group_two_asm_long_path in json_files_used and group_two_wasm_path in json_files_used
assert group_three_original_path in json_files_used and group_three_asm_long_path in json_files_used and group_three_wasm_path in json_files_used


def _corpus_line_counts(corpus, id_must_contain=None):
    """
    Return the line count of every entry of ``corpus`` (a mapping of id -> code).

    When ``id_must_contain`` is given, entries whose id does not contain that
    substring are skipped.
    """
    return [
        get_line_count(code_text)
        for entry_id, code_text in corpus.items()
        if id_must_contain is None or id_must_contain in entry_id
    ]


# Each file is read with an explicit UTF-8 encoding (JSON text is UTF-8) and is
# closed again before the counting starts.
with open(group_one_original_path, 'r', encoding='utf-8') as f:
    group_one_original = json.load(f)
group_one_corpus = group_one_original['corpus']
group_one_corpus_line_count = _corpus_line_counts(group_one_corpus)

with open(group_one_asm_long_path, 'r', encoding='utf-8') as f:
    group_one_asm_long = json.load(f)
group_one_asm_long_corpus = group_one_asm_long['corpus']
group_one_asm_long_corpus_line_count = _corpus_line_counts(group_one_asm_long_corpus)

with open(group_one_wasm_path, 'r', encoding='utf-8') as f:
    group_one_wasm = json.load(f)
group_one_wasm_corpus = group_one_wasm['corpus']
group_one_wasm_corpus_line_count = _corpus_line_counts(group_one_wasm_corpus)

with open(group_two_original_path, 'r', encoding='utf-8') as f:
    group_two_original = json.load(f)
group_two_corpus = group_two_original['corpus']
group_two_corpus_line_count = _corpus_line_counts(group_two_corpus)

with open(group_two_asm_long_path, 'r', encoding='utf-8') as f:
    group_two_asm_long = json.load(f)
group_two_asm_long_corpus = group_two_asm_long['corpus']
group_two_asm_long_corpus_line_count = _corpus_line_counts(group_two_asm_long_corpus)

with open(group_two_wasm_path, 'r', encoding='utf-8') as f:
    group_two_wasm = json.load(f)
group_two_wasm_corpus = group_two_wasm['corpus']
group_two_wasm_corpus_line_count = _corpus_line_counts(group_two_wasm_corpus)

# For group 3 only the corpus entries whose id contains "c_group_3_id" are counted.
with open(group_three_original_path, 'r', encoding='utf-8') as f:
    group_three_original = json.load(f)
group_three_corpus = group_three_original['corpus']
group_three_corpus_line_count = _corpus_line_counts(group_three_corpus, "c_group_3_id")

with open(group_three_asm_long_path, 'r', encoding='utf-8') as f:
    group_three_asm_long = json.load(f)
group_three_asm_long_corpus = group_three_asm_long['corpus']
group_three_asm_long_corpus_line_count = _corpus_line_counts(group_three_asm_long_corpus, "c_group_3_id")

with open(group_three_wasm_path, 'r', encoding='utf-8') as f:
    group_three_wasm = json.load(f)
group_three_wasm_corpus = group_three_wasm['corpus']
group_three_wasm_corpus_line_count = _corpus_line_counts(group_three_wasm_corpus, "c_group_3_id")

# Per-file averages, formatted to one decimal place.
group_one_original_corpus_line_count_avg = "{:.1f}".format(sum(group_one_corpus_line_count) / len(group_one_corpus_line_count))
group_one_asm_long_corpus_line_count_avg = "{:.1f}".format(sum(group_one_asm_long_corpus_line_count) / len(group_one_asm_long_corpus_line_count))
group_one_wasm_corpus_line_count_avg = "{:.1f}".format(sum(group_one_wasm_corpus_line_count) / len(group_one_wasm_corpus_line_count))
group_two_original_corpus_line_count_avg = "{:.1f}".format(sum(group_two_corpus_line_count) / len(group_two_corpus_line_count))
group_two_asm_long_corpus_line_count_avg = "{:.1f}".format(sum(group_two_asm_long_corpus_line_count) / len(group_two_asm_long_corpus_line_count))
group_two_wasm_corpus_line_count_avg = "{:.1f}".format(sum(group_two_wasm_corpus_line_count) / len(group_two_wasm_corpus_line_count))
group_three_original_corpus_line_count_avg = "{:.1f}".format(sum(group_three_corpus_line_count) / len(group_three_corpus_line_count))
group_three_asm_long_corpus_line_count_avg = "{:.1f}".format(sum(group_three_asm_long_corpus_line_count) / len(group_three_asm_long_corpus_line_count))
group_three_wasm_corpus_line_count_avg = "{:.1f}".format(sum(group_three_wasm_corpus_line_count) / len(group_three_wasm_corpus_line_count))
final_result['code']['Group 1 original average code line count'] = group_one_original_corpus_line_count_avg
final_result['code']['Group 1 asm long average code line count'] = group_one_asm_long_corpus_line_count_avg
final_result['code']['Group 1 wasm average code line count'] = group_one_wasm_corpus_line_count_avg
final_result['code']['Group 2 original average code line count'] = group_two_original_corpus_line_count_avg
final_result['code']['Group 2 asm long average code line count'] = group_two_asm_long_corpus_line_count_avg
final_result['code']['Group 2 wasm average code line count'] = group_two_wasm_corpus_line_count_avg
final_result['code']['Group 3 original average code line count'] = group_three_original_corpus_line_count_avg
final_result['code']['Group 3 asm long average code line count'] = group_three_asm_long_corpus_line_count_avg
final_result['code']['Group 3 wasm average code line count'] = group_three_wasm_corpus_line_count_avg

print("Group 1 original average code line count: ", group_one_original_corpus_line_count_avg, "Group 1 asm long average code line count: ", group_one_asm_long_corpus_line_count_avg, "Group 1 wasm average code line count: ", group_one_wasm_corpus_line_count_avg)

print("Group 2 original average code line count: ", group_two_original_corpus_line_count_avg, "Group 2 asm long average code line count: ", group_two_asm_long_corpus_line_count_avg, "Group 2 wasm average code line count: ", group_two_wasm_corpus_line_count_avg)

print("Group 3 original average code line count: ", group_three_original_corpus_line_count_avg, "Group 3 asm long average code line count: ", group_three_asm_long_corpus_line_count_avg, "Group 3 wasm average code line count: ", group_three_wasm_corpus_line_count_avg)

Group 1 original average code line count:  12.8 Group 1 asm long average code line count:  80.7 Group 1 wasm average code line count:  96.2
Group 2 original average code line count:  13.3 Group 2 asm long average code line count:  84.4 Group 2 wasm average code line count:  134.4
Group 3 original average code line count:  22.9 Group 3 asm long average code line count:  96.4 Group 3 wasm average code line count:  97.6


In [18]:
# Average number of code lines over all three groups combined, once per code
# representation: total lines across the groups divided by the total number of entries.
original_line_groups = (group_one_corpus_line_count, group_two_corpus_line_count, group_three_corpus_line_count)
asm_long_line_groups = (group_one_asm_long_corpus_line_count, group_two_asm_long_corpus_line_count, group_three_asm_long_corpus_line_count)
wasm_line_groups = (group_one_wasm_corpus_line_count, group_two_wasm_corpus_line_count, group_three_wasm_corpus_line_count)

original_entire_corpus_line_count_avg = "{:.1f}".format(
    sum(sum(group) for group in original_line_groups) / sum(len(group) for group in original_line_groups)
)
asm_long_entire_corpus_line_count_avg = "{:.1f}".format(
    sum(sum(group) for group in asm_long_line_groups) / sum(len(group) for group in asm_long_line_groups)
)
wasm_entire_corpus_line_count_avg = "{:.1f}".format(
    sum(sum(group) for group in wasm_line_groups) / sum(len(group) for group in wasm_line_groups)
)

final_result['code']['Entire Group average original code line count'] = original_entire_corpus_line_count_avg
final_result['code']['Entire Group average asm long code line count'] = asm_long_entire_corpus_line_count_avg
final_result['code']['Entire Group average wasm code line count'] = wasm_entire_corpus_line_count_avg

print("Entire Group average original code line count: ", original_entire_corpus_line_count_avg, "Entire Group average asm long code line count: ", asm_long_entire_corpus_line_count_avg, "Entire Group average wasm code line count: ", wasm_entire_corpus_line_count_avg)

Entire Group average original code line count:  13.5 Entire Group average asm long code line count:  83.0 Entire Group average wasm code line count:  112.7


##### 3. Code Complexity

In [19]:
from calculate_complexity import cyclomatic_complexity

In [20]:
# calculate the cyclomatic complexity of the code. Only for the original code.
import json
group_one_original_path = "original/reconstructed_group1_original.json"
group_two_original_path = "original/reconstructed_group2_original.json"
group_three_original_path = "original/reconstructed_group3_helper_as_part_of_groundtruth_original.json"


def _corpus_complexities(corpus, id_must_contain=None):
    """
    Return ``cyclomatic_complexity`` of every entry of ``corpus`` (a mapping of id -> code).

    When ``id_must_contain`` is given, entries whose id does not contain that
    substring are skipped.
    """
    return [
        cyclomatic_complexity(code_text)
        for entry_id, code_text in corpus.items()
        if id_must_contain is None or id_must_contain in entry_id
    ]


# Each file is read with an explicit UTF-8 encoding (JSON text is UTF-8) and is
# closed again before the complexity computation starts.
with open(group_one_original_path, 'r', encoding='utf-8') as f:
    group_one_original = json.load(f)
group_one_corpus = group_one_original['corpus']
group_one_corpus_complexity = _corpus_complexities(group_one_corpus)

with open(group_two_original_path, 'r', encoding='utf-8') as f:
    group_two_original = json.load(f)
group_two_corpus = group_two_original['corpus']
group_two_corpus_complexity = _corpus_complexities(group_two_corpus)

# For group 3 only the corpus entries whose id contains "c_group_3_id" are measured.
with open(group_three_original_path, 'r', encoding='utf-8') as f:
    group_three_original = json.load(f)
group_three_corpus = group_three_original['corpus']
group_three_corpus_complexity = _corpus_complexities(group_three_corpus, "c_group_3_id")

# Per-group averages, formatted to one decimal place.
group_one_original_corpus_complexity_avg = "{:.1f}".format(sum(group_one_corpus_complexity) / len(group_one_corpus_complexity))
group_two_original_corpus_complexity_avg = "{:.1f}".format(sum(group_two_corpus_complexity) / len(group_two_corpus_complexity))
group_three_original_corpus_complexity_avg = "{:.1f}".format(sum(group_three_corpus_complexity) / len(group_three_corpus_complexity))

final_result['code']['Group 1 original average code cyclomatic complexity'] = group_one_original_corpus_complexity_avg
final_result['code']['Group 2 original average code cyclomatic complexity'] = group_two_original_corpus_complexity_avg
final_result['code']['Group 3 original average code cyclomatic complexity'] = group_three_original_corpus_complexity_avg


print("Group 1 original average code cyclomatic complexity: ", group_one_original_corpus_complexity_avg, "Group 2 original average code cyclomatic complexity: ", group_two_original_corpus_complexity_avg, "Group 3 original average code cyclomatic complexity: ", group_three_original_corpus_complexity_avg)

Group 1 original average code cyclomatic complexity:  2.9 Group 2 original average code cyclomatic complexity:  2.8 Group 3 original average code cyclomatic complexity:  4.3


In [21]:
# Average cyclomatic complexity of the original code over all three groups combined:
# summed complexity across the groups divided by the total number of entries.
complexity_groups = (group_one_corpus_complexity, group_two_corpus_complexity, group_three_corpus_complexity)
complexity_total = sum(sum(group) for group in complexity_groups)
complexity_entries = sum(len(group) for group in complexity_groups)

original_entire_corpus_complexity_avg = "{:.1f}".format(complexity_total / complexity_entries)
final_result['code']['Entire Group average original code cyclomatic complexity'] = original_entire_corpus_complexity_avg
print("Entire Group average original code cyclomatic complexity: ", original_entire_corpus_complexity_avg)

Entire Group average original code cyclomatic complexity:  2.9


In [22]:
# Display every statistic collected in the cells above.
final_result

{'general': {'Group 1 original pair count': 526,
  'Group 2 original pair count': 469,
  'Group 3 original pair count': 44},
 'query': {'Group 1 average query tokens': '88.3',
  'Group 2 average query tokens': '84.7',
  'Group 3 average query tokens': '88.2',
  'Entire Group average query token count': '86.7'},
 'code': {'Group 1 original average code tokens': '119.2',
  'Group 1 asm long average code tokens': '753.7',
  'Group 1 wasm average code tokens': '665.5',
  'Group 2 original average code tokens': '137.7',
  'Group 2 asm long average code tokens': '831.3',
  'Group 2 wasm average code tokens': '947.1',
  'Group 3 original average code tokens': '163.3',
  'Group 3 asm long average code tokens': '926.9',
  'Group 3 wasm average code tokens': '672.6',
  'Entire Group average original code tokens': '129.4',
  'Entire Group average asm long code tokens': '795.5',
  'Entire Group average wasm code tokens': '787.1',
  'Group 1 original average code line count': '12.8',
  'Group 1 asm

In [25]:
# total number of pairs (sum of the three per-group original pair counts)
total_pairs = group_one_pair_count + group_two_pair_count + group_three_pair_count
print("Total number of pairs: ", total_pairs)

Total number of pairs:  1039
