In [1]:
import os
from collections import Counter
from tqdm import tqdm


# Root folder of the corpus; every file below it is collected recursively.
data_dir = "/media/aziz/Data/Aziz/data/gans_for_apr/"

# os.path.join avoids the doubled separator that manual '/' concatenation
# produces when the walked root already ends in '/'. os.walk yields entries in
# arbitrary (filesystem) order, so the listing is sorted to make it reproducible
# between runs and machines.
# NOTE(review): later cells pair buggy/fixed files by position — this assumes
# both sides use mirrored names so that sorting keeps them aligned; confirm.
file_paths = sorted(
    os.path.join(root, name)
    for root, _dirs, files in os.walk(data_dir)
    for name in files
)

In [2]:
def _read_all(paths):
    """Return the UTF-8 decoded content of every file in `paths`, in the same order."""
    contents = []
    for file_path in tqdm(paths):
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents.append(handle.read())
    return contents


# A file belongs to a side when the marker substring occurs anywhere in its full path.
buggy_paths = [candidate for candidate in file_paths if 'buggy' in candidate]
fixed_paths = [candidate for candidate in file_paths if 'fixed' in candidate]

# Load the buggy side first, then the fixed side (one progress bar each).
buggy_texts = _read_all(buggy_paths)
fixed_texts = _read_all(fixed_paths)

100%|██████████| 20826/20826 [00:00<00:00, 60537.45it/s]
100%|██████████| 20826/20826 [00:00<00:00, 59962.68it/s]


In [34]:
# Show sample #1000 from both sides, separated by a ruler line.
print(buggy_texts[1000], '=======================', fixed_texts[1000], sep='\n')

meld
git diff

 public void resetUid() {
        uidOnlySchema = null;
    }

 public void resetUid() {
        uidOnlySchema = null;
    }



In [4]:
import re


# A single left-to-right scan that recognises literals as well as comments.
# Matching literals first means comment markers that appear inside a string or
# char literal (e.g. "http://host/*") are kept instead of being cut out.
# Raw strings are used so that sequences such as \* reach the regex engine
# unchanged rather than being treated as (invalid) Python string escapes.
_JAVA_COMMENT_OR_LITERAL = re.compile(
    r'"(?:\\.|[^"\\\n])*"'      # double-quoted string literal
    r"|'(?:\\.|[^'\\\n])*'"     # single-quoted char literal
    r'|/\*.*?\*/'               # block comment, may span several lines
    r'|//[^\n]*',               # line comment, up to (not including) the newline
    re.DOTALL,
)


def strip_java_comments(source):
    """Return `source` with `/* ... */` and `// ...` comments removed.

    String and char literals are copied through untouched, so comment markers
    inside them are preserved. Newlines that terminate line comments are kept.
    """
    return _JAVA_COMMENT_OR_LITERAL.sub(
        # Every comment alternative starts with '/', every literal with a quote.
        lambda match: '' if match.group(0).startswith('/') else match.group(0),
        source,
    )


# zip() would silently drop the surplus items and leave the pairs misaligned.
if len(buggy_texts) != len(fixed_texts):
    raise ValueError(
        "buggy/fixed corpora differ in size: %d vs %d"
        % (len(buggy_texts), len(fixed_texts))
    )

buggy_codes, fixed_codes = [], []
# zip() has no length, so the total is passed explicitly to get a real progress bar.
for buggy, fixed in tqdm(zip(buggy_texts, fixed_texts), total=len(buggy_texts)):
    buggy_codes.append(strip_java_comments(buggy))
    fixed_codes.append(strip_java_comments(fixed))

20826it [00:00, 57207.34it/s]


In [5]:
def write_java_parser_ready(codes, output_dir, file_name):
    """Wrap code snippets in a single Java class and write it to disk.

    The output file is `<output_dir>/<file_name>.java`. It contains a class
    named `file_name` whose body is every item of `codes` (converted with
    str()), each preceded by a dummy JavaDoc block.

    Args:
        codes: iterable of code snippets to place inside the class body.
        output_dir: target folder; created if missing. An empty string means
            the current working directory.
        file_name: class name, also used as the file name without extension.
    """
    if output_dir:
        # Create the folder on demand so a fresh checkout does not fail on open().
        os.makedirs(output_dir, exist_ok=True)
    # os.path.join works whether or not output_dir ends with a separator.
    target = os.path.join(output_dir, file_name + '.java')
    with open(target, "w", encoding='utf-8') as f:
        f.write("/**\n * Dummy JavaDoc\n */\npublic class "+file_name+" {\n\n")
        for code in tqdm(codes):
            # NOTE(review): the dummy JavaDoc is presumably required by the
            # downstream Java parser — confirm.
            f.write("/**\n * Dummy JavaDoc\n */\n"+str(code)+ "\n")
        f.write("}\n")

In [6]:
# Export the first 10 snippets of each side into its own parser-ready Java file.
for snippets, target_dir, class_name in (
    (buggy_codes, 'buggy_codes/', 'BuggyCodes'),
    (fixed_codes, 'fixed_codes/', 'FixedCodes'),
):
    write_java_parser_ready(snippets[:10], target_dir, class_name)

100%|██████████| 10/10 [00:00<00:00, 18501.56it/s]
100%|██████████| 10/10 [00:00<00:00, 23198.58it/s]


In [28]:
# Sanity check: how many snippets the 10-item preview slice actually holds.
preview = buggy_codes[:10]
len(preview)

10

In [4]:
import json
import re
import sys


# Adjacent token pairs that mark a code fragment as defective.
_DEFECTIVE_TOKEN_PAIRS = frozenset([
    ('<', '<'),
    ('>', '>'),
    ('Enumeration', 'enum'),
    ('\\', '\ufffd'),  # backslash followed by U+FFFD (bytes EF BF BD in UTF-8)
])


def _has_defective_token_pair(code):
    """Return True if two consecutive whitespace-separated tokens form a defective pair."""
    tokens = code.split()
    # zip() stops at the shorter sequence, so the last token is never compared
    # with a non-existent successor (indexing tokens[i+1] would raise IndexError).
    return any(pair in _DEFECTIVE_TOKEN_PAIRS for pair in zip(tokens, tokens[1:]))


def _strip_comment_markers(line):
    """Remove leading '/' and '*' characters and trailing '*' and '/' characters."""
    # str.lstrip/rstrip take a *set* of characters, not a literal prefix/suffix,
    # so one call per side covers '//', '/**', '/*', '*' and '*/'.
    return line.lstrip("/*").rstrip("*/")


def extract_codes_and_comments(data):
    """Split alternating comment/code entries into cleaned parallel lists.

    `data` is read as a flat sequence in which even positions hold comment
    entries and the following odd positions hold the matching code tokens.
    Only entries whose comment part has exactly one element are kept; a
    trailing comment entry without a code partner is skipped. Pairs whose
    code contains a defective adjacent token pair are dropped.

    Args:
        data: sequence of alternating comment entries and code-token lists.

    Returns:
        A tuple `(clean_codes, clean_coms, coms_multiline)`:
        clean_codes    -- code fragments, tokens joined by single spaces;
        clean_coms     -- comments without comment markers, collapsed to one line;
        coms_multiline -- the same comments with their line breaks preserved.
        The three lists have equal length and are index-aligned.
    """
    # Deal with codes
    print("Extracting comments and code fragments...")
    codes, comments = [], []
    # Stop one short of the end so an unpaired trailing entry cannot be indexed.
    for i in range(0, len(data) - 1, 2):
        if len(data[i]) == 1:  # If not a multiple comment
            comments.append("".join(data[i][0]))
            codes.append(" ".join(data[i + 1]))

    print("Ignoring defected code's pairs...")
    clean_codes, coms_v1 = [], []
    for code, comment in zip(codes, comments):
        if not _has_defective_token_pair(code):
            clean_codes.append(code)
            coms_v1.append(comment)

    # Deal with comments
    print("Removing whitespaces...")
    # Collapse runs of blanks to single spaces while keeping line breaks.
    coms_v2 = [
        " ".join(re.findall(r'\S+|\n', comment)).replace(' \n ', '\n')
        for comment in coms_v1
    ]

    print("Removing commenting characters...")
    coms_multiline = [
        " ".join(_strip_comment_markers(line) + "\n" for line in comment.split("\n"))
        for comment in coms_v2
    ]

    print("Making every comment a one-liner...")
    clean_coms = [" ".join(comment.split()) for comment in coms_multiline]

    return clean_codes, clean_coms, coms_multiline


def write_to_file(path, file_name, a_list):
    """Write every item of `a_list` on its own line to the file `path + file_name`.

    Items are converted with str(). The file is opened in write mode with UTF-8
    encoding, so existing content is replaced. `path` and `file_name` are
    concatenated as-is; no separator is inserted between them.
    """
    destination = path + file_name
    with open(destination, "w", encoding='utf-8') as out:
        out.writelines(str(entry) + '\n' for entry in a_list)


data_path = "/home/aa043/sea/data/td/v3/"

# Retrieve data from disk
print("Extracting data...")
# The encoding is pinned to UTF-8 (as for every other file opened in this
# notebook) so decoding does not depend on the platform's locale default.
with open(data_path+'pos.json', 'r', encoding='utf-8') as f:
    tds = json.load(f)
# Entries alternate comment/code, hence two list items per observation.
print(len(tds)//2, "TD observations extracted")
with open(data_path+'neg.json', 'r', encoding='utf-8') as f:
    non_tds = json.load(f)
print(len(non_tds)//2, "non-TD observations extracted")

# Prepare 1st draft of data
print("Processing TD data...")
pos_clean_codes, pos_clean_coms, pos_coms_multiline = extract_codes_and_comments(tds)
print("Processing non-TD data...")
neg_clean_codes, neg_clean_coms, neg_coms_multiline = extract_codes_and_comments(non_tds)

print("Creating labels...")
# One label per surviving pair: 1 for the pos.json side, 0 for the neg.json side.
pos_lbls = [1] * len(pos_clean_codes)
neg_lbls = [0] * len(neg_clean_codes)

print("Aggregating data...")
# Positive rows come first in all four lists so they stay index-aligned.
all_clean_codes = pos_clean_codes + neg_clean_codes
all_clean_coms = pos_clean_coms + neg_clean_coms
all_coms_multiline = pos_coms_multiline + neg_coms_multiline
all_lbls = pos_lbls + neg_lbls

# NOTE(review): this stops execution here, so nothing after this line in the
# cell runs — presumably a deliberate guard against rewriting the output files;
# confirm before removing.
sys.exit()

# NOTE(review): this section is unreachable while the `sys.exit()` call earlier
# in the cell is in place.

# Prepare for writing multiline comments to disk
# '+++' is appended to every multiline comment — presumably a record separator
# for the consumer of comments_multiline.txt; confirm.
to_write_multi = [comment + '+++' for comment in all_coms_multiline]
# Creating multiple files for JavaParser because of memory (heap) issues
sliced_codes = [all_clean_codes[i:i+100000] for i in range(0, len(all_clean_codes), 100000)]

# Writing to disk
print("Writing the codes file...")
write_to_file(data_path, "codes.txt", all_clean_codes)
print("Writing the comments file...")
write_to_file(data_path, "comments.txt", all_clean_coms)
print("Writing the multiline comments file...")
write_to_file(data_path, "comments_multiline.txt", to_write_multi)
print("Writing the labels file...")
write_to_file(data_path, "labels.txt", all_lbls)
print("Writing the code file for JavaParser...")
# Method numbering is global: it keeps counting across the per-slice files.
method_count = 1
# The loop variable is not called `slice`, which would shadow the builtin for
# the rest of the notebook session.
for i, code_slice in enumerate(sliced_codes, 1):
    slice_dir = data_path+"parser_processing/"+str(i)
    # Each slice gets its own numbered folder; create it if it is missing.
    os.makedirs(slice_dir, exist_ok=True)
    with open(slice_dir+"/Codes"+str(i)+".java", "w", encoding='utf-8') as f:  # write codes file
        f.write("/**\n * Dummy JavaDoc\n */\npublic class Codes"+str(i)+" {\n\n")
        for code in code_slice:
            # Every fragment is wrapped in its own uniquely named cover method.
            f.write("/**\n * Dummy JavaDoc\n */\npublic void coverMethod" + str(method_count) + "() {\n\t" + str(code) + "\n}\n\n")
            method_count += 1
        f.write("}\n")
    print(".java file", i, "has been written to disk")
print(str(method_count - 1) + " conditional statements have been written to file.")


In [113]:
from difflib import Differ


# Two three-line samples that differ only in the middle line ("Ibra" vs "Ibla").
text1 = (
    "Aziz Aziz Aziz Aziz Aziz\n"
    "Ibra Ibra Ibra Ibra Ibra\n"
    "Ahmed Ahmed Ahmed Ahmed Ahmed"
).splitlines()
text2 = (
    "Aziz Aziz Aziz Aziz Aziz\n"
    "Ibla Ibla Ibla Ibla Ibla\n"
    "Ahmed Ahmed Ahmed Ahmed Ahmed"
).splitlines()
# Same content as text1, but every line keeps its trailing newline.
text3 = (
    "Aziz Aziz Aziz Aziz Aziz\n"
    "Ibra Ibra Ibra Ibra Ibra\n"
    "Ahmed Ahmed Ahmed Ahmed Ahmed\n"
).splitlines(keepends=True)

d = Differ()

# Collect the delta lines into a list so they can be displayed and compared.
result = [delta_line for delta_line in d.compare(text1, text2)]
result

['  Aziz Aziz Aziz Aziz Aziz',
 '- Ibra Ibra Ibra Ibra Ibra',
 '?   ^    ^    ^    ^    ^\n',
 '+ Ibla Ibla Ibla Ibla Ibla',
 '?   ^    ^    ^    ^    ^\n',
 '  Ahmed Ahmed Ahmed Ahmed Ahmed']

In [3]:
from difflib import ndiff

# Left-hand sample: has a '3' line and an 'abcdefghi' line that text5 lacks.
text4 = (
    "111111111111\n"
    "2222222222222222\n"
    "333333333333\n"
    "4444444444444444\n"
    "666666666\n"
    "7777777777\n"
    "abcdefghi\n"
    "99999999999\n"
).splitlines()

# Right-hand sample: has a '5' line and an '8' line that text4 lacks.
text5 = (
    "111111111111\n"
    "2222222222222222\n"
    "4444444444444444\n"
    "5555555555555555\n"
    "666666666\n"
    "7777777777\n"
    "888888888888\n"
    "99999999999\n"
).splitlines()

# Collect the line-by-line delta into a list for display.
result2 = [delta_line for delta_line in ndiff(text4, text5)]
result2

['  111111111111',
 '  2222222222222222',
 '- 333333333333',
 '  4444444444444444',
 '+ 5555555555555555',
 '  666666666',
 '  7777777777',
 '- abcdefghi',
 '+ 888888888888',
 '  99999999999']

In [115]:
# Check whether the Differ-based delta and the ndiff-based delta are identical.
diffs_match = result == list(result2)
diffs_match

True

In [116]:
# Plain-text, single-line view of the Differ delta.
print(repr(result))

['  Aziz Aziz Aziz Aziz Aziz', '- Ibra Ibra Ibra Ibra Ibra', '?   ^    ^    ^    ^    ^\n', '+ Ibla Ibla Ibla Ibla Ibla', '?   ^    ^    ^    ^    ^\n', '  Ahmed Ahmed Ahmed Ahmed Ahmed']


In [117]:
# Plain-text, single-line view of the ndiff delta.
print(repr(list(result2)))

['  Aziz Aziz Aziz Aziz Aziz', '- Ibra Ibra Ibra Ibra Ibra', '?   ^    ^    ^    ^    ^\n', '+ Ibla Ibla Ibla Ibla Ibla', '?   ^    ^    ^    ^    ^\n', '  Ahmed Ahmed Ahmed Ahmed Ahmed']
