## Retrieve data

In [1]:
import os
from collections import Counter
from tqdm import tqdm


data_dir = "/media/aziz/Data/Aziz/data/gans_for_apr/training/"

print("Collecting file paths...")
file_paths = [root+'/'+name for root, dirs, files in tqdm(os.walk(data_dir)) for name in files]
buggy_paths = [f_path for f_path in tqdm(file_paths) if 'buggy' in f_path]
fixed_paths = [f_path for f_path in tqdm(file_paths) if 'fixed' in f_path]

print("Collecting file contents...")
buggy_data = []
for path in tqdm(buggy_paths):
    with open(path, 'r', encoding='utf-8') as f:
        buggy_data.append(f.read())
fixed_data = []
for path in tqdm(fixed_paths):
    with open(path, 'r', encoding='utf-8') as f:
        fixed_data.append(f.read())

print('Number of processed files:-')
print(len(buggy_data), 'buggy files +', len(fixed_data), 'fixed files = ', len(buggy_data)+len(fixed_data), 'files')

3113it [00:00, 31123.94it/s]

Collecting file paths...


21757it [00:00, 45638.42it/s]
100%|██████████| 43512/43512 [00:00<00:00, 1660412.28it/s]
100%|██████████| 43512/43512 [00:00<00:00, 1713494.22it/s]
 17%|█▋        | 3778/21756 [00:00<00:00, 37778.50it/s]

Collecting file contents...


100%|██████████| 21756/21756 [00:00<00:00, 37452.79it/s]
100%|██████████| 21756/21756 [00:00<00:00, 42136.62it/s]

Number of processed files:-
21756 buggy files + 21756 fixed files =  43512 files





## Clean data - Reduce noise

### Remove comments

In [2]:
import re


def remove_comments(code):
    code = re.sub('(?s)/\*.*?\*/', '', code)
    return re.sub('(//[^\n]*)', '', code)


buggy_no_comments = [remove_comments(x) for x in tqdm(buggy_data)]
fixed_no_comments = [remove_comments(x) for x in tqdm(fixed_data)]
print('Comments have been removed')

100%|██████████| 21756/21756 [00:00<00:00, 81611.64it/s]
100%|██████████| 21756/21756 [00:00<00:00, 62513.93it/s]

Comments have been removed





### Remove unnecessary white spaces

In [3]:
def remove_spaces(code):
    '''This function removes excessive spaces and keeps necessary ones'''
    code = code.splitlines()
    result = []
    for line in code:
        line = line.split()
        line = ' '.join(line)
        if len(line) > 0:  # Remove empty lines
            result.append(line)
    
    return '\n'.join(result)


buggy_no_spaces = [remove_spaces(x) for x in tqdm(buggy_no_comments)]
fixed_no_spaces = [remove_spaces(x) for x in tqdm(fixed_no_comments)]
print('White spaces have been removed')

100%|██████████| 21756/21756 [00:01<00:00, 16783.57it/s]
100%|██████████| 21756/21756 [00:01<00:00, 16245.42it/s]

White spaces have been removed





### Remove identical buggy and fixed codes

In [4]:
buggy_texts, fixed_texts = [], []
for buggy, fixed in zip(buggy_no_spaces, fixed_no_spaces):
    if buggy != fixed:
        buggy_texts.append(buggy)
        fixed_texts.append(fixed)
print(len(buggy_no_spaces)-len(buggy_texts), 'code pairs have been removed')
print('Current # data points:', len(buggy_texts))

4726 code pairs have been removed
Current # data points: 17030


### Remove duplicated pairs in the dataset

In [5]:
text_pairs = [(x, y) for x, y in zip(buggy_texts, fixed_texts)]
code_pairs = sorted(set(text_pairs))  # Sorted to ensure same order every run (not sure if 'set' works randomly)

buggy_codes = [x[0] for x in code_pairs]
fixed_codes = [x[1] for x in code_pairs]

print(len(text_pairs)-len(code_pairs), 'code pairs have been removed')
print('Current # data points:', len(buggy_codes))

3857 code pairs have been removed
Current # data points: 13173


## Calculate Diffs

In [6]:
from difflib import ndiff


initial_diffs = [list(ndiff(x.splitlines(), y.splitlines())) for x, y in tqdm(zip(buggy_codes, fixed_codes))]

13173it [00:06, 2181.29it/s]


### Restrict to one-line difference

In [7]:
one_line_diffs = []
for diff in tqdm(initial_diffs):
    if sum([1 for x in diff if x.startswith('-')]) > 1 or sum([1 for x in diff if x.startswith('+')]) > 1:
        continue
    one_line_diffs.append(diff)

print(len(initial_diffs)-len(one_line_diffs), 'code pairs have been removed')
print('Current # data points:', len(one_line_diffs))

100%|██████████| 13173/13173 [00:00<00:00, 40512.00it/s]

322 code pairs have been removed
Current # data points: 12851





### Restrict to modified lines (exclude added/deleted lines)

In [8]:
diffs = []
for diff in tqdm(one_line_diffs):
    for i, x in enumerate(diff):
        if x.startswith('+') and diff[i-1].startswith('-'):
            diffs.append((diff[i-1][2:], x[2:]))

print(len(one_line_diffs)-len(diffs), 'code pairs have been removed')
print('Current # data points:', len(diffs))

100%|██████████| 12851/12851 [00:00<00:00, 59870.04it/s]

7102 code pairs have been removed
Current # data points: 5749





#### Shuffle and separate training and testing data

In [9]:
from random import seed, sample
import pickle


seed(30)
shuf_diffs = sample(diffs, k=len(diffs))
test_diffs, train_diffs = shuf_diffs[:500], shuf_diffs[500:]

train_buggy_lines = [x[0] for x in train_diffs]
train_fixed_lines = [x[1] for x in train_diffs]

test_buggy_lines = [x[0] for x in test_diffs]
test_fixed_lines = [x[1] for x in test_diffs]


with open('train_buggy_lines.pkl', 'wb') as f:
    pickle.dump(train_buggy_lines, f)
with open('train_fixed_lines.pkl', 'wb') as f:
    pickle.dump(train_fixed_lines, f)
with open('test_buggy_lines.pkl', 'wb') as f:
    pickle.dump(test_buggy_lines, f)
with open('test_fixed_lines.pkl', 'wb') as f:
    pickle.dump(test_fixed_lines, f)

print("Data have been writte to disk")

Data have been writte to disk
