In [22]:
# Input file for this notebook; read_examples() and make_source() parse it
# as tab-separated text with a header row.
code_raw = '../logicalErrorFix_CodeT5_Linenumber/data/edit_distance/pair_code_edit_dist_test.txt'

import pandas as pd
import re
# Column names expected in the header row of the tab-separated input file.
COLUMNS = ['PID','Correct_code', 'Incorrect_code', 'Statement']

# Run of digits at the very start of a code segment (compiled once, reused per segment).
_LEADING_DIGITS = re.compile(r'^\d+')


def _numbered_lines_by_row(code_raw, column, label):
  """Shared loader behind read_examples() and make_source().

  Reads `code_raw` as a tab-separated file with a header row and, for every
  row, splits the text in `column` on the '||| ' separator.

  Args:
    code_raw: Path (or anything pandas.read_csv accepts) of the input file.
    column: Name of the column holding the '||| '-separated code text.
    label: Middle part of the outer dictionary key, e.g. 'CorrectCode'.

  Returns:
    dict mapping 'Problem_<PID>_<label>_<row index>' to an inner dict.
    Each inner dict maps the text before a segment's first space
    (presumably its line number -- verify against the data) to the segment
    with its leading digits removed; the space that followed the digits is
    kept, so values start with a space.
  """
  data = pd.read_csv(code_raw, sep='\t', header=[0])
  lines_by_row = {}
  for idx, elem in data.iterrows():
    # NOTE(review): the piece after the last '||| ' is always discarded, so
    # this assumes every cell ends with the separator -- confirm; otherwise
    # the final code line of such a cell is silently lost.
    segments = elem[column].split('||| ')[:-1]
    lines_by_row[f'Problem_{elem[COLUMNS[0]]}_{label}_{idx}'] = {
      segment.split(" ")[0]: _LEADING_DIGITS.sub('', segment)
      for segment in segments
    }
  return lines_by_row


def read_examples(code_raw):
  """Parse the 'Correct_code' column of `code_raw`.

  Returns:
    dict keyed 'Problem_<PID>_CorrectCode_<row index>'; see
    _numbered_lines_by_row() for the shape of each value.
  """
  return _numbered_lines_by_row(code_raw, COLUMNS[1], 'CorrectCode')


def make_source(code_raw):
  """Parse the 'Incorrect_code' column of `code_raw`.

  Returns:
    dict keyed 'Problem_<PID>_source_<row index>'; see
    _numbered_lines_by_row() for the shape of each value.
  """
  return _numbered_lines_by_row(code_raw, COLUMNS[2], 'source')

# Parse the same input file twice: read_examples() takes the 'Correct_code'
# column, make_source() takes the 'Incorrect_code' column.
correct_code = read_examples(code_raw)
source_code = make_source(code_raw)

In [23]:
import json
# Serialise the whole correct_code mapping as 4-space-indented JSON and print it.
Pretty_print = json.dumps(correct_code, indent = 4)
print(Pretty_print)

{
    "Problem_0_CorrectCode_0": {
        "1": " #include <bits/stdc++.h>",
        "2": " using namespace std;",
        "3": " using LL = long long;",
        "4": " constexpr int N = 1e5 + 5;",
        "5": " int main() {",
        "6": " cin.tie(nullptr)->sync_with_stdio(false);",
        "7": " int n, m;",
        "8": " cin >> n >> m;",
        "9": " vector<string> in(n);",
        "10": " for (auto& x : in) cin >> x;",
        "11": " vector<int> order(n);",
        "12": " iota(begin(order), end(order), 0);",
        "13": " sort(begin(order), end(order), [&](int i, int j) {",
        "14": " for (int k = 0; k < m; k++) {",
        "15": " if (in[i][k] != in[j][k]) {",
        "16": " if (k % 2 == 0) {",
        "17": " return in[i][k] < in[j][k];",
        "18": " } else {",
        "19": " return in[i][k] > in[j][k];",
        "20": " }",
        "21": " }",
        "22": " }",
        "23": " return false;",
        "24": " });",
        "25": " for (int i : order) cout <<

In [24]:
# Same dump for source_code (the mapping built from the 'Incorrect_code' column).
print(json.dumps(source_code, indent=4))

{
    "Problem_0_source_0": {
        "1": " #include <bits/stdc++.h>",
        "2": " using namespace std;",
        "3": " using LL = long long;",
        "4": " constexpr int N = 1e5 + 5;",
        "5": " int main() {",
        "6": " cin.tie(nullptr)->sync_with_stdio(false);",
        "7": " int n, m;",
        "8": " cin >> n >> m;",
        "9": " vector<string> in(n);",
        "10": " for (auto& x : in) cin >> x;",
        "11": " vector<int> order(n);",
        "12": " iota(begin(order), end(order), 0);",
        "13": " sort(begin(order), end(order), [&](int i, int j) {",
        "14": " for (int k = 0; k < m; k++) {",
        "15": " if (in[i][k] != in[i][j]) {",
        "16": " if (k % 2 == 0) {",
        "17": " return in[i][k] < in[j][k];",
        "18": " } else {",
        "19": " return in[i][k] > in[j][k];",
        "20": " }",
        "21": " }",
        "22": " }",
        "23": " return false;",
        "24": " });",
        "25": " for (int i : order) cout << i + 

In [25]:
from tqdm import tqdm
import os

def cpp_file_create(file_path, dictionary_list):
  """Write one `<key>.cpp` file per entry of `dictionary_list`.

  Args:
    file_path: Directory that receives the files. It is created (including
      missing parents) when it does not exist yet.
    dictionary_list: Mapping of file stem -> inner mapping whose values are
      the text lines of that file. Values are written in the inner
      mapping's iteration order, each followed by a newline; the inner keys
      are not written.

  Existing files with the same name are overwritten.
  """
  # Without this, open(..., 'w') raises FileNotFoundError on a fresh checkout
  # where the output directory has not been created by hand. An empty
  # file_path means "current directory" and needs no creation.
  if file_path:
    os.makedirs(file_path, exist_ok=True)
  for index, dictionary in tqdm(dictionary_list.items()):
    with open(os.path.join(file_path, f'{index}.cpp'), 'w', encoding='utf8') as f:
      f.writelines(value + '\n' for value in dictionary.values())

In [26]:
import numpy as np
# Per-row line dictionaries, in the insertion order of the two parsed mappings.
cor_dic = list(correct_code.values())
sor_dic = list(source_code.values())

# One counter per (correct, source) pair. The length is derived from the data
# rather than hard-coded, so a different input file neither overflows the
# array nor leaves unused zero entries that later cells would miscount.
diff_count = np.zeros(min(len(cor_dic), len(sor_dic)))

for index, (cor, sor) in enumerate(zip(cor_dic, sor_dic)):
    # Compare over the union of keys: a key present on only one side yields
    # None from .get() on the other side and therefore counts as a difference.
    key_merge = set(cor.keys()) | set(sor.keys())

    for key in key_merge:
        if cor.get(key) != sor.get(key):
            diff_count[index] += 1


In [27]:
diff_count = np.array(diff_count)

# Pairs whose line dictionaries differ at exactly one key.
single_line = diff_count[diff_count == 1]
print(f'Single Line Difference = {len(single_line)}')

# "Multiple" means strictly more than one differing key. The previous mask
# (`!= 1`) also swept pairs with zero differences into this bucket.
multiple_line = diff_count[diff_count > 1]
print(f'Multiple Line Difference = {len(multiple_line)}')

Single Line Difference = 699
Multiple Line Difference = 162


In [28]:
# Write one .cpp file per entry of correct_code into ./CodeT5_LineNumber_cpp_gold.
cpp_file_create('./CodeT5_LineNumber_cpp_gold', correct_code)

100%|██████████| 861/861 [00:00<00:00, 30745.16it/s]


In [29]:
# Write one .cpp file per entry of source_code into ./CodeT5_LineNumber_cpp_source/.
cpp_file_create('./CodeT5_LineNumber_cpp_source/', source_code)

100%|██████████| 861/861 [00:00<00:00, 34037.04it/s]
