In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/logicalErrorFix-1

/content/drive/MyDrive/logicalErrorFix-1


In [3]:
from __future__ import absolute_import
import os
import sys
import bleu
import pickle
import torch
import json
import random
import logging
import argparse
import numpy as np
from io import open
from itertools import cycle
import torch.nn as nn
from model import Seq2Seq
from tqdm import tqdm, trange
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaModel, RobertaTokenizer)
MODEL_CLASSES = {'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer)}

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)



class Example(object):
    """A single training/test example."""
    def __init__(self,
                 idx,
                 source,
                 target,
                 ):
        self.idx = idx
        self.source = source
        self.target = target

import pandas as pd
COLUMNS = ['Correct_code', 'Incorrect_code', 'Statement']

def read_examples(filename):
  """Read examples from filename for DeepFix style training Line stmt Line stmt Line stmt ..."""
  examples = []
  data = pd.read_csv(filename, sep='\t', header=[0]).drop(columns=COLUMNS[0])
  for idx, elem in data.iterrows():
    code = ' '.join(elem[COLUMNS[1]].split('||| '))[:-1].strip()
    stmt = elem[COLUMNS[2]].strip()

    examples.append(
      Example(
              idx = idx,
              source = code,
              target = stmt,
              )
    )
  return examples

class InputFeatures(object):
    """A single training/test features for a example."""
    def __init__(self,
                 example_id,
                 source_ids,
                 target_ids,
                 source_mask,
                 target_mask,

    ):
        self.example_id = example_id
        self.source_ids = source_ids
        self.target_ids = target_ids
        self.source_mask = source_mask
        self.target_mask = target_mask

def convert_examples_to_features(examples, tokenizer, args,stage=None):
    features = []
    for example_index, example in enumerate(examples):
        #source
        source_tokens = tokenizer.tokenize(example.source)
        source_tokens =[tokenizer.cls_token]+source_tokens+[tokenizer.sep_token]
        source_ids =  tokenizer.convert_tokens_to_ids(source_tokens)
        source_mask = [1] * (len(source_tokens))
        padding_length = args.max_source_length - len(source_ids)
        source_ids+=[tokenizer.pad_token_id]*padding_length
        source_mask+=[0]*padding_length

        #target
        if stage=="test":
            target_tokens = tokenizer.tokenize("None")
        else:
            target_tokens = tokenizer.tokenize(example.target)
        target_tokens = [tokenizer.cls_token]+target_tokens+[tokenizer.sep_token]
        target_ids = tokenizer.convert_tokens_to_ids(target_tokens)
        target_mask = [1] *len(target_ids)
        padding_length = args.max_target_length - len(target_ids)
        target_ids+=[tokenizer.pad_token_id]*padding_length
        target_mask+=[0]*padding_length

        if example_index < 5:
            ##NNNN
            if stage=='train1':
                logger.info("*** Example ***")
                logger.info("idx: {}".format(example.idx))

                logger.info("source_tokens: {}".format([x.replace('\u0120','_') for x in source_tokens]))
                logger.info("source_ids: {}".format(' '.join(map(str, source_ids))))
                logger.info("source_mask: {}".format(' '.join(map(str, source_mask))))

                logger.info("target_tokens: {}".format([x.replace('\u0120','_') for x in target_tokens]))
                logger.info("target_ids: {}".format(' '.join(map(str, target_ids))))
                logger.info("target_mask: {}".format(' '.join(map(str, target_mask))))

        features.append(
            InputFeatures(
                 example_index,
                 source_ids,
                 target_ids,
                 source_mask,
                 target_mask,
            )
        )
    return features


In [4]:
import argparse

# 명령줄 인자를 직접 코드 내에서 설정
args = argparse.Namespace(
    model_type='roberta',
    model_name_or_path='microsoft/codebert-base',
    output_dir='path/to/output/dir',
    load_model_path='path/to/trained/model',
    train_filename='./data/edit_distance/pair_code_edit_dist_train.txt',
    dev_filename='./data/edit_distance/pair_code_edit_dist_valid.txt',
    test_filename=None,
    config_name='',
    tokenizer_name='',
    max_source_length=1000,
    max_target_length=1000,
    do_train=True,
    do_eval=True,
    do_test=True,
    do_lower_case=True,
    no_cuda=True,
    train_batch_size=8,
    eval_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    beam_size=10,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    num_train_epochs=3.0,
    max_steps=-1,
    eval_steps=-1,
    train_steps=-1,
    warmup_steps=0,
    local_rank=-1,
    seed=42,
)

# 예제 사용
print(args.model_type)  # 출력: roberta
print(args.do_train)    # 출력: True

roberta
True


In [5]:
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,do_lower_case=args.do_lower_case)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [6]:
train_examples = read_examples(args.train_filename)

In [7]:
print(type(train_examples))

<class 'list'>


In [8]:
print(train_examples[0].idx)
print(train_examples[0].source)
print(train_examples[0].target)

0
1 #include <bits/stdc++.h> 2 using namespace std; 3 int ara[500008]; 4 set<int> s; 5 set<int>::iterator it; 6 int n; 7 void check(int num) { 8 for (int i = 1; i < n; i++) { 9 if (!(ara[i] % num == 0 || ara[n + i] % num == 0)) { 10 return; 11 } 12 } 13 cout << num << endl; 14 exit(0); 15 } 16 int primefactor(int num) { 17 int i; 18 for (i = 2; i <= num / i; i++) { 19 bool flag = 0; 20 while (num % i == 0) { 21 num /= i; 22 if (!flag) { 23 check(i); 24 } 25 flag = 1; 26 } 27 } 28 if (num > i) { 29 check(num); 30 } 31 } 32 int main() { 33 scanf("%d", &n); 34 for (int i = 0; i < n; i++) { 35 scanf("%d%d", &ara[i], &ara[n + i]); 36 } 37 primefactor(ara[0]); 38 primefactor(ara[n]); 39 cout << "-1" << endl; 40 return 0; 41 }
28 if (num > 1) {


In [9]:
# 순서에 맞지 않는 라인 번호를 무시하고, 이후 부분에서 예상되는 숫자가 나올 때까지 계속 진행하여 코드를 저장하는 로직으로 수정합니다.

def parse_and_continue_on_mismatch(code_str):
    # 결과를 저장할 딕셔너리
    code_dict = {}
    # 순서를 검증하기 위한 변수 (다음에 올바른 라인 번호)
    expected_line_number = 1

    # 코드 문자열을 공백을 기준으로 분리
    parts = code_str.split()
    # 현재 라인 번호
    current_line = None

    for part in parts:
        # 부분이 숫자(라인 번호)인 경우
        if part.isdigit():
            line_number = int(part)
            # 라인 번호가 예상한 순서와 일치하면 저장 시작
            if line_number == expected_line_number:
                current_line = line_number
                code_dict[current_line] = ""
                expected_line_number += 1
            # 라인 번호가 순서에 맞지 않으면 무시하고 다음 숫자를 기다림
        else:
            # 현재 라인에 코드 추가 (예상 순서에 맞는 라인 번호가 발견되면)
            if current_line is not None:
                code_dict[current_line] += part + " "

    # 마지막에 추가된 공백 제거
    for line in code_dict:
        code_dict[line] = code_dict[line].strip()

    return code_dict

In [11]:
source_code_dict = {}
for i in range(len(train_examples)):
    source_code_dict[i] = parse_and_continue_on_mismatch(train_examples[i].source)

In [12]:
def remove_line_numbers(source_dict):
    # 줄 번호를 제거하고 코드를 하나의 문자열로 결합합니다.
    ret = []
    for _, v in source_dict.items():
        ret.append(v)
    return ' '.join(ret)

In [13]:
input_code = {}
for i in range(len(source_code_dict)):
    input_code[i] = {"source" : remove_line_numbers(source_code_dict[i])}

In [14]:
def parse_single_line_to_dict(line_str):
    # 입력 문자열을 첫 번째 공백을 기준으로 분리
    parts = line_str.split(' ', 1)
    line_number = parts[0]
    code = parts[1] if len(parts) > 1 else ""

    return {int(line_number): code}

In [15]:
target_dict = {}
for i in range(len(train_examples)):
    target_dict[i] = parse_single_line_to_dict(train_examples[i].target)

In [16]:
import copy
target_code_dict = copy.deepcopy(source_code_dict)

In [17]:
def update_values_from_dict(dict1, dict2):
    """
    dict1의 값을 dict2의 값으로 업데이트합니다.
    동일한 키가 있는 경우에만 업데이트를 진행합니다.
    """
    for key in dict1:
        if key in dict2:
            dict1[key] = dict2[key]
    return dict1

In [18]:
for i in range(len(target_dict)):
    target_code_dict[i] = update_values_from_dict(target_code_dict[i], target_dict[i])

In [19]:
for i in range(len(target_code_dict)):
    input_code[i]["target"] = remove_line_numbers(target_code_dict[i])

In [20]:
print(input_code[0])
json_str = json.dumps(input_code[0], indent=4)
print(json_str)

{'source': '#include <bits/stdc++.h> using namespace std; int ara[500008]; set<int> s; set<int>::iterator it; int n; void check(int num) { for (int i = 1; i < n; i++) { if (!(ara[i] % num == || ara[n + i] % num == 0)) { return; } } cout << num << endl; exit(0); } int primefactor(int num) { int i; for (i = 2; i <= num / i; i++) { bool flag = 0; while (num % i == 0) { num /= i; if (!flag) { check(i); } flag = 1; } } if (num > i) { check(num); } } int main() { scanf("%d", &n); for (int i = 0; i < n; i++) { scanf("%d%d", &ara[i], &ara[n + i]); } primefactor(ara[0]); primefactor(ara[n]); cout << "-1" << endl; return 0; }', 'target': '#include <bits/stdc++.h> using namespace std; int ara[500008]; set<int> s; set<int>::iterator it; int n; void check(int num) { for (int i = 1; i < n; i++) { if (!(ara[i] % num == || ara[n + i] % num == 0)) { return; } } cout << num << endl; exit(0); } int primefactor(int num) { int i; for (i = 2; i <= num / i; i++) { bool flag = 0; while (num % i == 0) { num /=

In [7]:
def process_code_examples(train_examples):
    def parse_and_continue_on_mismatch(code_str):
        code_dict = {}
        expected_line_number = 1
        parts = code_str.split()
        current_line = None
        for part in parts:
            if part.isdigit():
                line_number = int(part)
                if line_number == expected_line_number:
                    current_line = line_number
                    code_dict[current_line] = ""
                    expected_line_number += 1
            else:
                if current_line is not None:
                    code_dict[current_line] += part + " "
        for line in code_dict:
            code_dict[line] = code_dict[line].strip()
        return code_dict

    def remove_line_numbers(source_dict):
        ret = []
        for _, v in source_dict.items():
            ret.append(v)
        return ' '.join(ret)

    def parse_single_line_to_dict(line_str):
        parts = line_str.split(' ', 1)
        line_number = parts[0]
        code = parts[1] if len(parts) > 1 else ""
        return {int(line_number): code}

    def update_values_from_dict(dict1, dict2):
        for key in dict1:
            if key in dict2:
                dict1[key] = dict2[key]
        return dict1

    source_code_dict = {}
    for i, example in enumerate(train_examples):
        source_code_dict[i] = parse_and_continue_on_mismatch(example.source)

    target_dict = {}
    for i, example in enumerate(train_examples):
        target_dict[i] = parse_single_line_to_dict(example.target)

    target_code_dict = {i: update_values_from_dict(copy.deepcopy(source_code_dict[i]), target_dict[i]) for i in range(len(target_dict))}

    input_code = {}
    for i in range(len(source_code_dict)):
        input_code[i] = {
            "source": remove_line_numbers(source_code_dict[i]),
            "target": remove_line_numbers(target_code_dict[i])
        }

    return input_code

In [10]:
import copy
input_code = process_code_examples(train_examples)

In [11]:
print(input_code[0])
json_str = json.dumps(input_code[0], indent=4)
print(json_str)

{'source': '#include <bits/stdc++.h> using namespace std; int ara[500008]; set<int> s; set<int>::iterator it; int n; void check(int num) { for (int i = 1; i < n; i++) { if (!(ara[i] % num == || ara[n + i] % num == 0)) { return; } } cout << num << endl; exit(0); } int primefactor(int num) { int i; for (i = 2; i <= num / i; i++) { bool flag = 0; while (num % i == 0) { num /= i; if (!flag) { check(i); } flag = 1; } } if (num > i) { check(num); } } int main() { scanf("%d", &n); for (int i = 0; i < n; i++) { scanf("%d%d", &ara[i], &ara[n + i]); } primefactor(ara[0]); primefactor(ara[n]); cout << "-1" << endl; return 0; }', 'target': '#include <bits/stdc++.h> using namespace std; int ara[500008]; set<int> s; set<int>::iterator it; int n; void check(int num) { for (int i = 1; i < n; i++) { if (!(ara[i] % num == || ara[n + i] % num == 0)) { return; } } cout << num << endl; exit(0); } int primefactor(int num) { int i; for (i = 2; i <= num / i; i++) { bool flag = 0; while (num % i == 0) { num /=