# Preprocess

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

import transformers
from transformers import AutoTokenizer, AutoModel
transformers.logging.set_verbosity_error()

from rank_bm25 import BM25Okapi, BM25L, BM25Plus
# from sentence_transformers import SentenceTransformer

import re

from glob import glob
from tqdm import tqdm

from itertools import combinations

import warnings
warnings.filterwarnings('ignore')

from argparse import ArgumentParser

# import wandb
# from pytorch_lightning.loggers import WandbLogger
# wandb_logger = WandbLogger(name="preprocess", project="DACON_236228")

# Options of the preprocessing run as (flag, value type, default) triples.
_OPTION_SPECS = (
    ('--text_pretrained_model', str, "unixcoder-base"),
    ('--truncation_side', str, 'left'),  # right or left
    ('--bm25', str, 'bm25plus'),
    ('--frac', float, 0.01),
    ('--seed', int, 826),
    ('--device', int, 0),
    ('--num_workers', int, 0),
)

parser = ArgumentParser(description="preprocess")
for flag, value_type, default_value in _OPTION_SPECS:
    parser.add_argument(flag, default=default_value, type=value_type)

# An explicit empty argument string is parsed instead of sys.argv, so every
# option takes the default listed above.
args = parser.parse_args('')

# wandb.config.update(args)

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Seed shared by the seeding helper and the sampling step.
SEED = args.seed

def set_seeds(seed=SEED):
    """Seed every random number generator used by this notebook.

    Applies the same seed to NumPy, Python's ``random`` module, PyTorch
    (``torch.manual_seed`` and ``torch.cuda.manual_seed``) and PyTorch
    Lightning, and sets cuDNN to deterministic mode with benchmarking
    disabled.

    Args:
        seed: Integer seed applied to all generators. Defaults to the
            module-level ``SEED``.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Forward the caller's seed. The global SEED used to be passed here, so a
    # non-default ``seed`` argument was silently ignored by Lightning.
    pl.seed_everything(seed)

# Seed all random number generators once, using the default seed.
set_seeds()

# Tag built from the model name and the BM25 variant; it is embedded in the
# names of the files written further down.
idx = f"{args.text_pretrained_model}_{args.bm25}"
# Bare expression: displays the tag as the notebook cell output.
idx

In [None]:
# Short names accepted by --text_pretrained_model mapped to the identifiers
# passed to AutoTokenizer.from_pretrained. The trailing numbers are carried
# over from the original notes (presumably the maximum input length of each
# model - TODO confirm).
_PRETRAINED_MODEL_IDS = {
    "unixcoder-base": "microsoft/unixcoder-base",  # 1024
    "graphcodebert-base": "microsoft/graphcodebert-base",  # 512
    "codebert-base": "microsoft/codebert-base",  # 512
}

# Fail with a clear message instead of leaving text_pretrained_model
# undefined (and crashing later with a NameError) for an unknown name.
if args.text_pretrained_model not in _PRETRAINED_MODEL_IDS:
    raise ValueError(
        f"Unsupported --text_pretrained_model {args.text_pretrained_model!r}; "
        f"expected one of {sorted(_PRETRAINED_MODEL_IDS)}"
    )
text_pretrained_model = _PRETRAINED_MODEL_IDS[args.text_pretrained_model]

tokenizer = AutoTokenizer.from_pretrained(text_pretrained_model)
# Side from which over-long inputs are truncated ('left' or 'right').
tokenizer.truncation_side = args.truncation_side

## preprocess.py

In [None]:
def read_cpp_code(file_path):
    """Return the complete text of the UTF-8 encoded file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source_file:
        contents = source_file.read()
    return contents

# glob() returns matches in arbitrary, filesystem-dependent order; sorting
# makes positional access (e.g. previewing the first file) reproducible.
train_code_paths = sorted(glob('data/train_code/*/*.cpp'))

# Bare expression: displays the number of files found as the cell output.
len(train_code_paths)

In [None]:
# Display the raw text of the first training source file as the cell output.
read_cpp_code(train_code_paths[0])

In [None]:
# Load data/sample_train.csv and display its first rows as the cell output.
val = pd.read_csv("data/sample_train.csv")
val.head()

In [None]:
# Runs of spaces and/or tabs; each run is collapsed to one space when cleaning.
_BLANK_RUN = re.compile(r'[ \t]+')


def _strip_cpp_comments(line, in_block_comment):
    """Remove ``//`` and ``/* ... */`` comment text from one source line.

    Args:
        line: One line of C++ source text.
        in_block_comment: True when a ``/*`` opened on an earlier line is
            still unclosed at the start of this line.

    Returns:
        A ``(code, in_block_comment)`` tuple: the text found outside
        comments (each removed block comment is replaced by a space) and
        whether a block comment is still open at the end of the line.
    """
    kept = []
    pos = 0
    while pos < len(line):
        if in_block_comment:
            close_at = line.find('*/', pos)
            if close_at == -1:
                break  # the rest of the line is still inside the comment
            in_block_comment = False
            pos = close_at + 2
            continue
        line_comment_at = line.find('//', pos)
        block_comment_at = line.find('/*', pos)
        if block_comment_at == -1 or -1 < line_comment_at < block_comment_at:
            # No block comment opens before a possible // comment: keep the
            # code up to the // (or to the end of the line) and stop.
            end = len(line) if line_comment_at == -1 else line_comment_at
            kept.append(line[pos:end])
            break
        kept.append(line[pos:block_comment_at])
        in_block_comment = True
        pos = block_comment_at + 2
    return ' '.join(kept), in_block_comment


def _clean_lines(lines):
    """Return the cleaned, non-empty code lines of a C++ source."""
    cleaned = []
    in_block_comment = False
    for raw_line in lines:
        # Update the comment state first so that every line, including the
        # ones skipped below, is taken into account.
        code, in_block_comment = _strip_cpp_comments(raw_line, in_block_comment)
        code = _BLANK_RUN.sub(' ', code).strip()
        if not code or code.startswith('#include'):
            continue
        cleaned.append(code)
    return cleaned


def clean_data(script, data_type="dir"):
    """Clean a C++ source and flatten it into a single line.

    The cleaning removes ``#include`` lines (indented or not), ``//``
    comments and ``/* ... */`` comments (single-line, inline or spanning
    several lines), collapses runs of spaces and tabs into one space, and
    drops lines that end up empty. The remaining lines are joined with a
    single space.

    Comment markers are matched textually: a ``//`` or ``/*`` inside a string
    literal is treated as the start of a comment.

    Args:
        script: Path of a UTF-8 source file when ``data_type`` is ``"dir"``;
            the source text itself when ``data_type`` is ``"file"``.
        data_type: ``"dir"`` to read ``script`` from disk, ``"file"`` to
            clean the given string directly.

    Returns:
        The cleaned source as one space-joined string (``""`` when nothing
        remains).

    Raises:
        ValueError: If ``data_type`` is neither ``"dir"`` nor ``"file"``.
    """
    if data_type == "dir":
        with open(script, 'r', encoding='utf-8') as file:
            source_text = file.read()
    elif data_type == "file":
        source_text = script
    else:
        raise ValueError(
            f"data_type must be 'dir' or 'file', got {data_type!r}"
        )

    # Lines are joined with spaces (not newlines) into one flat string.
    return ' '.join(_clean_lines(source_text.split('\n')))

In [None]:
# Display the cleaned form of the first training file as the cell output; the
# text is read first, so clean_data is called in "file" (string input) mode.
clean_data(read_cpp_code(train_code_paths[0]), data_type="file")

In [None]:
def get_pairs(input_df, tokenizer):
    """Build labelled positive and negative code pairs.

    Positive pairs (label 1) are all 2-combinations of the codes that share a
    ``problem_num``. Negative pairs (label 0) couple a code with a code of a
    different problem: for each problem, all codes are ranked by BM25 score
    against the problem's first code, and the highest-ranked codes of other
    problems are handed out in that order, ``len(positive_pairs) //
    len(solutions)`` per solution, without reuse inside the problem.

    Problems with too few solutions to need negatives (one or two codes) only
    contribute their positive pairs, if any. If the ranking runs out of codes
    from other problems, fewer negatives are produced instead of failing.

    Args:
        input_df: DataFrame with a ``code`` column and a ``problem_num``
            column. Rows are addressed by position, so any index works.
        tokenizer: Object whose ``tokenize(text)`` returns a list of tokens;
            used to build the BM25 corpus.

    Returns:
        DataFrame with columns ``code1``, ``code2`` and ``similar``, holding
        all positive pairs followed by all negative pairs.

    Raises:
        ValueError: If ``args.bm25`` is not ``bm25ok``, ``bm25l`` or
            ``bm25plus``.
    """
    bm25_variants = {
        "bm25ok": BM25Okapi,
        "bm25l": BM25L,
        "bm25plus": BM25Plus,
    }
    if args.bm25 not in bm25_variants:
        raise ValueError(
            f"Unsupported --bm25 {args.bm25!r}; "
            f"expected one of {sorted(bm25_variants)}"
        )

    codes = input_df['code'].to_list()
    tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
    bm25 = bm25_variants[args.bm25](tokenized_corpus)

    # Group row positions (not index labels) by problem: BM25 results refer
    # to positions in ``codes``, so both sides must use the same addressing.
    positions_by_problem = {}
    for position, problem in enumerate(input_df['problem_num'].to_list()):
        positions_by_problem.setdefault(problem, []).append(position)

    total_positive_pairs = []
    total_negative_pairs = []

    for problem in tqdm(sorted(positions_by_problem)):
        positions = positions_by_problem[problem]
        solution_codes = [codes[position] for position in positions]

        positive_pairs = list(combinations(solution_codes, 2))
        total_positive_pairs.extend(positive_pairs)

        negatives_per_solution = len(positive_pairs) // len(solution_codes)
        if negatives_per_solution == 0:
            # Nothing to sample, so skip the BM25 scoring altogether.
            continue

        # The first solution is the query; its tokens are already available.
        scores = bm25.get_scores(tokenized_corpus[positions[0]])
        ranking = scores.argsort()[::-1]  # descending score

        own_positions = set(positions)
        # One shared iterator, so each solution receives different negatives.
        candidates = (
            position for position in ranking.tolist()
            if position not in own_positions
        )
        for solution_code in solution_codes:
            for _ in range(negatives_per_solution):
                negative_position = next(candidates, None)
                if negative_position is None:
                    break  # no code of another problem is left
                total_negative_pairs.append(
                    (solution_code, codes[negative_position])
                )

    all_pairs = total_positive_pairs + total_negative_pairs
    labels = [1] * len(total_positive_pairs) + [0] * len(total_negative_pairs)

    pair_data = pd.DataFrame(data={
        'code1': [first for first, _ in all_pairs],
        'code2': [second for _, second in all_pairs],
        'similar': labels,
    })

    return pair_data

## data generation

In [None]:
# Build the training table: one row per source file, holding its cleaned code
# and the problem it belongs to.
code_folder = "data/train_code"
# os.listdir() returns entries in arbitrary order, so sort for a reproducible
# row order; stray files next to the problem folders are ignored.
problem_folders = sorted(
    name for name in os.listdir(code_folder)
    if os.path.isdir(os.path.join(code_folder, name))
)
processed_scripts = []
problem_nums = []

for problem_folder in tqdm(problem_folders):
    folder_path = os.path.join(code_folder, problem_folder)
    # Only .cpp sources are cleaned; other entries (hidden files, checkpoint
    # folders, ...) are skipped.
    scripts = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.cpp')
    )
    if not scripts:
        continue  # nothing to read; also avoids indexing an empty list
    # The problem id is the part of the first file name before the first '_'.
    problem_num = scripts[0].split('_')[0]
    for script in scripts:
        script_file = os.path.join(folder_path, script)
        processed_scripts.append(clean_data(script_file, data_type="dir"))
    problem_nums.extend([problem_num] * len(scripts))

pp_train_df = pd.DataFrame(
    data={'code': processed_scripts, 'problem_num': problem_nums}
)

# Bare expression: displays the first rows as the notebook cell output.
pp_train_df.head()

In [None]:
pp_train_bm25 = get_pairs(pp_train_df, tokenizer)

# Write the pair table as 10 consecutive parquet parts. The boundaries are
# derived from the total row count so that the parts cover every row; a fixed
# part length of len // 10 would drop the last len % 10 rows.
num_parts = 10
total_rows = len(pp_train_bm25)
for i in tqdm(range(num_parts)):
    start = i * total_rows // num_parts
    stop = (i + 1) * total_rows // num_parts
    # A dedicated name keeps pp_train_df (the input of get_pairs) intact, so
    # this cell can be executed again.
    part_df = pp_train_bm25.iloc[start:stop]
    part_df.to_parquet(f'pp_train_{idx}_{i}.parquet', engine='pyarrow', index=False)

In [None]:
# Clean both code columns of the test set. The cells are passed to clean_data
# in "file" mode, i.e. as source text rather than as a path.
test_df = pd.read_csv("data/test.csv")
code1, code2 = (test_df[column].values for column in ('code1', 'code2'))

processed_code1, processed_code2 = [], []
for raw_first, raw_second in tqdm(zip(code1, code2), total=len(code1)):
    processed_code1.append(clean_data(raw_first, data_type="file"))
    processed_code2.append(clean_data(raw_second, data_type="file"))

# Re-assemble the cleaned columns row by row and store them as parquet.
paired_rows = list(zip(processed_code1, processed_code2))
pp_test_df = pd.DataFrame(paired_rows, columns=["code1", "code2"])

pp_test_df.to_parquet(f'pp_test_{idx}.parquet', engine='pyarrow', index=False)

## sample generation

In [None]:
# Read the 10 parquet parts back and stack them in order under a fresh
# 0..n-1 index. A single concat avoids re-copying the growing frame on every
# iteration and does not start from an empty, column-less DataFrame.
df = pd.concat(
    [
        pd.read_parquet(f'pp_train_{idx}_{i}.parquet', engine='pyarrow')
        for i in range(10)
    ],
    ignore_index=True,
)

# Bare expression: displays the total number of pairs as the cell output.
len(df)

In [None]:
# Draw a seeded random fraction of the pairs, then keep only the rows in which
# both codes are non-empty strings, and save the result as CSV.
shuffled_subset = df.sample(frac=args.frac, random_state=SEED).reset_index(drop=True)
first_present = shuffled_subset["code1"] != ""
second_present = shuffled_subset["code2"] != ""
sample_df = shuffled_subset[first_present & second_present]
sample_df.to_csv(f"pp_train_{idx}_frac{args.frac}.csv", index=False)

# Bare expression: displays the number of sampled pairs as the cell output.
len(sample_df)

In [None]:
# wandb.finish()