# Generator

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer

from rank_bm25 import BM25Okapi
from itertools import combinations

import re
from glob import glob
from tqdm import tqdm

import argparse

# Run configuration. The options are declared with argparse, but
# ``parse_args('')`` is given an empty argument sequence, so inside the notebook
# only the defaults below are used.

# Short ``--pretrained_model`` option -> checkpoint name handed to AutoTokenizer.
PRETRAINED_MODEL_NAMES = {
    "bert": "bert-base-uncased",
    "codebert": "microsoft/graphcodebert-base",
}

parser = argparse.ArgumentParser(description='Generator')
parser.add_argument('--pretrained_model', default="bert", type=str,
                    choices=sorted(PRETRAINED_MODEL_NAMES))  # bert or codebert
parser.add_argument('--max_length', default=384, type=int)  # 384 or 256
parser.add_argument('--validation_split', default=0.2, type=float)
parser.add_argument('--seed', default=1011, type=int)
args = parser.parse_args('')

# Fail loudly on an unknown model key instead of leaving ``pretrained_model``
# unbound (which previously surfaced later as a confusing NameError).
if args.pretrained_model not in PRETRAINED_MODEL_NAMES:
    raise ValueError(
        f"Unknown --pretrained_model {args.pretrained_model!r}; "
        f"expected one of {sorted(PRETRAINED_MODEL_NAMES)}"
    )
pretrained_model = PRETRAINED_MODEL_NAMES[args.pretrained_model]

tokenizer = AutoTokenizer.from_pretrained(pretrained_model, do_lower_case=True)
# When a sequence is truncated, tokens are dropped from its left side.
tokenizer.truncation_side = 'left'

MAX_LENGTH = args.max_length
VALIDATION_SPLIT = args.validation_split
SEED = args.seed

def set_seeds(seed=SEED):
    """Export ``PYTHONHASHSEED`` and seed the global ``random`` and NumPy RNGs.

    Args:
        seed: Value used for every seed; defaults to the notebook-wide ``SEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

set_seeds()

## Data Generating Process

In [None]:
def read_code(code):
    """Return the text of the source file at path ``code``.

    The file is decoded as UTF-8 and returned as stored. The previous
    implementation joined the result of ``readlines()`` with ``'\\n'``; because
    ``readlines()`` keeps each line's trailing newline, every line break was
    doubled.

    Args:
        code: Path of the file to read.

    Returns:
        The file content as a single string.
    """
    with open(code, 'r', encoding='utf-8') as file:
        return file.read()

# Collect every source file under ``root_dir``. Each file is labelled with the
# name of the directory that contains it, which is used as its problem id.
root_dir = 'data/code'

problem_num_list = []
code_path_list = []
possible_code_extension = ['.py']


for (root, dirs, files) in os.walk(root_dir):
    for file_name in files:
        if os.path.splitext(file_name)[1] in possible_code_extension:
            code_path = os.path.join(root, file_name)
            code_path_list.append(code_path)
            # basename() works with the platform's separator; the former
            # root.split("\\") only isolated the directory name on Windows and
            # yielded the whole path elsewhere.
            problem_num_list.append(os.path.basename(os.path.normpath(root)))

code_list = [read_code(i) for i in tqdm(code_path_list)]

len(code_list), len(problem_num_list)

In [None]:
def text_preprocessing(series):
    """Strip docstrings, comments, print calls, imports and blank lines.

    Each element of ``series`` is treated as one source file. In order:

    1. triple-quoted blocks (double- then single-quoted) are removed;
    2. everything from ``#`` to the end of the line is removed;
    3. ``print(...)`` calls are removed;
    4. every run of four spaces is replaced by a tab;
    5. lines containing the substring ``import`` and lines that are empty
       after right-stripping are dropped.

    Fixes over the previous version:

    * The triple-quote patterns are non-greedy. The greedy form deleted
      everything between the first and the last triple quote in a file,
      including real code sitting between two docstrings.
    * The print pattern escapes its parentheses and is anchored on a word
      boundary, so identifiers that merely contain ``print`` are kept.
    * Whitespace-only lines are dropped instead of surviving as blank lines.

    Args:
        series: pandas Series of source-code strings.

    Returns:
        A new Series of cleaned code with the same index and name.
    """
    double_quoted_block = re.compile(r'""".*?"""', re.DOTALL)
    single_quoted_block = re.compile(r"'''.*?'''", re.DOTALL)
    line_comment = re.compile(r"#.*")
    print_call = re.compile(r"\bprint\(.*\)")

    def strip_noise(text):
        # Order matters: docstrings first so '#' inside them is gone already.
        text = double_quoted_block.sub("", text)
        text = single_quoted_block.sub("", text)
        text = line_comment.sub("", text)
        text = print_call.sub("", text)
        return text.replace("    ", "\t")

    cleaned_documents = []
    for text in tqdm(series):
        kept_lines = []
        for line in strip_noise(text).split("\n"):
            if "import" in line:
                continue
            line = line.rstrip()
            if not line:
                continue
            kept_lines.append(line)
        cleaned_documents.append("\n".join(kept_lines))
    return pd.Series(data=cleaned_documents, index=series.index, name=series.name)

def code_preprocessing(df):
    """Return a copy of ``df`` whose ``code`` column has been cleaned.

    The input frame is not modified; cleaning is delegated to
    ``text_preprocessing``.
    """
    cleaned = df.copy()
    cleaned["code"] = text_preprocessing(cleaned["code"])
    # Disabled token-length filter, kept for reference:
    # cleaned["len"] = cleaned["code"].apply(tokenizer.tokenize).apply(len)
    # cleaned = cleaned[cleaned['len']<=MAX_LENGTH].reset_index(drop=True)
    return cleaned

# One row per source file: raw code plus its problem id, then clean the code.
df = code_preprocessing(
    pd.DataFrame({'code': code_list, 'problem_num': problem_num_list})
)
df.info()

In [None]:
def generator(df):
    """Build a labelled table of similar / dissimilar code pairs.

    For every problem (in sorted order):

    * positive pairs are all 2-combinations of that problem's solutions;
    * negative pairs match each solution with codes from *other* problems,
      taken in descending order of the BM25 score averaged over all of the
      problem's solutions. Each solution receives
      ``len(positive_pairs) // number_of_solutions`` negatives, and a
      candidate is used at most once per problem.

    Changes over the previous version:

    * The frame is re-indexed positionally first. BM25 rankings are
      positions, and they were compared with index labels, which is only
      correct for a default RangeIndex.
    * If the ranking runs out of candidates, negative sampling for that
      problem stops (yielding fewer negatives) instead of raising IndexError.
    * Solution membership uses a set, and the already tokenized corpus is
      reused instead of tokenizing every solution a second time.

    Args:
        df: DataFrame with a ``code`` column (source text) and a
            ``problem_num`` column (problem id).

    Returns:
        DataFrame with columns ``code1``, ``code2`` and ``similar`` (1 for
        same-problem pairs, 0 otherwise); positive rows come first.
    """
    # Make labels equal to positions so they can be compared with argsort output.
    df = df.reset_index(drop=True)

    codes = df['code'].to_list()
    problems = sorted(df['problem_num'].unique().tolist())

    tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
    bm25 = BM25Okapi(tokenized_corpus)

    total_positive_pairs = []
    total_negative_pairs = []

    for problem in tqdm(problems):
        solution_codes = df.loc[df['problem_num'] == problem, 'code']
        solution_list = solution_codes.to_list()
        solution_positions = solution_codes.index.to_list()
        solution_position_set = set(solution_positions)

        positive_pairs = list(combinations(solution_list, 2))
        negatives_per_solution = len(positive_pairs) // len(solution_list)

        ### scoring
        # Average, over this problem's solutions, of the BM25 score of every
        # corpus document; highest scores first.
        mean_scores = np.mean(
            [bm25.get_scores(tokenized_corpus[pos]) for pos in solution_positions],
            axis=0,
        )
        ranking = mean_scores.argsort()[::-1]

        # Shared across the problem's solutions so no candidate is reused.
        candidates = (
            codes[pos] for pos in ranking if pos not in solution_position_set
        )

        negative_pairs = []
        exhausted = False
        for solution_code in solution_list:
            for _ in range(negatives_per_solution):
                negative_code = next(candidates, None)
                if negative_code is None:
                    # No more codes from other problems are available.
                    exhausted = True
                    break
                negative_pairs.append((solution_code, negative_code))
            if exhausted:
                break

        total_positive_pairs.extend(positive_pairs)
        total_negative_pairs.extend(negative_pairs)

    all_pairs = total_positive_pairs + total_negative_pairs
    labels = [1] * len(total_positive_pairs) + [0] * len(total_negative_pairs)

    pair_data = pd.DataFrame(data={
        'code1': [first for first, _ in all_pairs],
        'code2': [second for _, second in all_pairs],
        'similar': labels,
    })
    return pair_data
    

# Split the individual code files (not the problems) into train / validation,
# stratified on the problem id.
train_df, valid_df, train_label, valid_label = train_test_split(
    df,
    df['problem_num'],
    stratify=df['problem_num'],
    test_size=VALIDATION_SPLIT,
    random_state=SEED,
)

# Turn each split into its own table of labelled code pairs (train first).
train_df, valid_df = (
    generator(split.reset_index(drop=True)) for split in (train_df, valid_df)
)

len(train_df), len(valid_df)

### CSV Save

In [None]:
# %%time

# train_df.to_csv(f'train_{args.pretrained_model}.csv', index=False)
# valid_df.to_csv(f'valid_{args.pretrained_model}.csv', index=False)