## 학습용 데이터 생성

### Library

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

# Fix the seeds of both random number generators (NumPy's global generator and
# the standard-library `random` module) so repeated runs are reproducible.
np.random.seed(42)
random.seed(42)

### hyperparameter

In [2]:
# Path to the folder directly above the `train_code` folder. If `train_code`
# is in the current environment, '.' is enough; otherwise, e.g. for
# C:/.../Code_Similarity/train_code, use 'C:/.../Code_Similarity'.
path = '.'

# Total number of samples to generate.
sample_count = 1_000_000

# Proportion of pairs whose two codes solve the same problem, i.e. the
# fraction of samples with similar == 1 (a real number between 0 and 1).
pair_proportion = 0.5

### generator function

In [3]:
def _problem_code_path(problem_num, sample_num):
    """Return one solution file's path relative to the dataset root (with a leading '/')."""
    return f'/train_code/problem{problem_num:03d}/problem{problem_num:03d}_{sample_num}.cpp'


def new_sample_generator(
    path: str,
    sample_count: int,
    pair_proportion: float,
    *,
    problem_count: int = 500,
    samples_per_problem: int = 500,
) -> pd.DataFrame:
    """Build a DataFrame of randomly drawn code pairs labelled similar / not similar.

    A "similar" row (``similar == 1``) pairs two different sample files of the
    same problem. A "not similar" row (``similar == 0``) pairs sample files of
    two different problems; their sample numbers are drawn independently and
    may coincide. Files are read from
    ``{path}/train_code/problemNNN/problemNNN_S.cpp``.

    All randomness comes from NumPy's global generator (``np.random``), so call
    ``np.random.seed`` beforehand for reproducible output. With the default
    ``problem_count`` / ``samples_per_problem`` the sequence of random draws is
    the same as in the earlier hard-coded (500 x 500) version.

    Args:
        path: Folder that contains the ``train_code`` folder.
        sample_count: Total number of rows to generate.
        pair_proportion: Fraction of rows with ``similar == 1``; must lie in
            [0, 1]. The number of similar rows is
            ``int(sample_count * pair_proportion)``.
        problem_count: Problems are numbered ``1..problem_count``.
        samples_per_problem: Samples of a problem are numbered
            ``1..samples_per_problem``.

    Returns:
        DataFrame with columns ``code1_path``, ``code2_path`` (paths written
        relative to ``'.'``), ``code1``, ``code2`` (file contents) and
        ``similar``.

    Raises:
        ValueError: If ``sample_count`` is negative or ``pair_proportion`` is
            outside [0, 1]. NumPy also raises ValueError when two distinct
            problems / samples are requested but fewer than two exist.
        OSError: If a selected source file cannot be opened.
    """
    if sample_count < 0:
        raise ValueError(f'sample_count must be non-negative, got {sample_count}')
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= pair_proportion <= 1:
        raise ValueError(f'pair_proportion must be between 0 and 1, got {pair_proportion}')

    # Pre-compute how many similar / not-similar rows are needed.
    pair_count = int(sample_count * pair_proportion)
    not_pair_count = sample_count - pair_count

    # Shuffled label column: 1 = same problem, 0 = different problems.
    labels = np.random.permutation([1] * pair_count + [0] * not_pair_count)

    # Build the candidate id arrays once instead of converting a range to an
    # array on every np.random.choice call.
    problem_ids = np.arange(1, problem_count + 1)
    sample_ids = np.arange(1, samples_per_problem + 1)

    # Problem numbers per row: one problem used twice for a similar row,
    # two distinct problems otherwise.
    problem_pairs = []
    for is_pair in tqdm(labels):
        if is_pair:
            problem = np.random.randint(1, problem_count + 1)
            problem_pairs.append((problem, problem))
        else:
            problem_pairs.append(tuple(np.random.choice(problem_ids, size=2, replace=False)))

    # Sample numbers per row: they must differ inside one problem (otherwise
    # the row would pair a file with itself) but may repeat across problems.
    sample_pairs = [
        tuple(np.random.choice(sample_ids, size=2, replace=not is_pair))
        for is_pair in tqdm(labels)
    ]

    # Each file is usually drawn many times; read it once and share the string.
    source_cache = {}

    def read_source(relative_path):
        if relative_path not in source_cache:
            # The context manager closes the handle right after reading.
            with open(path + relative_path, encoding='utf-8') as source_file:
                source_cache[relative_path] = source_file.read()
        return source_cache[relative_path]

    data = []
    rows = zip(labels, problem_pairs, sample_pairs)
    for is_pair, problems, samples in tqdm(rows, total=sample_count):
        code_path_1 = _problem_code_path(problems[0], samples[0])
        code_path_2 = _problem_code_path(problems[1], samples[1])
        data.append({
            'code1_path': '.' + code_path_1,
            'code2_path': '.' + code_path_2,
            'code1': read_source(code_path_1),
            'code2': read_source(code_path_2),
            'similar': is_pair,
        })

    # Naming the columns keeps the schema stable even when no rows are generated.
    new_df = pd.DataFrame(
        data,
        columns=['code1_path', 'code2_path', 'code1', 'code2', 'similar'],
    )

    return new_df

### generate data

In [5]:
# Generate the full labelled pair dataset using the hyperparameters set above.
new_df = new_sample_generator(
    path=path,
    sample_count=sample_count,
    pair_proportion=pair_proportion,
)

100%|██████████| 1000000/1000000 [00:23<00:00, 43048.43it/s]
100%|██████████| 1000000/1000000 [00:42<00:00, 23684.40it/s]
100%|██████████| 1000000/1000000 [00:34<00:00, 29298.81it/s]


### data visualization

In [6]:
# Notebook cell: the bare expression renders the generated DataFrame as the cell output.
new_df

Unnamed: 0,code1_path,code2_path,code1,code2,similar
0,./train_code/problem368/problem368_42.cpp,./train_code/problem496/problem496_265.cpp,#include<iostream>\n#include<algorithm>\n#incl...,"#include<bits/stdc++.h>\n#define rep(i,n) for(...",0
1,./train_code/problem304/problem304_436.cpp,./train_code/problem304/problem304_270.cpp,#include <bits/stdc++.h> \nusing namespace st...,#include <bits/stdc++.h>\n#define _GLIBCXX_DEB...,1
2,./train_code/problem276/problem276_211.cpp,./train_code/problem123/problem123_406.cpp,#include<iostream>\n#include<algorithm>\n#defi...,#include<iostream>\n#include<string>\n#include...,0
3,./train_code/problem416/problem416_190.cpp,./train_code/problem041/problem041_151.cpp,#include <iostream>\n#include <string>\n#inclu...,#include <iostream>\n#include <string>\n#inclu...,0
4,./train_code/problem006/problem006_66.cpp,./train_code/problem006/problem006_173.cpp,#include<iostream>\n#include<algorithm>\n#incl...,#include <algorithm>\n#include <cstdio>\n#incl...,1
...,...,...,...,...,...
999995,./train_code/problem088/problem088_440.cpp,./train_code/problem088/problem088_489.cpp,#include <iostream>\n#include <cmath>\n\nusing...,//\n// AOJ 0009 Prime Number\n//\n// Created...,1
999996,./train_code/problem132/problem132_146.cpp,./train_code/problem132/problem132_281.cpp,#include <bits/stdc++.h>\nusing namespace std;...,"#pragma GCC optimize(""O3"")\n #pragma GCC opti...",1
999997,./train_code/problem141/problem141_172.cpp,./train_code/problem141/problem141_347.cpp,#include<bits/stdc++.h>\nusing namespace std;\...,#include <iostream>\n#include <vector>\n#inclu...,1
999998,./train_code/problem105/problem105_238.cpp,./train_code/problem172/problem172_252.cpp,"#include <bits/stdc++.h>\n#define rep(i, n) fo...",#include <bits/stdc++.h>\nusing namespace std;...,0


In [7]:
# Print a summary of the DataFrame: row count, per-column non-null counts and dtypes, memory usage.
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   code1_path  1000000 non-null  object
 1   code2_path  1000000 non-null  object
 2   code1       1000000 non-null  object
 3   code2       1000000 non-null  object
 4   similar     1000000 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 38.1+ MB


### dataframe save

In [8]:
# Write the generated pairs to train_2.csv (relative path); index=False omits the row-index column.
new_df.to_csv('train_2.csv', index=False)