In [65]:
import os
from tqdm import tqdm
import pandas as pd
import re
from rank_bm25 import BM25Okapi
from itertools import combinations
import torch
from transformers import AutoTokenizer
from glob import glob


In [66]:
base_path = './open'
train_path = os.path.join(base_path, 'train_code')
data_path = os.path.join(base_path, 'dataframe')

[]

# 전처리 함수

In [67]:
import re

def data_clean(text):
    # 중복 줄 바꿈 제거
    

    # 주석 제거
    text = re.sub(r'//.*', '', text)  # 한 줄 주석 제거
    text = re.sub(r'/\*.*?\*/', '', text, flags=re.DOTALL)  # 여러 줄 주석 제거
    
    # #include 
    text = re.sub(r'#include.*', '', text)

    text = re.sub(r'<vector>', '', text)
    text = re.sub(r'vector<.*?>', '', text)
    text = re.sub(r'typedef.*;', '', text)
    text = re.sub(r'template<.*?>', '', text)
    
    # using namespace std 제거
    text = re.sub(r'using namespace std;', '', text)
    
    # 데이터 타입 형식 제거 (int, long, char 등)
    text = re.sub(r'\b(int|long|char|short|float|double|bool|void|unsigned|signed)\b', '', text)
    
    # 특수 문자 제거 (#)
    
    # const 제거
    text = re.sub(r'\bconst\b', '', text)
    
    # pragma 제거
    text = re.sub(r'#pragma.*', '', text)
    
    text = re.sub(r'\n+', '\n', text)

    text = re.sub(r'\s+', ' ', text)
    text = text.replace('\t', ' ')
    
    # 빈 줄 제거
    text = text.strip()
    
    return text

exam = '''
#include<iostream>
<vector>
using namespace std;
typedef long long li;
#define repa(i,a,n) for(int i=(a);i<(n);i++)
#define rep(i,n) for(int i=0;i<(n);i++)
#define df 0
template<class T> void print(const T& t){ cout << t << "\n"; }
template<class T, class... Ts> void print(const T& t, const Ts&... ts) { cout << t; if (sizeof...(ts)) cout << " "; print(ts...); }

int main(){
  int n; cin >>n;
  vector<int> c(n-1),s(n-1),f(n-1);
  rep(i,n-1){
    cin >>c[i] >>s[i] >>f[i];
  }
  rep(j,n){
    int t=0;
    repa(i,j,n-1){
      //      if(df)print(s[i],f[i]);
      if(t<s[i])t=s[i];
      if(t%f[i])t=(t/f[i]+1)*f[i];
      t+=c[i];
    }
    print(t);
  }
}
'''

cleaned_exam = data_clean(exam)
print(cleaned_exam)


#define repa(i,a,n) for( i=(a);i<(n);i++) #define rep(i,n) for( i=0;i<(n);i++) #define df 0 print( T& t){ cout << t << " "; } print( T& t, Ts&... ts) { cout << t; if (sizeof...(ts)) cout << " "; print(ts...); } main(){ n; cin >>n; c(n-1),s(n-1),f(n-1); rep(i,n-1){ cin >>c[i] >>s[i] >>f[i]; } rep(j,n){ t=0; repa(i,j,n-1){ if(t<s[i])t=s[i]; if(t%f[i])t=(t/f[i]+1)*f[i]; t+=c[i]; } print(t); } }


# p_df(원본 코드를 풀어서 전처리한 후 정렬한 데이터프레임) 생성

In [9]:
import pandas as pd
import os
from tqdm import tqdm
import glob

code_list = []
p_num_list = []

# 파일들의 경로를 가져오기 위해 glob 사용
for p_num, problem in enumerate(tqdm(dir_list), start=1):
    for sol in glob.glob(os.path.join(problem, '*')):
        with open(sol, 'r', encoding='utf-8') as f:
            code = f.read()
            code_list.append(data_clean(code))
            p_num_list.append(p_num)

p_df = pd.DataFrame(data={"code": code_list, "p_num": p_num_list})

# CSV 파일로 저장
p_df.to_csv(os.path.join(data_path, "problem_df_2403290000.csv"), index=False)


100%|██████████| 500/500 [00:25<00:00, 19.81it/s]


In [41]:
pd.set_option('display.max_colwidth', None)

In [10]:
#불러올때
p_df = pd.read_csv(os.path.join(data_path, "problem_df_2403280000.csv"))

In [11]:
p_df.head()

Unnamed: 0,code,p_num
0,"#define for(i, n) for( i = 0; i < ()n; ++i) #d...",1
1,using ul = ; using ull = ; #define vec vect...,1
2,using ll = ; using ull = ; using ld = ; #d...,1
3,"#define endl '\n'; main() { xi, r, d, x2000,...",1
4,#define FAST ios_base::sync_with_stdio(0); cin...,1


# pair 생성 함수

In [13]:
from itertools import combinations
import random

def get_pair(inputs, tokenizer):
  codes = inputs['code'].to_list()
  problems = inputs['p_num'].unique().tolist()
  problems.sort()

  tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
  bm25 = BM25Okapi(tokenized_corpus)

  total_positive_pairs = []
  total_negative_pairs = []

  for problem in tqdm(problems):
    solution_codes = inputs[inputs['p_num'] == problem]['code']
    positive_pairs = list(combinations(solution_codes.to_list(), 2))
    positive_pairs = random.sample(positive_pairs, len(positive_pairs) // 20)

    solution_codes_indices = solution_codes.index.to_list()
    negative_pairs = []

    first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
    negative_code_scores = bm25.get_scores(first_tokenized_code)
    negative_code_ranking = negative_code_scores.argsort()[::-1]
    ranking_idx = 0

    for solution_code in solution_codes:
      negative_solutions = []
      while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
        high_score_idx = negative_code_ranking[ranking_idx]

        if high_score_idx not in solution_codes_indices:
          negative_solutions.append(inputs['code'].iloc[high_score_idx])
        ranking_idx += 1

      for negative_solution in negative_solutions:
        negative_pairs.append((solution_code, negative_solution))

    total_positive_pairs.extend(positive_pairs)
    total_negative_pairs.extend(negative_pairs)

  positive_code1 = list(map(lambda x:x[0], total_positive_pairs))
  positive_code2 = list(map(lambda x:x[1], total_positive_pairs))

  negative_code1 = list(map(lambda x:x[0], total_negative_pairs))
  negative_code2 = list(map(lambda x:x[1], total_negative_pairs))

  positive_label = [1] * len(positive_code1)
  negative_label = [0] * len(negative_code1)

  positive_code1.extend(negative_code1)
  positive_code2.extend(negative_code2)
  positive_label.extend(negative_label)

  pair_data = pd.DataFrame(data = {
      'code1' : positive_code1,
      'code2' : positive_code2,
      'similar' : positive_label
  })

  pair_data = pair_data.sample(frac=1).reset_index(drop=True)

  return pair_data



In [16]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    p_df,
    test_size = 0.05,
    random_state = 42,
    stratify = p_df['p_num']
)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

train_df

Unnamed: 0,code,p_num
0,"maxN = 1e5 + 13; n, mx, num, a[maxN], mn = 1...",461
1,"#define rep(i, n) for ( i = 0; i < ()(n); ++i)...",199
2,using ll = ; using ld = ; #define INF 1e12 #...,92
3,using ll = ; main() { t; cin >> t; a = ()t...,309
4,"#define REP(i, n) for( i = 0; i < n; i++) #def...",448
...,...,...
237495,"#define rep(i,n) for ( i = 0; i < (n); i++) #d...",104
237496,solve1() { n; cin >> n; a(n); dp(n); for ( i...,272
237497,"#define rep(i,n) for( i = 0; i < ()(n); ++i) #...",105
237498,using llint = ; n; va; using vecitr = ::ite...,461


# bm_25 train,val 생성, 1차 토큰화

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('neulab/codebert-cpp')
tokenizer.truncation_side = 'left'

# train_df와 val_df 데이터프레임 복사
train_df_copy = train_df.copy()
val_df_copy = val_df.copy()

# 토크나이저를 사용하여 코드를 토큰화
train_df_copy['code_tokenized'] = train_df_copy['code'].apply(lambda x: tokenizer.tokenize(x))
val_df_copy['code_tokenized'] = val_df_copy['code'].apply(lambda x: tokenizer.tokenize(x))

# 코드 길이를 최대 512로 제한
train_df_copy['code_tokenized'] = train_df_copy['code_tokenized'].apply(lambda x: x[:512])
val_df_copy['code_tokenized'] = val_df_copy['code_tokenized'].apply(lambda x: x[:512])

# 새로운 데이터프레임을 사용하여 pair 생성 함수 호출
bm25_train_df = get_pair(train_df_copy, tokenizer)

# 결과 출력
bm25_train_df


tokenizer_config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (644 > 512). Running this sequence through the model will result in indexing errors
  7%|▋         | 37/500 [16:06<3:21:40, 26.13s/it]

KeyboardInterrupt



In [56]:
def generate_pairs(inputs, tokenizer):
    total_positive_pairs = []
    total_negative_pairs = []

    for problem in tqdm(inputs['p_num'].unique(), desc="Generating pairs"):
        solution_codes = inputs[inputs['p_num'] == problem]['code']
        if len(solution_codes) < 2:
            continue  # 솔루션 코드가 2개 미만인 경우 쌍을 생성할 수 없음

        # 긍정적인 쌍 생성
        positive_pairs = list(combinations(solution_codes.tolist(), 2))
        total_positive_pairs.extend(positive_pairs)

        # 부정적인 쌍 생성
        for solution_code in solution_codes:
            other_solution_codes = solution_codes[solution_codes != solution_code]
            for other_solution_code in other_solution_codes:
                total_negative_pairs.append((solution_code, other_solution_code))

    return total_positive_pairs, total_negative_pairs



def get_pair_val(val_df, tokenizer):
    # 코드 쌍 생성
    positive_pairs, negative_pairs = generate_pairs(val_df, tokenizer)

    # 데이터프레임 생성
    if len(positive_pairs) == 0 or len(negative_pairs) == 0:
        return pd.DataFrame(columns=['code1', 'code2', 'similar'])

    pair_data = pd.DataFrame({
        'code1': [],
        'code2': [],
        'similar': []
    })

    # 데이터프레임에 데이터 추가
    pair_data['code1'] = [pair[0] for pair in positive_pairs] + [pair[0] for pair in negative_pairs]
    pair_data['code2'] = [pair[1] for pair in positive_pairs] + [pair[1] for pair in negative_pairs]
    pair_data['similar'] = [1] * len(positive_pairs) + [0] * len(negative_pairs)

    # 데이터프레임을 임의로 섞음
    pair_data = pair_data.sample(frac=1).reset_index(drop=True)

    return pair_data


In [57]:
bm25_val_df = get_pair_val(val_df_copy, tokenizer)

Generating pairs: 100%|█████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 796.38it/s]


In [30]:
bm25_train_df.to_csv(os.path.join(data_path, "bm25_train_df_2403262033.csv"), index=False)
bm25_val_df.to_csv(os.path.join(data_path, "bm25_val_df_2403262033.csv"), index=False)

NameError: name 'bm25_train_df' is not defined

In [60]:
bm25_train_df

Unnamed: 0,code1,code2,similar
0,"Main() { A, B, C, D; cin >> A >> B >> C >> D...",#define fi first #define se second ll MOD = 1e...,0
1,"#define for(i, n) for ( i = 0; i < ()(n); i++)...","#define for(i, n) for (ll i = 0; i < (ll)(n); ...",0
2,"#define REP(i,n) for( i=0, i##_len=(n); i<i##_...","#define for(i,n) for ( i = 0; i < (n); ++i) us...",1
3,"main() { c[1001]; i, sum = 0; x; while (1...","och[] = { '0','i','x','c','m' }; getn( * s)...",0
4,main(){ a(1000001); k; cin>>k; for( i=1;i<=1...,"#define for(i, n) for ( i = 0; i < (n); i++) #...",0
...,...,...,...
6026421,#define ll #define ld #define pb push_back...,"ll n,len,q; ll num; ll block; ll a[100010]; ll...",1
6026422,"main() { string str = """"; stack <> R; while (...",main() { a; array; for(;cin >> a;) { if(a==0...,1
6026423,#define pb push_back #define pu push #define l...,#define vi #define pb push_back #define ff fir...,0
6026424,"#define endl ""\n"" #define sz(x) ((ll)(x).size(...","chmax( T &a, T &b ) { if ( a <= b ) { a = b; ...",1


In [62]:
# 해당 코드는 test.df 생성코드

import pandas as pd
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('neulab/codebert-cpp')
tokenizer.truncation_side = 'left'

# test.csv 파일을 읽어들임
test_df = pd.read_csv(os.path.join(base_path, 'test.csv'))

# 데이터프레임을 복사
test_df_copy = test_df.copy()

# 코드를 토큰화하여 새로운 열에 저장
test_df_copy['code1_tokenized'] = test_df_copy['code1'].apply(lambda x: tokenizer.tokenize(x)[:512])
test_df_copy['code2_tokenized'] = test_df_copy['code2'].apply(lambda x: tokenizer.tokenize(x)[:512])

# 토큰화된 결과를 CSV 파일에 저장
test_df_copy.to_csv(os.path.join(data_path, "test_df_2403280000.csv"), index=False)


Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


In [5]:
test_df

Unnamed: 0,pair_id,code1,code2
0,TEST_000000,#include <bits/stdc++.h>\nusing namespace std;...,"#include <bits/stdc++.h>\n#define rep(i, n) fo..."
1,TEST_000001,"#include<bits/stdc++.h>\n#define rep(i,n)for(i...",// //bitset操作\n// #include <iostream>\n// #inc...
2,TEST_000002,#include <bits/stdc++.h>\nusing namespace std;...,#include <bits/stdc++.h>\n#include <ext/pb_ds/...
3,TEST_000003,#include <bits/stdc++.h>\nusing namespace std;...,#include <bits/stdc++.h>\nusing namespace std;...
4,TEST_000004,#include<bits/stdc++.h>\nusing namespace std;\...,#include<iostream>\n#include<algorithm>\n#incl...
...,...,...,...
594995,TEST_594995,#include <bits/stdc++.h>\n#include <unordered_...,#include <bits/stdc++.h>\n#define ALL(a) (a).b...
594996,TEST_594996,#include <bits/stdc++.h>\nusing namespace std;...,#include <bits/stdc++.h>\nusing namespace std;...
594997,TEST_594997,#include <algorithm>\n#include <bits/stdc++.h>...,"#include ""bits/stdc++.h""\n\nusing namespace st..."
594998,TEST_594998,#include <iostream>\n#include <vector>\n#inclu...,#include <iostream>\n#include <math.h>\nusing ...


# 학습 준비

In [70]:

import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import torch
import transformers

from glob import glob
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, AutoModel, AutoModelForSequenceClassification,DataCollatorForTokenClassification,EarlyStoppingCallback
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric, Dataset

from tqdm import tqdm
from tqdm import trange
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [71]:
device

'cuda'

In [72]:
data_path = './open/dataframe/'


train_df = pd.read_csv(os.path.join(data_path, "bm25_train_df_2403262033.csv"))
val_df = pd.read_csv(os.path.join(data_path, "bm25_val_df_2403262033.csv"))
train_df = train_df.rename(columns={'similar': 'label'})
val_df = val_df.rename(columns={'similar': 'label'})

In [73]:
train_df

Unnamed: 0,code1,code2,label
0,"#define rep(i, n) for(ll i = 0; i < (n); i++) ...","main(){ N, K; scanf(""%d"", &N); scanf(""%d"", &K)...",1
1,#define a[15]; main() { for ( i=1;i<=12;i++) c...,"main(){ a1,a2,a3,a4,b1,b2,b3,b4,b5,b6,b7,b8; c...",1
2,#define INF 1e+9 main(){ dp[1001][1001] = {}; ...,"#define rep(i, n) for (ll i = 0; i < (n); ++i)...",0
3,#define INF 999999999999999997 #define MP make...,"main(){ n,m; cin >> n >> m; ac,tle; ac=n-m; tl...",1
4,using ll = ; INFint = 2e9+1; ll INFll = 2e18+1...,"#define repl(i, l, r) for (ll i = (l); i < (r)...",0
...,...,...,...
5426495,using ll= ; #define _CRT_SECURE_NO_WARNINGS #d...,"#define SORT(c) sort((c).begin(),(c).end()) #d...",0
5426496,#define mp make_pair #define pb push_back #def...,#define maxn 200010 maxm=1<<26; #define INF 0x...,0
5426497,"main(){ n,leg[2]={0,0},count=0; state=false; f...","main() { string a[100]; b; while (cin >> b, b)...",1
5426498,"> G[100000]; ll K; d(100000, LLONG_MAX); prior...","#define REP(i,n) for( i=0, i##_len=(n); i<i##_...",0


# 300만 샘플 추출

In [74]:
train_df = train_df.sample(n = 3000000, replace = True).reset_index(drop=True)
val_df = val_df.sample(n= 1000, replace = True).reset_index(drop = True)
dataset_train = Dataset.from_pandas(train_df)
dataset_val = Dataset.from_pandas(val_df)

In [75]:
train_df

Unnamed: 0,code1,code2,label
0,"#define rep(i, n) for((i) = 0; (i) < (n); (i)+...","max_n = 1011, inf = 1000111222; mabs( x) { if ...",0
1,"#define rep(i,n) for ( i = 0; i < (n); ++i) us...","#define rep(i,n) for( i=0;i<(n);++i) #define A...",0
2,#define N ( )(1e9 + 7) #define MAX 500000 fact...,"#define FOR(i, b, e) for(ll i = (ll)(b); i < (...",1
3,"#define rep(i,a,b) for( i=a;i<b;i++) #define r...",using boost::multiprecision::cpp_int; using ll...,0
4,main() { cin.tie(0); ios::sync_with_stdio(fals...,"#define rep(i, s, e) for ( i = s; i < e; ++i) ...",1
...,...,...,...
2999995,INF = 1e9; MOD = 1e9+7; LINF = 1e18; #define d...,"#define rep(i,j,n) for( i=(j);i<(n);i++) #defi...",0
2999996,"#define rep(i,n) for ( i = 0;i < n;i++) using ...","#define rep0(i, n) for ( i = 0; i < ()(n); i++...",1
2999997,#define ll #define pb push_back #define mk mak...,main() { simap words; std::string input; while...,1
2999998,"MAX = 510000; MOD = 1000000007; #define rep(i,...",LG = 21; FN = 400005; MOD = 1e9 + 7; INF = 1e9...,1


# 모델 준비, 2차 토큰화

In [76]:
import random

#model_name = 'neulab/codebert-cpp'
model_name = 'microsoft/codereviewer'
wd = 0.01
batch_size = 16
lr = 2e-5
epochs = 1
task = 'binary_classification'
label_list = ['0', '1']
num_labels = 2

def seed_everything(seed:42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)


Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at microsoft/codereviewer and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [77]:
def tokenize_function(examples):
    tokenizer.truncation_side = 'left'
    return tokenizer(examples["code1"], examples["code2"],padding="max_length", max_length = 512, truncation=True)

In [78]:
tokenized_train_datasets= dataset_train.map(tokenize_function, batched=True)
tokenized_val_datasets = dataset_val.map(tokenize_function, batched=True)

Map:   0%|          | 0/3000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

# 학습 시작

In [79]:
args = TrainingArguments(
    output_dir = './open/codereviewer',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate= lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    seed = 42,
    weight_decay=wd,
    load_best_model_at_end=True,
    logging_dir = './open/codereviewer'
)


def compute_metrics(pred):
    labels = pred.label_ids
    preds = []
    for i in pred.predictions:
      preds.append(np.argmax(i, axis=1).flatten())
    print(preds)
    acc = accuracy_score(labels, preds[0])
    return {'accuracy': acc}

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.013,0.029452,0.994


[array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

There were missing keys in the checkpoint model loaded: ['transformer.encoder.embed_tokens.weight', 'transformer.decoder.embed_tokens.weight'].


TrainOutput(global_step=187500, training_loss=0.039946160835266115, metrics={'train_runtime': 73568.4794, 'train_samples_per_second': 40.778, 'train_steps_per_second': 2.549, 'total_flos': 1.83233074176e+18, 'train_loss': 0.039946160835266115, 'epoch': 1.0})

# 학습 완료한 모델 저장

In [80]:
base_path = './open/codereviewer'

torch.save(model, os.path.join(base_path + '/model0401.pt'))  # 전체 모델 저장

In [81]:
base_path = './open/codereviewer'
data_path = './open/dataframe'

In [82]:
model = torch.load(os.path.join(base_path, 'model0401.pt'))

tokenizer = AutoTokenizer.from_pretrained(model_name)

test_df = pd.read_csv(os.path.join(data_path,'test_df_2403262033.csv'))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [83]:
test_df = test_df.drop(columns = ['pair_id', 'code1_tokenized', 'code2_tokenized'], axis = 1)

In [84]:
for i in range(len(test_df)):
  if i % 10000 == 0:
    print(i)
  test_df.iloc[i]['code1'] = data_clean(test_df.iloc[i]['code1'])
  test_df.iloc[i]['code2'] = data_clean(test_df.iloc[i]['code2'])

0


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  test_df.iloc[i]['code1'] = data_clean(test_df.iloc[i]['code1'])
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update t

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000


In [85]:
args = TrainingArguments(
    output_dir = './open',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate= lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    seed = 42,
    weight_decay=wd,
    load_best_model_at_end=True,
    logging_dir = './open'
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = []
    for i in pred.predictions:
      preds.append(np.argmax(i, axis=1).flatten())
    print(preds)
    acc = accuracy_score(labels, preds[0])
    return {'accuracy': acc}

trainer = Trainer(
    model,
    args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


# 예측

In [86]:
predictions = []
for i in trange(len(test_df) // 5000):
  pred = []
  dataset_test = Dataset.from_pandas(test_df[i*5000:(i+1)*5000])
  tokenized_test_datasets = dataset_test.map(tokenize_function, batched=True)
  predictions_test = trainer.predict(tokenized_test_datasets)

  for j in predictions_test.predictions:
    pred.append(np.argmax(j, axis=1).flatten())

  predictions.extend(pred[0])

len(predictions)

  0%|          | 0/119 [00:00<?, ?it/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  1%|          | 1/119 [00:57<1:52:21, 57.13s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  2%|▏         | 2/119 [01:50<1:47:10, 54.96s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  3%|▎         | 3/119 [02:43<1:44:47, 54.20s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  3%|▎         | 4/119 [03:37<1:43:11, 53.84s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  4%|▍         | 5/119 [04:30<1:41:52, 53.62s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  5%|▌         | 6/119 [05:23<1:40:49, 53.54s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  6%|▌         | 7/119 [06:17<1:39:50, 53.48s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  7%|▋         | 8/119 [07:10<1:38:53, 53.45s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  8%|▊         | 9/119 [08:03<1:37:57, 53.43s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  8%|▊         | 10/119 [08:57<1:37:03, 53.43s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  9%|▉         | 11/119 [09:51<1:36:19, 53.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 10%|█         | 12/119 [10:44<1:35:24, 53.50s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 11%|█         | 13/119 [11:37<1:34:29, 53.49s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 12%|█▏        | 14/119 [12:31<1:33:36, 53.49s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 13%|█▎        | 15/119 [13:24<1:32:41, 53.48s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 13%|█▎        | 16/119 [14:18<1:31:47, 53.47s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 14%|█▍        | 17/119 [15:11<1:30:54, 53.47s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 15%|█▌        | 18/119 [16:05<1:30:02, 53.49s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 16%|█▌        | 19/119 [16:58<1:29:08, 53.48s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 17%|█▋        | 20/119 [17:52<1:28:16, 53.50s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 18%|█▊        | 21/119 [18:45<1:27:21, 53.49s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 18%|█▊        | 22/119 [19:39<1:26:26, 53.47s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 19%|█▉        | 23/119 [20:32<1:25:30, 53.44s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 20%|██        | 24/119 [21:26<1:24:36, 53.43s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 21%|██        | 25/119 [22:19<1:23:43, 53.44s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 22%|██▏       | 26/119 [23:12<1:22:50, 53.44s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 23%|██▎       | 27/119 [24:06<1:22:06, 53.55s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 24%|██▎       | 28/119 [25:00<1:21:08, 53.50s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 24%|██▍       | 29/119 [25:54<1:20:45, 53.84s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 25%|██▌       | 30/119 [26:48<1:19:38, 53.69s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 26%|██▌       | 31/119 [27:41<1:18:34, 53.57s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 27%|██▋       | 32/119 [28:34<1:17:35, 53.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 28%|██▊       | 33/119 [29:28<1:16:35, 53.44s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 29%|██▊       | 34/119 [30:21<1:15:37, 53.39s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 29%|██▉       | 35/119 [31:14<1:14:43, 53.37s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 30%|███       | 36/119 [32:07<1:13:45, 53.33s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 31%|███       | 37/119 [33:01<1:12:52, 53.32s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 32%|███▏      | 38/119 [33:54<1:12:10, 53.47s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 33%|███▎      | 39/119 [34:48<1:11:10, 53.38s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 34%|███▎      | 40/119 [35:41<1:10:14, 53.35s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 34%|███▍      | 41/119 [36:34<1:09:20, 53.34s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 35%|███▌      | 42/119 [37:28<1:08:30, 53.38s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 36%|███▌      | 43/119 [38:21<1:07:37, 53.39s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 37%|███▋      | 44/119 [39:15<1:06:43, 53.38s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 38%|███▊      | 45/119 [40:08<1:05:57, 53.48s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 39%|███▊      | 46/119 [41:01<1:04:59, 53.42s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 39%|███▉      | 47/119 [41:55<1:04:06, 53.42s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 40%|████      | 48/119 [42:48<1:03:13, 53.43s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 41%|████      | 49/119 [43:42<1:02:21, 53.45s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 42%|████▏     | 50/119 [44:35<1:01:28, 53.46s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 43%|████▎     | 51/119 [45:29<1:00:35, 53.46s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 44%|████▎     | 52/119 [46:22<59:42, 53.47s/it]  

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 45%|████▍     | 53/119 [47:16<58:49, 53.47s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 45%|████▌     | 54/119 [48:11<58:22, 53.88s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 46%|████▌     | 55/119 [49:04<57:22, 53.78s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 47%|████▋     | 56/119 [49:58<56:24, 53.72s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 48%|████▊     | 57/119 [50:51<55:23, 53.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 49%|████▊     | 58/119 [51:46<55:00, 54.10s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 50%|████▉     | 59/119 [52:40<53:55, 53.92s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 50%|█████     | 60/119 [53:33<52:52, 53.76s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 51%|█████▏    | 61/119 [54:28<52:18, 54.11s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 52%|█████▏    | 62/119 [55:22<51:11, 53.89s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 53%|█████▎    | 63/119 [56:15<50:11, 53.77s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 54%|█████▍    | 64/119 [57:08<49:11, 53.66s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 55%|█████▍    | 65/119 [58:02<48:12, 53.56s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 55%|█████▌    | 66/119 [58:55<47:14, 53.48s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 56%|█████▋    | 67/119 [59:48<46:18, 53.44s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 57%|█████▋    | 68/119 [1:00:42<45:27, 53.49s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 58%|█████▊    | 69/119 [1:01:36<44:35, 53.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 59%|█████▉    | 70/119 [1:02:29<43:47, 53.62s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 60%|█████▉    | 71/119 [1:03:23<42:53, 53.61s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 61%|██████    | 72/119 [1:04:17<41:58, 53.58s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 61%|██████▏   | 73/119 [1:05:10<41:04, 53.57s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 62%|██████▏   | 74/119 [1:06:04<40:09, 53.55s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 63%|██████▎   | 75/119 [1:06:57<39:16, 53.56s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 64%|██████▍   | 76/119 [1:07:51<38:22, 53.54s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 65%|██████▍   | 77/119 [1:08:44<37:28, 53.53s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 66%|██████▌   | 78/119 [1:09:38<36:33, 53.51s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 66%|██████▋   | 79/119 [1:10:31<35:40, 53.50s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 67%|██████▋   | 80/119 [1:11:25<34:52, 53.66s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 68%|██████▊   | 81/119 [1:12:19<33:56, 53.60s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 69%|██████▉   | 82/119 [1:13:12<33:02, 53.57s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 70%|██████▉   | 83/119 [1:14:06<32:07, 53.55s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 71%|███████   | 84/119 [1:15:00<31:18, 53.67s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 71%|███████▏  | 85/119 [1:15:53<30:23, 53.64s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 72%|███████▏  | 86/119 [1:16:48<29:42, 54.02s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 73%|███████▎  | 87/119 [1:17:42<28:44, 53.89s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 74%|███████▍  | 88/119 [1:18:35<27:47, 53.81s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 75%|███████▍  | 89/119 [1:19:30<27:06, 54.23s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 76%|███████▌  | 90/119 [1:20:24<26:08, 54.07s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 76%|███████▋  | 91/119 [1:21:18<25:10, 53.95s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 77%|███████▋  | 92/119 [1:22:11<24:14, 53.87s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 78%|███████▊  | 93/119 [1:23:05<23:18, 53.77s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 79%|███████▉  | 94/119 [1:24:00<22:35, 54.20s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 80%|███████▉  | 95/119 [1:24:54<21:37, 54.04s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 81%|████████  | 96/119 [1:25:48<20:40, 53.92s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 82%|████████▏ | 97/119 [1:26:41<19:44, 53.86s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 82%|████████▏ | 98/119 [1:27:35<18:49, 53.77s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 83%|████████▎ | 99/119 [1:28:28<17:54, 53.72s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 84%|████████▍ | 100/119 [1:29:24<17:08, 54.15s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 85%|████████▍ | 101/119 [1:30:18<16:13, 54.10s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 86%|████████▌ | 102/119 [1:31:11<15:15, 53.88s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 87%|████████▋ | 103/119 [1:32:04<14:19, 53.73s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 87%|████████▋ | 104/119 [1:32:58<13:24, 53.64s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 88%|████████▊ | 105/119 [1:33:51<12:29, 53.57s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 89%|████████▉ | 106/119 [1:34:44<11:35, 53.48s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 90%|████████▉ | 107/119 [1:35:38<10:41, 53.43s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 91%|█████████ | 108/119 [1:36:31<09:47, 53.41s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 92%|█████████▏| 109/119 [1:37:24<08:53, 53.39s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 92%|█████████▏| 110/119 [1:38:18<08:00, 53.39s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 93%|█████████▎| 111/119 [1:39:11<07:07, 53.43s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 94%|█████████▍| 112/119 [1:40:05<06:13, 53.39s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 95%|█████████▍| 113/119 [1:40:59<05:22, 53.82s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 96%|█████████▌| 114/119 [1:41:53<04:28, 53.78s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 97%|█████████▋| 115/119 [1:42:46<03:34, 53.65s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 97%|█████████▋| 116/119 [1:43:41<02:41, 53.99s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 98%|█████████▊| 117/119 [1:44:35<01:47, 53.82s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

 99%|█████████▉| 118/119 [1:45:28<00:53, 53.66s/it]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

100%|██████████| 119/119 [1:46:21<00:00, 53.63s/it]


595000

In [87]:
sub = pd.read_csv(os.path.join('./open', 'sample_submission.csv'))
sub['similar'] = predictions

sub.to_csv(os.path.join(base_path, 'submission_0401.csv'), index = False)