In [1]:
import os
from tqdm import tqdm
import pandas as pd
import re
from rank_bm25 import BM25Okapi
from itertools import combinations
import torch
from transformers import AutoTokenizer
import numpy as np
import random

random.seed(42)
np.random.seed(42)

In [2]:
# GPU 사용 가능 여부를 확인합니다.
if torch.cuda.is_available():
    # 현재 사용 가능한 GPU 디바이스의 개수를 가져옵니다.
    device_count = torch.cuda.device_count()
    print(f"GPU를 사용할 수 있습니다. 사용 가능한 GPU 디바이스 개수: {device_count}")
    
    # 각 GPU 디바이스의 정보를 출력합니다.
    for i in range(device_count):
        device_name = torch.cuda.get_device_name(i)
        print(f"GPU 디바이스 {i}: {device_name}")
else:
    print("GPU를 사용할 수 없습니다.")


GPU를 사용할 수 있습니다. 사용 가능한 GPU 디바이스 개수: 1
GPU 디바이스 0: NVIDIA GeForce RTX 3050 Laptop GPU


In [2]:
val = pd.read_csv("C:\\Users\\82102\\Desktop\\Machine_Learning\\DACON_materials\\Code_Similarity\\sample_train.csv")
val.head()

Unnamed: 0,code1_path,code2_path,code1,code2,similar
0,./train_code/problem393/problem393_19.cpp,./train_code/problem033/problem033_439.cpp,#include <bits/stdc++.h>\n\nusing namespace st...,#include <algorithm>\n#include <bitset>\n#incl...,0
1,./train_code/problem019/problem019_210.cpp,./train_code/problem019/problem019_63.cpp,#include <iostream>\n\nusing namespace std;\n\...,#include <iostream>\n#include <string>\nusing ...,1
2,./train_code/problem107/problem107_486.cpp,./train_code/problem107/problem107_340.cpp,#include <iostream>\n#include <vector>\nusing ...,#include <cstdio>\n#include <cstdlib>\n#includ...,1
3,./train_code/problem187/problem187_257.cpp,./train_code/problem403/problem403_135.cpp,#include <bits/stdc++.h>\n#include <unordered_...,#include <bits/stdc++.h>\nusing namespace std;...,0
4,./train_code/problem173/problem173_490.cpp,./train_code/problem173/problem173_345.cpp,#include <bits/stdc++.h>\ntypedef long long ll...,"#include ""bits/stdc++.h""\n#define rep(i,n) for...",1


In [42]:
val.isnull().sum()

code1_path    0
code2_path    0
code1         0
code2         0
similar       0
dtype: int64

In [4]:
val['similar'].value_counts()

similar
0    10000
1    10000
Name: count, dtype: int64

# 추가 데이터 생성

In [5]:
path = '.' # train_code 폴더의 바로 위 폴더까지의 경로. 현재 환경에 train_code 폴더가 있다면 '.'이면 되고, 아니라면: ex) C:/.../Code_Similarity/train_code 라면 'C:/.../Code_Similarity'
sample_count = 20000 # 만들고싶은 총 샘플 데이터 수
pair_proportion = 0.5 # 같은 문제를 해결하는 코드끼리 묶여있는 짝의 비율(similar가 1인 샘플의 비율. 0~1사이의 실수)

In [6]:
def new_sample_generator(path, sample_count, pair_proportion):
    # pair 및 non-pair count 사전 계산
    pair_count = int(sample_count * pair_proportion)
    not_pair_count = sample_count - pair_count
    
    # similar 쌍과 not similar 쌍을 미리 생성
    sample_similar_list = np.random.permutation([1] * pair_count + [0] * not_pair_count)
    
    # 문제 번호 및 샘플 번호 생성
    problem_num_diction = [
        np.random.randint(1, 501) if boolean else list(np.random.choice(range(1, 501), size=2, replace=False)) 
        for boolean in tqdm(sample_similar_list)
    ]
    
    sample_num_diction = [
        list(np.random.choice(range(1, 501), size=2, replace=False if boolean else True)) 
        for boolean in tqdm(sample_similar_list)
    ]
    
    data = []

    for i in tqdm(range(sample_count)):
        boolean = sample_similar_list[i]
        if boolean:
            code_path_1 = f'/train_code/problem{problem_num_diction[i]:03d}/problem{problem_num_diction[i]:03d}_{sample_num_diction[i][0]}.cpp'
            code_path_2 = f'/train_code/problem{problem_num_diction[i]:03d}/problem{problem_num_diction[i]:03d}_{sample_num_diction[i][1]}.cpp'
        else:
            code_path_1 = f'/train_code/problem{problem_num_diction[i][0]:03d}/problem{problem_num_diction[i][0]:03d}_{sample_num_diction[i][0]}.cpp'
            code_path_2 = f'/train_code/problem{problem_num_diction[i][1]:03d}/problem{problem_num_diction[i][1]:03d}_{sample_num_diction[i][1]}.cpp'
        
        code_1 = open(path + code_path_1, encoding='utf-8').read()
        code_2 = open(path + code_path_2, encoding='utf-8').read()
        data.append({'code1_path': '.' + code_path_1, 'code2_path': '.' + code_path_2, 'code1': code_1, 'code2': code_2, 'similar': boolean})

    # dict을 DataFrame으로 변환
    new_df = pd.DataFrame(data)
    
    return new_df

In [7]:
new_df = new_sample_generator(path, sample_count, pair_proportion)

100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 35653.87it/s]
100%|█████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 20855.86it/s]
100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [02:30<00:00, 133.24it/s]


In [32]:
new_df.to_csv("./new_df.csv", index=False)

In [3]:
# new_df = pd.read_csv('./new_df.csv')

In [6]:
train = pd.concat([val, new_df], ignore_index=True)

In [7]:
train

Unnamed: 0,code1_path,code2_path,code1,code2,similar
0,./train_code/problem393/problem393_19.cpp,./train_code/problem033/problem033_439.cpp,#include <bits/stdc++.h>\n\nusing namespace st...,#include <algorithm>\n#include <bitset>\n#incl...,0
1,./train_code/problem019/problem019_210.cpp,./train_code/problem019/problem019_63.cpp,#include <iostream>\n\nusing namespace std;\n\...,#include <iostream>\n#include <string>\nusing ...,1
2,./train_code/problem107/problem107_486.cpp,./train_code/problem107/problem107_340.cpp,#include <iostream>\n#include <vector>\nusing ...,#include <cstdio>\n#include <cstdlib>\n#includ...,1
3,./train_code/problem187/problem187_257.cpp,./train_code/problem403/problem403_135.cpp,#include <bits/stdc++.h>\n#include <unordered_...,#include <bits/stdc++.h>\nusing namespace std;...,0
4,./train_code/problem173/problem173_490.cpp,./train_code/problem173/problem173_345.cpp,#include <bits/stdc++.h>\ntypedef long long ll...,"#include ""bits/stdc++.h""\n#define rep(i,n) for...",1
...,...,...,...,...,...
39995,./train_code/problem068/problem068_177.cpp,./train_code/problem468/problem468_105.cpp,#include <iostream>\n#include <algorithm>\n#in...,"#include<cstdio>\nint main()\n{\n\tint a,b;\n\...",0
39996,./train_code/problem174/problem174_15.cpp,./train_code/problem191/problem191_411.cpp,#include<bits/stdc++.h>\nusing namespace std;\...,//============================================...,0
39997,./train_code/problem326/problem326_458.cpp,./train_code/problem326/problem326_224.cpp,#include <bits/stdc++.h>\nusing namespace std;...,#include<bits/stdc++.h>\nusing namespace std;\...,1
39998,./train_code/problem029/problem029_256.cpp,./train_code/problem029/problem029_306.cpp,#include <iostream>\nusing namespace std;\n\ni...,#include<iostream>\n#define MAX 100\nusing nam...,1


In [8]:
train.isnull().sum()

code1_path    0
code2_path    0
code1         0
code2         0
similar       0
dtype: int64

# 전처리

In [31]:
val_copy = train[['code1', 'code2', 'similar']]

## 주석 제거

In [32]:
for i in tqdm(range(len(val_copy))):
    txt = val_copy['code1'][i]
    two_geul = [0, 0]
    juseok_flag = False
    new_txt = ""
    for j in range(len(txt)):
        two_geul[0] = two_geul[1]
        two_geul[1] = txt[j]
        if two_geul == ['/', '/']:
            new_txt = new_txt[:-1]
            juseok_flag = True
        if juseok_flag:
            if two_geul[1] == '\n':
                new_txt += txt[j]
                juseok_flag = False
        else:
            new_txt += txt[j]
    val_copy.loc[i, 'code1'] = new_txt

100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:30<00:00, 1332.32it/s]


In [33]:
for i in tqdm(range(len(val_copy))):
    txt = val_copy['code2'][i]
    two_geul = [0, 0]
    juseok_flag = False
    new_txt = ""
    for j in range(len(txt)):
        two_geul[0] = two_geul[1]
        two_geul[1] = txt[j]
        if two_geul == ['/', '/']:
            new_txt = new_txt[:-1]
            juseok_flag = True
        if juseok_flag:
            if two_geul[1] == '\n':
                new_txt += txt[j]
                juseok_flag = False
        else:
            new_txt += txt[j]
    val_copy.loc[i, 'code2'] = new_txt

100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:29<00:00, 1345.75it/s]


In [34]:
for i in tqdm(range(len(val_copy))):
    txt = val_copy['code1'][i]
    two_geul = [0, 0]
    juseok_flag = False
    new_txt = ""
    for j in range(len(txt)):
        two_geul[0] = two_geul[1]
        two_geul[1] = txt[j]
        if two_geul == ['/', '*']:
            new_txt = new_txt[:-1]
            juseok_flag = True
        if juseok_flag:
            if two_geul == ['*', '/']:
                juseok_flag = False
        else:
            new_txt += txt[j]
    val_copy.loc[i, 'code1'] = new_txt

100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:29<00:00, 1374.76it/s]


In [35]:
for i in tqdm(range(len(val_copy))):
    txt = val_copy['code2'][i]
    two_geul = [0, 0]
    juseok_flag = False
    new_txt = ""
    for j in range(len(txt)):
        two_geul[0] = two_geul[1]
        two_geul[1] = txt[j]
        if two_geul == ['/', '*']:
            new_txt = new_txt[:-1]
            juseok_flag = True
        if juseok_flag:
            if two_geul == ['*', '/']:
                juseok_flag = False
        else:
            new_txt += txt[j]
    val_copy.loc[i, 'code2'] = new_txt

100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:28<00:00, 1384.48it/s]


## #으로 시작하는거 제거

### define 남기기

In [36]:
# for i in tqdm(range(len(val_copy))):
#     txt = val_copy['code1'][i]
#     two_geul = [0, 0]
#     shap_flag = False
#     new_txt = ""
#     for j in range(len(txt)):
#         two_geul[0] = two_geul[1]
#         two_geul[1] = txt[j]
#         if two_geul[0] == '#':
#             if two_geul[1] != 'd':
#                 new_txt = new_txt[:-1]
#                 shap_flag = True
#         if shap_flag:
#             if two_geul[1] == '\n':
#                 new_txt += txt[j]
#                 shap_flag = False
#         else:
#             new_txt += txt[j]
#     val_copy.loc[i, 'code1'] = new_txt

In [37]:
# for i in tqdm(range(len(val_copy))):
#     txt = val_copy['code2'][i]
#     two_geul = [0, 0]
#     shap_flag = False
#     new_txt = ""
#     for j in range(len(txt)):
#         two_geul[0] = two_geul[1]
#         two_geul[1] = txt[j]
#         if two_geul[0] == '#':
#             if two_geul[1] != 'd':
#                 new_txt = new_txt[:-1]
#                 shap_flag = True
#         if shap_flag:
#             if two_geul[1] == '\n':
#                 new_txt += txt[j]
#                 shap_flag = False
#         else:
#             new_txt += txt[j]
#     val_copy.loc[i, 'code2'] = new_txt

### define도 제거

In [38]:
for i in tqdm(range(len(val_copy))):
    txt = val_copy['code1'][i]
    two_geul = [0, 0]
    shap_flag = False
    new_txt = ""
    for j in range(len(txt)):
        two_geul[0] = two_geul[1]
        two_geul[1] = txt[j]
        if two_geul[1] == '#':
            shap_flag = True
        if shap_flag:
            if two_geul[1] == '\n':
                new_txt += txt[j]
                shap_flag = False
        else:
            new_txt += txt[j]
    val_copy.loc[i, 'code1'] = new_txt

100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:25<00:00, 1545.97it/s]


In [39]:
for i in tqdm(range(len(val_copy))):
    txt = val_copy['code2'][i]
    two_geul = [0, 0]
    shap_flag = False
    new_txt = ""
    for j in range(len(txt)):
        two_geul[0] = two_geul[1]
        two_geul[1] = txt[j]
        if two_geul[1] == '#':
            shap_flag = True
        if shap_flag:
            if two_geul[1] == '\n':
                new_txt += txt[j]
                shap_flag = False
        else:
            new_txt += txt[j]
    val_copy.loc[i, 'code2'] = new_txt

100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:26<00:00, 1507.34it/s]


## 연속 줄바꿈 제거

In [40]:
for i in tqdm(range(len(val_copy))):
    txt = val_copy['code1'][i]
    two_geul = [0, 0]
    new_txt = ""
    for j in range(len(txt)):
        two_geul[0] = two_geul[1]
        two_geul[1] = txt[j]
        if two_geul == ['\n', '\n']:
            pass
        else:
            new_txt += txt[j]
    val_copy.loc[i, 'code1'] = new_txt

100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:24<00:00, 1619.07it/s]


In [41]:
for i in tqdm(range(len(val_copy))):
    txt = val_copy['code2'][i]
    two_geul = [0, 0]
    new_txt = ""
    for j in range(len(txt)):
        two_geul[0] = two_geul[1]
        two_geul[1] = txt[j]
        if two_geul == ['\n', '\n']:
            pass
        else:
            new_txt += txt[j]
    val_copy.loc[i, 'code2'] = new_txt

100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:24<00:00, 1634.22it/s]


# 데이터 나누기

In [49]:
val_copy.isnull().sum()

code1      0
code2      0
similar    0
dtype: int64

In [45]:
val_copy = val_copy.dropna()

In [43]:
val_copy.to_csv('train_4.csv', index=False)

In [2]:
val_copy_re = pd.read_csv("train_4.csv")

In [3]:
val_copy_re_500 = val_copy_re.loc[0:499]

In [6]:
val_copy_re_5000

Unnamed: 0,code1,code2,similar
0,\nusing namespace std;\nusing ll = long long;\...,"\ntemplate<typename T1, typename T2>\ninline v...",0
1,\nusing namespace std;\nint main(void) {\n in...,\nusing namespace std;\nint main(void)\n{\n ...,1
2,\nusing namespace std;\nconst static int MAX =...,\nusing namespace std;\nstatic const double EP...,1
3,"\nusing namespace std;\nll gcd(ll a, ll b) {\n...",\nusing namespace std;\nint main()\n{\n int a...,0
4,\ntypedef long long ll;\nusing namespace std;\...,\nusing namespace std;\ntypedef long long int ...,1
...,...,...,...
4995,\nusing namespace std;\nint main (){\n long l...,\nusing namespace std;\nint main()\n{\n\tstrin...,0
4996,\nusing namespace std;\nsigned main()\n{\n ch...,\nusing namespace std;\ntypedef long long ll;\...,0
4997,\nusing namespace std;\nconst int INF = 1e9 - ...,\nusing namespace std;\nconst int NMAX = 10000...,1
4998,\nusing namespace std;\nusing ll = long long;\...,"\nusing namespace std;\ntemplate <class T, cla...",1


In [54]:
val_copy_re.isnull().sum()

code1       8
code2      16
similar     0
dtype: int64

In [4]:
val_copy_re_1000.isnull().sum()

code1      0
code2      0
similar    0
dtype: int64

In [57]:
val_copy[val_copy_re.isnull().any(axis=1)]

Unnamed: 0,code1,code2,similar
3550,,\nusing namespace std;\nint main(){\n\tint n;\...,0
3818,\nusing namespace std;\nint main() {\n long l...,,1
4101,,\nusing namespace std;\ntypedef long long ll;\...,0
5785,\nusing namespace std;\nint mp[128];\nbool dic...,,1
5939,\nusing namespace std;\nint main()\n{\n\tint F...,,0
8811,\nusing namespace std;\nusing ll = long long;\...,,1
10323,,\nusing namespace std;\ntypedef long long ll;\...,1
11131,,\nusing namespace std;\ntypedef long long ll;\...,0
11883,\nusing namespace std;\ntypedef long long ll;\...,,0
12653,\nusing namespace std;\nsigned main()\n{\n\tio...,,0


In [3]:
val_copy = val_copy_re.dropna()

In [4]:
val_copy = val_copy_re_500.dropna()

In [5]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    val_copy,
    test_size = 0.2,
    random_state = 42,
    stratify = val_copy['similar']
)

train_df

Unnamed: 0,code1,code2,similar
192,\nusing namespace std;\nusing ll = long long;\...,\nusing namespace std;\ntypedef long long ll;\...,1
313,\nusing namespace std;\ntypedef long long ll;\...,\nusing namespace std;\nint CalcSumOfDigit(int...,1
25,\nusing namespace std;\nint main(){\n doubl...,\nusing namespace std;\ntypedef long long ll;\...,0
242,\nusing namespace std;\nusing ll = long long;\...,\nusing namespace std;\nusing ll = long long;\...,0
170,\nusing ll = long long;\nusing ull = unsigned ...,\nusing namespace std;\nusing ll = long long;\...,1
...,...,...,...
400,\nint INF = 1e9 + 7;\nunsigned NthDayOfWeekToD...,\ntypedef long long ll;\ntypedef long double l...,0
225,\nusing namespace std;\nusing ll = long long;\...,\ntypedef long long ll;\nusing namespace std;\...,1
384,\nusing ll=long long;\nconstexpr ll mod = 1e9 ...,\nusing namespace std;\nconst int mod = 100000...,1
61,"\nusing namespace std;\ntemplate<typename S, t...",\nusing namespace std;\nconst int maxn = 1e5 +...,0


In [6]:
train_df['similar'].value_counts()

similar
0    204
1    196
Name: count, dtype: int64

# 학습

## 코드 토큰화

In [11]:
# tokenizer = AutoTokenizer.from_pretrained('neulab/codebert-cpp')
# tokenizer.truncation_side = 'left'

In [62]:
train_df.isnull().sum()

code1      0
code2      0
similar    0
dtype: int64

In [63]:
# bm25_train_df = train_df.copy()
# bm25_test_df = test_df.copy()

In [64]:
# bm25_train_df['code1'] = train_df['code1'].apply(tokenizer.tokenize)
# bm25_test_df['code1'] = test_df['code1'].apply(tokenizer.tokenize)
# bm25_train_df['code2'] = train_df['code2'].apply(tokenizer.tokenize)
# bm25_test_df['code2'] = test_df['code2'].apply(tokenizer.tokenize)

# bm25_train_df

Token indices sequence length is longer than the specified maximum sequence length for this model (665 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,code1,code2,similar
36339,"[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, int, Ġmain,...","[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, string, Ġs,...",1
29692,"[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, using, Ġll,...","[Ċ, int, Ġmain, (){, Ċ, ĉ, Ċ, ĉ, int, ĠN, ;, Ċ...",1
33828,"[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, using, Ġll,...","[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, int, Ġmain,...",0
4271,"[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, int, Ġmain,...","[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, Ġ, Ċ, int, ...",1
28595,"[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, const, Ġint...","[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, using, Ġll,...",1
...,...,...,...
28159,"[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, template, <...","[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, ty, ped, ef...",1
8822,"[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, void, Ġsolv...","[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, const, Ġint...",0
755,"[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, ty, ped, ef...","[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, ty, ped, ef...",1
6609,"[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, ty, ped, ef...","[Ċ, using, Ġnamespace, Ġstd, ;, Ċ, ty, ped, ef...",1


In [65]:
# bm25_train_df.to_csv("./bm25_train_df.csv", index=False)
# bm25_test_df.to_csv("./bm25_test_df.csv", index=False)

In [None]:
# !pip install tensorflow

In [None]:
# !pip install rank_bm25
# !pip install datasets
# ! pip install -U accelerate
# ! pip install -U transformers

## 모델 준비

In [7]:
from glob import glob
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, AutoModel, AutoModelForSequenceClassification,DataCollatorForTokenClassification,EarlyStoppingCallback
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras_preprocessing.sequence import pad_sequences
from datasets import load_dataset, load_metric, Dataset

from tqdm import trange
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [8]:
device

'cuda'

In [68]:
# bm25_train_df = bm25_train_df.rename(columns={'similar': 'label'})
# bm25_test_df = bm25_test_df.rename(columns={'similar': 'label'})

In [69]:
# dataset_train = Dataset.from_pandas(bm25_train_df)
# dataset_test = Dataset.from_pandas(bm25_test_df)

In [9]:
#model_name = 'neulab/codebert-cpp'
model_name = 'microsoft/codereviewer'
wd = 0.01
batch_size = 16
lr = 2e-5
epochs = 1
task = 'binary_classification'
label_list = ['0', '1']
num_labels = 2

def seed_everything(seed:42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at microsoft/codereviewer and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
def tokenize_function(examples):
    tokenizer.truncation_side = 'left'
    return tokenizer(examples["code1"], examples["code2"],padding="max_length", max_length = 512, truncation=True)

In [11]:
dataset_train = Dataset.from_pandas(train_df)
dataset_test = Dataset.from_pandas(test_df)

In [12]:
tokenized_train_datasets= dataset_train.map(tokenize_function, batched=True)
tokenized_test_datasets = dataset_test.map(tokenize_function, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [13]:
args = TrainingArguments(
    output_dir = './open/codereviewer',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate= lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    seed = 42,
    weight_decay=wd,
    load_best_model_at_end=True,
    logging_dir = './open/codereviewer'
)


def compute_metrics(pred):
    labels = pred.label_ids
    preds = []
    for i in pred.predictions:
      preds.append(np.argmax(i, axis=1).flatten())
    print(preds)
    acc = accuracy_score(labels, preds[0])
    return {'accuracy': acc}

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_test_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()
#0.992000

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 9.81 GiB is allocated by PyTorch, and 88.24 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# VRAM 용량 문제로 노트북에서는 학습 불가. virtual memory 인식 못하는 중. runpod에서 학습합니다.

In [None]:
base_path = './open/codereviewer'

torch.save(model, os.path.join(base_path + '/model0324.pt'))  # 전체 모델 저장

In [None]:
#test를 위한 다시 시작

In [None]:
base_path = './open/codereviewer'
data_path = './open/dataframe/'

In [None]:
#model = AutoModelForSequenceClassification.from_pretrained(os.path.join(base_path, 'checkpoint-93750'))

model = torch.load(os.path.join(base_path, 'model0323.pt'))

tokenizer = AutoTokenizer.from_pretrained(model_name)

test_df = pd.read_csv(os.path.join(data_path,'test_df_2403062244.csv'))

In [None]:
test_df = test_df.drop(columns = ['pair_id', 'code1_tokenized', 'code2_tokenized'], axis = 1)

In [None]:
for i in range(len(test_df)):
  if i % 10000 == 0:
    print(i)
  test_df.iloc[i]['code1'] = data_clean(test_df.iloc[i]['code1'])
  test_df.iloc[i]['code2'] = data_clean(test_df.iloc[i]['code2'])

In [None]:
args = TrainingArguments(
    output_dir = './open',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate= lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    seed = 42,
    weight_decay=wd,
    load_best_model_at_end=True,
    logging_dir = './open'
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = []
    for i in pred.predictions:
      preds.append(np.argmax(i, axis=1).flatten())
    print(preds)
    acc = accuracy_score(labels, preds[0])
    return {'accuracy': acc}

trainer = Trainer(
    model,
    args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
predictions = []
for i in trange(len(test_df) // 5000):
  pred = []
  dataset_test = Dataset.from_pandas(test_df[i*5000:(i+1)*5000])
  tokenized_test_datasets = dataset_test.map(tokenize_function, batched=True)
  predictions_test = trainer.predict(tokenized_test_datasets)

  for j in predictions_test.predictions:
    pred.append(np.argmax(j, axis=1).flatten())

  predictions.extend(pred[0])

len(predictions)

In [None]:
#public 0.76592 (codebert - test전처리 전)
#public 0.81441 (codebert - test전처리 후)
#public xx (codereviewer - test전처리 후)

In [None]:


sub = pd.read_csv(os.path.join('./open', 'sample_submission.csv'))
sub['similar'] = predictions

sub.to_csv(os.path.join(base_path, 'submission_0323.csv'), index = False)

In [None]:
########

In [None]:
tokenized_val_datasets = dataset_val.map(tokenize_function, batched=True)

In [None]:
predictions_val = trainer.predict(tokenized_val_datasets)

In [None]:
preds = []
for i in predictions_val.predictions:
  preds.append(np.argmax(i, axis=1).flatten())

predict = preds[0]

In [None]:
score = 0
for i in range(len(predict)):
  if val_df.iloc[i]['label'] == predict[i]:
    score += 1

score / len(predict)