## 1. 라이브러리 불러오기

In [1]:
import torch
import torchvision
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import json
from sklearn.metrics import classification_report
import glob
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## 2. 학습 코드

In [3]:
# 데이터셋 클래스 정의
class Dataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.loc[idx]
        text = item['문장']
        label = item['종교']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [4]:
# 데이터 로드 함수
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)
    return data

# 병렬로 데이터셋 불러오기
def load_dataset(paths):
    data = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(load_data, p) for p in paths]
        for future in tqdm(as_completed(futures), total=len(futures)):
            data.append(future.result())
    return pd.DataFrame(data)

# 데이터로더 함수
def create_data_loader(data, tokenizer, max_len, batch_size):
    ds = Dataset(data, tokenizer, max_len)
    return DataLoader(ds, batch_size=batch_size, num_workers=4)

In [5]:
# 모델 및 토크나이저 초기화
tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
model = AutoModelForSequenceClassification.from_pretrained("beomi/KcELECTRA-base", num_labels=2)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# CUDA 디바이스 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cuda')

In [7]:
# 데이터 로드 밑 전처리
pathtrain = glob.glob('../dataset/*.json')[:79999]
pathval = glob.glob('../dataset/*.json')[80000:]

train_data = load_dataset(pathtrain)
val_data = load_dataset(pathval)

train_data = train_data[['종교', '문장']].reset_index(drop=True)
val_data = val_data[['종교', '문장']].reset_index(drop=True)

train_data.loc[train_data['종교'] > 1, '종교'] = 1
val_data.loc[val_data['종교'] > 1, '종교'] = 1

100%|██████████| 79999/79999 [00:01<00:00, 59098.11it/s] 
100%|██████████| 20000/20000 [00:00<00:00, 27086.64it/s] 


In [8]:
# 데이터로더 생성
train_data_loader = create_data_loader(train_data, tokenizer, max_len=128, batch_size=16)
val_data_loader = create_data_loader(val_data, tokenizer, max_len=128, batch_size=16)

In [9]:
# 트레이너 설정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    eval_steps=500,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_loader.dataset,
    eval_dataset=val_data_loader.dataset
)

In [10]:
# 모델 학습
trainer.train()

  0%|          | 0/7500 [00:00<?, ?it/s]

{'loss': 0.7496, 'grad_norm': 4.752614498138428, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}
{'loss': 0.7158, 'grad_norm': 4.315892219543457, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 0.6469, 'grad_norm': 3.537320137023926, 'learning_rate': 3e-06, 'epoch': 0.01}
{'loss': 0.5795, 'grad_norm': 2.918511390686035, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.02}
{'loss': 0.4971, 'grad_norm': 2.6368844509124756, 'learning_rate': 5e-06, 'epoch': 0.02}
{'loss': 0.4088, 'grad_norm': 2.280935287475586, 'learning_rate': 6e-06, 'epoch': 0.02}
{'loss': 0.3252, 'grad_norm': 1.6221752166748047, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.03}
{'loss': 0.2274, 'grad_norm': 1.453083872795105, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.03}
{'loss': 0.1747, 'grad_norm': 1.1064348220825195, 'learning_rate': 9e-06, 'epoch': 0.04}
{'loss': 0.1749, 'grad_norm': 0.4324991703033447, 'learning_rate': 9.900000000000002e-06, 'epoch': 0.04}
{'loss': 0.0701, 'g

  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.027928220108151436, 'eval_runtime': 41.9009, 'eval_samples_per_second': 477.317, 'eval_steps_per_second': 14.916, 'epoch': 1.0}
{'loss': 0.0429, 'grad_norm': 0.235475093126297, 'learning_rate': 3.5671428571428574e-05, 'epoch': 1.0}
{'loss': 0.0374, 'grad_norm': 1.7686628103256226, 'learning_rate': 3.56e-05, 'epoch': 1.01}
{'loss': 0.0497, 'grad_norm': 0.13819831609725952, 'learning_rate': 3.552857142857143e-05, 'epoch': 1.01}
{'loss': 0.0873, 'grad_norm': 0.23657521605491638, 'learning_rate': 3.545714285714286e-05, 'epoch': 1.02}
{'loss': 0.0337, 'grad_norm': 0.5048577189445496, 'learning_rate': 3.538571428571429e-05, 'epoch': 1.02}
{'loss': 0.0432, 'grad_norm': 0.10266518592834473, 'learning_rate': 3.5314285714285714e-05, 'epoch': 1.02}
{'loss': 0.0247, 'grad_norm': 0.2264554798603058, 'learning_rate': 3.5242857142857145e-05, 'epoch': 1.03}
{'loss': 0.0128, 'grad_norm': 0.12377672642469406, 'learning_rate': 3.517142857142857e-05, 'epoch': 1.03}
{'loss': 0.012, 'grad_no

  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.027531713247299194, 'eval_runtime': 41.8869, 'eval_samples_per_second': 477.476, 'eval_steps_per_second': 14.921, 'epoch': 2.0}
{'loss': 0.025, 'grad_norm': 0.07359577715396881, 'learning_rate': 1.7821428571428574e-05, 'epoch': 2.0}
{'loss': 0.0135, 'grad_norm': 0.13018742203712463, 'learning_rate': 1.775e-05, 'epoch': 2.01}
{'loss': 0.0563, 'grad_norm': 0.29138273000717163, 'learning_rate': 1.7678571428571432e-05, 'epoch': 2.01}
{'loss': 0.0268, 'grad_norm': 0.0666225478053093, 'learning_rate': 1.760714285714286e-05, 'epoch': 2.02}
{'loss': 0.0496, 'grad_norm': 0.5357342958450317, 'learning_rate': 1.7535714285714287e-05, 'epoch': 2.02}
{'loss': 0.0391, 'grad_norm': 0.07121506333351135, 'learning_rate': 1.7464285714285717e-05, 'epoch': 2.02}
{'loss': 0.0091, 'grad_norm': 0.06546091288328171, 'learning_rate': 1.7392857142857145e-05, 'epoch': 2.03}
{'loss': 0.076, 'grad_norm': 0.11804705113172531, 'learning_rate': 1.7321428571428572e-05, 'epoch': 2.03}
{'loss': 0.0679, 'g

  0%|          | 0/625 [00:00<?, ?it/s]

{'eval_loss': 0.024882493540644646, 'eval_runtime': 41.1279, 'eval_samples_per_second': 486.288, 'eval_steps_per_second': 15.196, 'epoch': 3.0}
{'train_runtime': 1734.2261, 'train_samples_per_second': 138.389, 'train_steps_per_second': 4.325, 'train_loss': 0.050568756240606305, 'epoch': 3.0}


TrainOutput(global_step=7500, training_loss=0.050568756240606305, metrics={'train_runtime': 1734.2261, 'train_samples_per_second': 138.389, 'train_steps_per_second': 4.325, 'total_flos': 1.578646598830848e+16, 'train_loss': 0.050568756240606305, 'epoch': 3.0})

In [11]:
# 모델 저장
trainer.save_model('./saved_model')