In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import json
from sklearn.metrics import classification_report
import glob
import pandas as pd

In [None]:

# Dataset class definition
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one labelled sentence per item.

    Every record must expose the sentence under the key '문장' and the class
    label under the key '종교'.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset``; with the bare name, re-running the cell
    would subclass the previous definition of this class instead of torch's.

    Args:
        data: Indexable collection of records. Either a sequence of mappings
            or a pandas DataFrame holding one record per row.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns a mapping containing 'input_ids'
            and 'attention_mask' tensors.
        max_len: Length every encoded sentence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded sentence and label for the record at ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (1-D tensors produced
            by flattening the tokenizer output) and 'labels' (a scalar
            ``torch.long`` tensor).
        """
        # DataFrame[idx] with an integer is a *column* lookup, so rows of a
        # DataFrame have to be fetched positionally through .iloc.
        if hasattr(self.data, 'iloc'):
            item = self.data.iloc[idx]
        else:
            item = self.data[idx]
        text = item['문장']
        label = item['종교']

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten() drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Data loading helper
def load_data(file_path):
    """Read the JSON document stored at ``file_path`` and return it.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).
    """
    # 'utf-8-sig' decodes plain UTF-8 and additionally strips a leading
    # byte-order mark, which json.load would otherwise reject. This is the
    # same codec the notebook's later load_data definition uses.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return json.load(f)

# Data loader factory
def create_data_loader(data, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Wrap ``data`` in a ``Dataset`` and return a ``DataLoader`` over it.

    Args:
        data: Records forwarded to ``Dataset``.
        tokenizer: Tokenizer forwarded to ``Dataset``.
        max_len: Maximum sequence length forwarded to ``Dataset``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles samples every epoch. Defaults
            to False, which is what the loader did before this parameter
            existed.
        num_workers: Number of loader worker processes. Defaults to 4, the
            value that used to be hard-coded.

    Returns:
        A ``DataLoader`` over the newly built ``Dataset``.
    """
    ds = Dataset(data, tokenizer, max_len)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

In [None]:
# Initialise the tokenizer and the classification model.
# Both are loaded from one constant so they cannot drift to different checkpoints.
MODEL_NAME = "beomi/KcELECTRA-base"
NUM_LABELS = 2  # size of the classification head

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

In [None]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise,
# then move the model onto it.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Load data
def load_data(file_path):
    """Decode the JSON file at ``file_path`` and return its contents.

    The file is opened for reading with the 'utf-8-sig' codec, so a leading
    UTF-8 byte-order mark is skipped when present.
    """
    with open(file_path, encoding='utf-8-sig') as handle:
        return json.load(handle)

# Build the train / validation splits from the per-file JSON records.
# glob() returns paths in arbitrary order, so the directory is listed once and
# sorted; slicing two separate unordered listings would make the splits
# irreproducible and could let them overlap.
dataset_paths = sorted(glob.glob('../dataset/*.json'))
pathtrain = dataset_paths[:10000]
# Starts at 10000 (not 10001) so that the file at index 10000 is not silently dropped.
pathvalid = dataset_paths[10000:12000]

# Validation files go into val_data; appending them to train_data would leave
# the validation set empty and leak those files into training.
# NOTE(review): both results are DataFrames, so any consumer that needs a row
# by position must use .iloc[idx] -- frame[idx] is a column lookup.
train_data = pd.DataFrame([load_data(path) for path in pathtrain])
val_data = pd.DataFrame([load_data(path) for path in pathvalid])

In [None]:
# Build one data loader per split; both use the same sequence length and batch size.
train_data_loader = create_data_loader(
    train_data,
    tokenizer,
    max_len=128,
    batch_size=16,
)
val_data_loader = create_data_loader(
    val_data,
    tokenizer,
    max_len=128,
    batch_size=16,
)

In [None]:
# Trainer configuration
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    # These batch sizes are the ones the Trainer uses; only the .dataset of
    # each DataLoader is handed to it below, so the loaders' own batch_size
    # does not apply.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    # Evaluation runs once per epoch. The former eval_steps=500 was dropped
    # because it only takes effect with a step-based evaluation strategy.
    # NOTE(review): newer transformers releases rename this argument to
    # eval_strategy -- confirm against the installed version.
    evaluation_strategy="epoch",
    # Mixed precision needs a CUDA device; enabling it unconditionally breaks
    # the CPU fallback this notebook otherwise supports.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_loader.dataset,
    eval_dataset=val_data_loader.dataset
)

In [None]:
# Train the model. The bare call is the last expression of the cell, so the
# notebook displays whatever trainer.train() returns.
trainer.train()

In [None]:
# Save the model to ./saved_model.
# NOTE(review): the Trainer in this notebook is constructed without a
# tokenizer, so presumably only the model files are written here -- confirm
# whether the tokenizer should be saved alongside them.
trainer.save_model('./saved_model')

In [None]:
# Model validation
# trainer.predict returns NumPy arrays (logits and label ids), while
# torch.argmax only accepts tensors, so the logits are converted first and
# then reduced over the class axis to obtain predicted class indices.
predictions, labels, _ = trainer.predict(val_data_loader.dataset)
predictions = torch.argmax(torch.as_tensor(predictions), dim=1)

# Per-class precision / recall / F1 on the validation split.
print(classification_report(labels, predictions.numpy()))