In [1]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# Location of the pretrained checkpoint (used for both the model and its tokenizer).
model_path = '/home/geum_bi/klue'

# Prefer the GPU when one is visible to PyTorch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Tokenizer is loaded from the same directory as the model weights.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Sequence-classification model configured for 7 output labels, moved to `device`.
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=7)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /home/geum_bi/klue and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pickle

# Pickled dataset to fine-tune on.
data_path = './data1.pkl'

# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(data_path, 'rb') as pkl_file:
    data = pickle.load(pkl_file)

In [3]:
from datasets import Dataset, DatasetDict

# Reshape the records into column-oriented form: item[0] becomes the text,
# item[1] is cast to int and becomes the label.
formatted_data = {'text': [item[0] for item in data], 'label': [int(item[1]) for item in data]}

# Convert to a Dataset object.
dataset = Dataset.from_dict(formatted_data)

# Hold out 20% of the examples as the test split. The seed pins the shuffle so the
# split (and the metrics computed on it) is the same after a kernel restart.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Collect both splits in a DatasetDict.
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

In [4]:
# Peek at the first training example to confirm the text/label layout.
first_train_example = dataset_dict["train"][0]
first_train_example

{'text': '상황을 봐서. 자꾸 내려가면 어쩔 수 없이 팔아야겠지. 일단은 기다려보고.', 'label': 3}

In [5]:
def tokenize_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level tokenizer.

    The tokenizer is called with padding="max_length" and truncation enabled,
    and its output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [6]:
# Tokenize every split, passing examples to tokenize_function in batches.
tokenized_datasets = dataset_dict.map(
    function=tokenize_function,
    batched=True,
)

Map:   0%|          | 0/15499 [00:00<?, ? examples/s]

Map:   0%|          | 0/3875 [00:00<?, ? examples/s]

In [7]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyperparameters.
# Checkpoints are written once per epoch and the best one (by eval accuracy) is
# reloaded into the model when training finishes, so the weights saved afterwards
# are the best-scoring ones rather than simply those of the final epoch.
training_args = TrainingArguments(
    output_dir='./results',                      # output directory (checkpoints, logs)
    evaluation_strategy="epoch",                 # evaluate at the end of every epoch
                                                 # NOTE: newer transformers releases call this `eval_strategy`
    save_strategy="epoch",                       # checkpoint every epoch; must match the evaluation strategy
    save_total_limit=2,                          # cap the number of checkpoints kept on disk
    load_best_model_at_end=True,                 # restore the best checkpoint after training
    metric_for_best_model="accuracy",            # key returned by compute_metrics; higher is better
    learning_rate=1e-5,                          # learning rate
    per_device_train_batch_size=16,              # training batch size
    per_device_eval_batch_size=16,               # evaluation batch size
    num_train_epochs=10,                         # number of training epochs
    weight_decay=0.01,                           # weight decay
    seed=42,                                     # random seed
    lr_scheduler_type="linear",                  # learning-rate scheduler type
    warmup_ratio=0.1,                            # warm-up ratio
    optim="adamw_torch",                         # optimizer (PyTorch AdamW)
)

In [8]:
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() method.
metric = evaluate.load(path="accuracy")

In [9]:
def compute_metrics(eval_pred):
    """Score an evaluation pass with the notebook-level `metric`.

    `eval_pred` is unpacked into (logits, labels). The predicted class for each
    row is the argmax over the last axis of the logits; the result of
    `metric.compute` on those predictions and the labels is returned.
    """
    scores, gold_labels = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    result = metric.compute(references=gold_labels, predictions=predicted_ids)
    return result

In [10]:
# Gather the Trainer inputs: the model, its hyperparameters, the tokenized
# train/test splits, and the metric callback used during evaluation.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [11]:
# Run fine-tuning and display the returned training summary.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy
1,1.5555,0.326708,0.890065
2,0.2724,0.279741,0.914581
3,0.162,0.322084,0.912516
4,0.1073,0.304456,0.924903
5,0.0854,0.368194,0.923355
6,0.0702,0.37537,0.925161
7,0.0514,0.393696,0.925935
8,0.0468,0.435671,0.923355
9,0.0341,0.43993,0.926194
10,0.0257,0.446312,0.924645


TrainOutput(global_step=9690, training_loss=0.19061983465779309, metrics={'train_runtime': 9018.1162, 'train_samples_per_second': 17.187, 'train_steps_per_second': 1.075, 'total_flos': 4.0781413187328e+16, 'train_loss': 0.19061983465779309, 'epoch': 10.0})

In [12]:
# Write the fine-tuned model and its tokenizer into the same directory.
final_model_dir = './final_model'
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.txt',
 './final_model/added_tokens.json',
 './final_model/tokenizer.json')