In [1]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, AutoTokenizer
from transformers.optimization import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.optim import AdamW
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

dataFilePath = 'datasets/'
saveFilePath = 'saves/'
MODEL_NAME = "skt/kobert-base-v1"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
MAX_LEN = 64
BATCH_SIZE = 64

In [2]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        print(f"SentimentDataset initialized with {len(texts)} texts and {len(labels)} labels.")
        # self.labels_count는 main 함수에서 이 객체 생성 후 직접 할당됩니다.
        # 예: train_dataset.labels_count = num_labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        if text.lower() == 'nan' or pd.isna(text):
            text = ""

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        # --- START: 수정 및 추가 부분 ---
        current_token_type_ids = encoding['token_type_ids'].flatten()

        # --- START: 이 부분을 수정합니다 ---
        # 이전 경고 메시지 출력 부분은 이제 필요 없을 수 있지만,
        # 클리핑이 제대로 되는지 확인하기 위해 잠깐 유지할 수도 있습니다.
        # if current_token_type_ids.max() > 1 or current_token_type_ids.min() < 0:
        #     print(f"\nWARNING: token_type_ids at item {item} contains values outside [0, 1]. Max: {current_token_type_ids.max().item()}, Min: {current_token_type_ids.min().item()}")

        # 핵심 수정: token_type_ids 값을 0 또는 1로 강제로 클리핑
        current_token_type_ids = torch.clamp(current_token_type_ids, 0, 1)
        # --- END: 이 부분을 수정합니다 ---

        if hasattr(self, 'labels_count') and not (0 <= label < self.labels_count):
             raise ValueError(f"Label {label} is out of bounds [0, {self.labels_count-1}] at item {item} for text: '{text}'")


        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': current_token_type_ids, # flatten된 텐서 사용
            'labels': torch.tensor(label, dtype=torch.long)
        }

def train_model(model, data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(data_loader,desc="Training")
    for batch_idx, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        try:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            progress_bar.set_postfix({'train_loss': f"{(total_loss/(batch_idx+1)):.4f}"})
        except RuntimeError as e:
            print(f"\nRuntimeError during training at batch {batch_idx}: {e}")
            print(f"Problematic batch labels: {labels.cpu().numpy()}")
            print(f"Problematic batch input_ids min/max: {input_ids.min().item()}/{input_ids.max().item()}")
            print(f"Problematic batch token_type_ids min/max: {token_type_ids.min().item()}/{token_type_ids.max().item()}")
            # --- START: 추가 부분 ---
            print(f"Problematic batch token_type_ids unique values: {torch.unique(token_type_ids).cpu().numpy()}")
            # --- END: 추가 부분 ---
            raise e

    return total_loss / len(data_loader)

def evaluate_model(model, data_loader, device):
    model.eval()
    predictions = []
    true_labels = []
    progress_bar = tqdm(data_loader, desc="Evaluating")
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(progress_bar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            try:
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids
                )
                _, preds = torch.max(outputs.logits, dim=1)
                predictions.extend(preds.cpu().numpy())
                true_labels.extend(labels.cpu().numpy())
            except RuntimeError as e:
                print(f"\nRuntimeError during evaluation at batch {batch_idx}: {e}")
                print(f"Problematic batch labels: {labels.cpu().numpy()}")
                print(f"Problematic batch input_ids min/max: {input_ids.min().item()}/{input_ids.max().item()}")
                print(f"Problematic batch token_type_ids min/max: {token_type_ids.min().item()}/{token_type_ids.max().item()}")
                # --- START: 추가 부분 ---
                print(f"Problematic batch token_type_ids unique values: {torch.unique(token_type_ids).cpu().numpy()}")
                # --- END: 추가 부분 ---
                raise e
    return accuracy_score(true_labels, predictions), f1_score(true_labels, predictions, average='weighted')

In [3]:
df = pd.read_csv(f'{dataFilePath}sentiment_data.csv',index_col=0)
df.head()

Unnamed: 0,발화,감정,str_len
0,언니 동생으로 부르는게 맞는 일인가요..??,불안,24
1,그냥 내 느낌일뿐겠지?,불안,12
2,아직너무초기라서 그런거죠?,불안,14
3,유치원버스 사고 낫다던데,불안,13
4,근데 원래이런거맞나요,불안,11


In [4]:
df.loc[(df['감정'] == '불안'),'감정'] = 0
df.loc[(df['감정'] == '당황'),'감정'] = 1
df.loc[(df['감정'] == '분노'),'감정'] = 2
df.loc[(df['감정'] == '슬픔'),'감정'] = 3
df.loc[(df['감정'] == '중립'),'감정'] = 4
df.loc[(df['감정'] == '행복'),'감정'] = 5
df.loc[(df['감정'] == '혐오'),'감정'] = 6

In [5]:
df.head()

Unnamed: 0,발화,감정,str_len
0,언니 동생으로 부르는게 맞는 일인가요..??,0,24
1,그냥 내 느낌일뿐겠지?,0,12
2,아직너무초기라서 그런거죠?,0,14
3,유치원버스 사고 낫다던데,0,13
4,근데 원래이런거맞나요,0,11


In [6]:
texts = df['발화'].tolist()
labels = df['감정'].tolist()

unique_labels = sorted(list(set(labels)))
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for label, i in label_to_id.items()}

print(f"감정 라벨 매핑: {label_to_id}")
numeric_labels = [label_to_id[label] for label in labels]

num_labels = len(unique_labels)
print(f"총 감정 클래스 수: {num_labels}")

# 모든 numeric_labels가 0부터 num_labels-1 사이에 있는지 최종 확인
if not all(0 <= l < num_labels for l in numeric_labels):
    print("오류: numeric_labels에 num_labels 범위를 벗어나는 값이 있습니다. 데이터와 매핑을 확인하세요.")
    problematic_labels = [l for l in numeric_labels if not (0 <= l < num_labels)]
    print(f"문제되는 라벨 값들: {set(problematic_labels)}")
    exit() # 중요한 오류이므로 바로 종료

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, numeric_labels, test_size=0.2, random_state=42, stratify=numeric_labels)
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, MAX_LEN)

train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

감정 라벨 매핑: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
총 감정 클래스 수: 7
SentimentDataset initialized with 117078 texts and 117078 labels.
SentimentDataset initialized with 29270 texts and 29270 labels.


In [7]:
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [8]:
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5

# 옵티마이저 및 스케줄러 설정
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# 총 학습 스텝 수 계산
total_steps = len(train_data_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0, # 워밍업 스텝 (0으로 설정)
    num_training_steps=total_steps
)

# 2. 모델 학습 (Fine-tuning)
print("KoBERT 모델 학습 시작...")
for epoch in range(NUM_EPOCHS):
    print(f'--- Epoch {epoch + 1}/{NUM_EPOCHS} ---')
    train_loss = train_model(model, train_data_loader, optimizer, scheduler, device)
    print(f'  Train Loss: {train_loss:.4f}')

    # 검증 및 평가
    val_accuracy, val_f1 = evaluate_model(model, val_data_loader, device)
    print(f'  Validation Accuracy: {val_accuracy:.4f}, F1-Score: {val_f1:.4f}')
print("KoBERT 모델 학습 완료!")


KoBERT 모델 학습 시작...
--- Epoch 1/3 ---


Training:   0%|          | 0/1830 [00:00<?, ?it/s]

  Train Loss: 1.3820


Evaluating:   0%|          | 0/458 [00:00<?, ?it/s]

  Validation Accuracy: 0.4878, F1-Score: 0.4210
--- Epoch 2/3 ---


Training:   0%|          | 0/1830 [00:00<?, ?it/s]

  Train Loss: 1.3001


Evaluating:   0%|          | 0/458 [00:00<?, ?it/s]

  Validation Accuracy: 0.5084, F1-Score: 0.4432
--- Epoch 3/3 ---


Training:   0%|          | 0/1830 [00:00<?, ?it/s]

KeyboardInterrupt: 