In [1]:
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, AutoTokenizer, AutoConfig
from transformers.optimization import get_linear_schedule_with_warmup
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


MODEL_SAVE_PATH = "best_kobert_sentiment_model.pt" 
CHECKPOINT_PATH = "kobert_sentiment_checkpoint.pt"
dataFilePath = 'datasets/'
saveFilePath = 'saves/'
MODEL_NAME = "skt/kobert-base-v1"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
MAX_LEN = 128
BATCH_SIZE = 32

In [2]:
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        print(f"SentimentDataset initialized with {len(texts)} texts and {len(labels)} labels.")
        # self.labels_count는 main 함수에서 이 객체 생성 후 직접 할당됩니다.
        # 예: train_dataset.labels_count = num_labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        if text.lower() == 'nan' or pd.isna(text):
            text = ""

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        current_token_type_ids = encoding['token_type_ids'].flatten()

        current_token_type_ids = torch.clamp(current_token_type_ids, 0, 1)


        if hasattr(self, 'labels_count') and not (0 <= label < self.labels_count):
             raise ValueError(f"Label {label} is out of bounds [0, {self.labels_count-1}] at item {item} for text: '{text}'")


        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': current_token_type_ids, # flatten된 텐서 사용
            'labels': torch.tensor(label, dtype=torch.long)
        }

class CustomBertForSequenceClassification(BertForSequenceClassification):
    def __init__(self, config, class_weights=None):
        super().__init__(config)
        if class_weights is not None:
            self.loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        else:
            self.loss_fct = nn.CrossEntropyLoss() # 가중치가 없으면 일반 CrossEntropyLoss 사용

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

def train_model(model, data_loader, optimizer, scheduler, device, scaler=None):
    model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(tqdm(data_loader, desc="Training")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        if scaler:
            with torch.amp.autocast('cuda'): 
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    labels=labels
                )
                loss = outputs.loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scheduler.step()
        total_loss += loss.item()

    return total_loss / len(data_loader)

def evaluate_model(model, data_loader, device):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(data_loader, desc="Evaluating")):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )
            _, preds = torch.max(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    return accuracy_score(true_labels, predictions), f1_score(true_labels, predictions, average='weighted')

def save_checkpoint(epoch, model, optimizer, scheduler, scaler, best_f1_score, id_to_label, unique_labels, patience_counter, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'scaler_state_dict': scaler.state_dict() if scaler else None,
        'best_f1_score': best_f1_score,
        'id_to_label': id_to_label,
        'unique_labels': unique_labels,
        'model_config': model.config.to_dict(),
        'patience_counter': patience_counter # <--- 이 부분 추가
    }, path)
    print(f"체크포인트가 '{path}'에 성공적으로 저장되었습니다.")

def load_checkpoint(path, device, class_weights=None):
    if not os.path.exists(path):
        return False, None

    checkpoint = torch.load(path, map_location=device)

    config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=len(checkpoint['unique_labels']))
    model = CustomBertForSequenceClassification(config, class_weights=class_weights)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    scaler_state_dict = checkpoint.get('scaler_state_dict', None) 

    start_epoch = checkpoint['epoch'] + 1
    best_f1_score = checkpoint['best_f1_score']
    id_to_label = checkpoint['id_to_label']
    unique_labels = checkpoint['unique_labels']
    patience_counter = checkpoint.get('patience_counter', 0) # <--- 이 부분 추가 (기본값 0)

    print(f"체크포인트가 '{path}'에서 성공적으로 로드되었습니다. 학습을 에폭 {start_epoch}부터 재개합니다.")
    
    loaded_data = {
        'start_epoch': start_epoch,
        'model': model,
        'optimizer': optimizer,
        'scaler_state_dict': scaler_state_dict,
        'best_f1_score': best_f1_score,
        'id_to_label': id_to_label,
        'unique_labels': unique_labels,
        'scheduler_state_dict': checkpoint['scheduler_state_dict'],
        'patience_counter': patience_counter # <--- 이 부분 추가
    }
    return True, loaded_data

In [3]:
df = pd.read_csv(f'{dataFilePath}sentiment_data.csv',index_col=0)
df.head()

Unnamed: 0,발화,감정,str_len
0,언니 동생으로 부르는게 맞는 일인가요..??,불안,24
1,그냥 내 느낌일뿐겠지?,불안,12
2,아직너무초기라서 그런거죠?,불안,14
3,유치원버스 사고 낫다던데,불안,13
4,근데 원래이런거맞나요,불안,11


In [4]:
df.loc[(df['감정'] == '불안'),'감정'] = 0
df.loc[(df['감정'] == '당황'),'감정'] = 1
df.loc[(df['감정'] == '분노'),'감정'] = 2
df.loc[(df['감정'] == '슬픔'),'감정'] = 3
df.loc[(df['감정'] == '중립'),'감정'] = 4
df.loc[(df['감정'] == '행복'),'감정'] = 5
df.loc[(df['감정'] == '혐오'),'감정'] = 6

In [5]:
texts = df['발화'].tolist()
labels = df['감정'].tolist()

unique_labels = sorted(list(set(labels)))
label_to_id = {label: i for i, label in enumerate(unique_labels)}
id_to_label = {i: label for label, i in label_to_id.items()}

print(f"감정 라벨 매핑: {label_to_id}")
numeric_labels = [label_to_id[label] for label in labels]

num_labels = len(unique_labels)
print(f"총 감정 클래스 수: {num_labels}")

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(numeric_labels), y=numeric_labels)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

# 모든 numeric_labels가 0부터 num_labels-1 사이에 있는지 최종 확인
if not all(0 <= l < num_labels for l in numeric_labels):
    print("오류: numeric_labels에 num_labels 범위를 벗어나는 값이 있습니다. 데이터와 매핑을 확인하세요.")
    problematic_labels = [l for l in numeric_labels if not (0 <= l < num_labels)]
    print(f"문제되는 라벨 값들: {set(problematic_labels)}")
    exit() # 중요한 오류이므로 바로 종료

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, numeric_labels, test_size=0.2, random_state=42, stratify=numeric_labels)
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, MAX_LEN)

max_cpu_cores = os.cpu_count()
print(f"시스템의 CPU 코어 수: {max_cpu_cores}")

recommended_num_workers = min(4, max_cpu_cores if max_cpu_cores else 0)

print(f"권장 num_workers 시작 값: {recommended_num_workers}")

train_data_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0, # 여기에 설정
    pin_memory=True # GPU로 데이터를 더 빠르게 전송
)

val_data_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    num_workers=0, # 여기에 설정
    pin_memory=True
)

감정 라벨 매핑: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
총 감정 클래스 수: 7
SentimentDataset initialized with 115947 texts and 115947 labels.
SentimentDataset initialized with 28987 texts and 28987 labels.
시스템의 CPU 코어 수: 20
권장 num_workers 시작 값: 4


In [6]:
class_weights_tensor = class_weights_tensor.to(device)
# 학습 관련 변수들을 기본값으로 초기화
PATIENCE = 5
NUM_EPOCHS = 30
start_epoch = 0
best_f1_score = -1.0
LEARNING_RATE = 2e-6
model = None
optimizer = None
scaler = None
scheduler = None # 스케줄러도 초기화
# 체크포인트 로드 시도
# load_checkpoint의 첫 번째 반환 값(성공 여부)을 확인
load_success, loaded_data = load_checkpoint(CHECKPOINT_PATH, device, class_weights=class_weights_tensor)

if load_success: # 체크포인트 로드에 성공한 경우
    start_epoch = loaded_data['start_epoch']
    model = loaded_data['model']
    optimizer = loaded_data['optimizer']
    best_f1_score = loaded_data['best_f1_score']
    id_to_label = loaded_data['id_to_label']
    unique_labels = loaded_data['unique_labels']
    num_labels = len(unique_labels)
    patience_counter = loaded_data['patience_counter'] # <--- 이 부분 추가
    # AMP Scaler 재구성
    if loaded_data['scaler_state_dict'] and str(device) == 'cuda':
        scaler = GradScaler()
        scaler.load_state_dict(loaded_data['scaler_state_dict'])
    elif str(device) == 'cuda': 
        scaler = GradScaler()
    else: 
        scaler = None
    print(f"학습을 에폭 {start_epoch}부터 재개합니다. (이전 최고 F1-Score: {best_f1_score:.4f}, 인내 카운터: {patience_counter})")
else: # 체크포인트 로드에 실패한 경우 (파일 없음) -> 처음부터 학습 시작
    print(f"체크포인트 파일 '{CHECKPOINT_PATH}'을 찾을 수 없습니다. 처음부터 학습을 시작합니다.")
    # 모델 초기화
    try:
        model = CustomBertForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=num_labels, class_weights=class_weights_tensor)
    except TypeError: # 이전 버전 transformers 호환성
        config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=num_labels)
        model = CustomBertForSequenceClassification(config, class_weights=class_weights_tensor)
        pretrained_model_state_dict = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels).state_dict()
        model.load_state_dict(pretrained_model_state_dict, strict=False)
    model.to(device)
    
    # 옵티마이저 생성
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    # 스케일러 생성
    if str(device) == 'cuda':
        scaler = GradScaler()
    else:
        scaler = None
        
# 스케줄러 재구성/초기화 (옵티마이저가 설정된 후에 수행되어야 함)
total_steps = len(train_data_loader) * NUM_EPOCHS
num_warmup_steps = int(total_steps * 0.1)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps
)
# 로드된 스케줄러 상태가 있다면 로드
if load_success and loaded_data['scheduler_state_dict'] is not None:
    scheduler.load_state_dict(loaded_data['scheduler_state_dict'])

체크포인트가 'kobert_sentiment_checkpoint.pt'에서 성공적으로 로드되었습니다. 학습을 에폭 23부터 재개합니다.
학습을 에폭 23부터 재개합니다. (이전 최고 F1-Score: 0.5322, 인내 카운터: 5)


  scaler = GradScaler()


In [7]:
print("KoBERT 모델 학습 시작...")
for epoch in range(start_epoch, NUM_EPOCHS):
    print(f'--- Epoch {epoch + 1}/{NUM_EPOCHS} ---')
    train_loss = train_model(model, train_data_loader, optimizer, scheduler, device, scaler)
    print(f'  Train Loss: {train_loss:.4f}')
    val_accuracy, val_f1 = evaluate_model(model, val_data_loader, device)
    print(f'  Validation Accuracy: {val_accuracy:.4f}, F1-Score: {val_f1:.4f}')
    
    if val_f1 > best_f1_score:
        best_f1_score = val_f1
        patience_counter = 0 # <--- F1-Score 개선 시 카운터 초기화
        save_checkpoint(epoch, model, optimizer, scheduler, scaler, best_f1_score, id_to_label, unique_labels, patience_counter, MODEL_SAVE_PATH) # <--- patience_counter 인자 추가
        print(f"  새로운 최고 F1-Score ({best_f1_score:.4f}) 달성, 모델 저장됨.")
    else:
        patience_counter += 1 # <--- F1-Score 개선 없을 시 카운터 증가
        print(f"  F1-Score 개선 없음. 현재 최고 F1-Score: {best_f1_score:.4f} (인내 카운터: {patience_counter}/{PATIENCE})")
    save_checkpoint(epoch, model, optimizer, scheduler, scaler, best_f1_score, id_to_label, unique_labels, patience_counter, CHECKPOINT_PATH) # <--- patience_counter 인자 추가
    # 조기 종료 조건 확인
    if patience_counter >= PATIENCE: # <--- 이 부분 추가: 인내심 한계 도달 시 학습 중단
        print(f"검증 F1-Score가 {PATIENCE} 에폭 동안 개선되지 않아 학습을 조기 종료합니다.")
        break 
print("KoBERT 모델 학습 완료!")

KoBERT 모델 학습 시작...
--- Epoch 24/30 ---


Training:   0%|          | 0/3624 [00:00<?, ?it/s]

  Train Loss: 1.5021


Evaluating:   0%|          | 0/906 [00:00<?, ?it/s]

  Validation Accuracy: 0.4662, F1-Score: 0.4707
  F1-Score 개선 없음. 현재 최고 F1-Score: 0.5322 (인내 카운터: 6/5)
체크포인트가 'kobert_sentiment_checkpoint.pt'에 성공적으로 저장되었습니다.
검증 F1-Score가 5 에폭 동안 개선되지 않아 학습을 조기 종료합니다.
KoBERT 모델 학습 완료!
