In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertModel
import numpy as np

# Select the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
def load_and_preprocess_data(main_file_path='/local_datasets/AACommu/test/train_data_cafe_5000.json'):
    """Load the single JSON dataset and return a frame with ``text`` and ``label`` columns.

    The JSON file is read with ``pd.read_json``. Its ``input_text`` column becomes
    ``text``; ``label`` is a placeholder that is 1 for rows whose index value is
    even and 0 otherwise.

    If the file does not exist, a warning is printed and a small built-in sample
    is returned instead so the rest of the notebook can still be exercised.

    Args:
        main_file_path: Path of the JSON file to read. Defaults to the original
            hard-coded dataset location.

    Returns:
        pandas.DataFrame with exactly the columns ``text`` and ``label``.

    Raises:
        ValueError: If the JSON file has no ``input_text`` column, or the
            resulting frame lacks ``text``/``label``.
    """
    # 1. Load the main data (JSON).
    try:
        df_raw = pd.read_json(main_file_path)
    except FileNotFoundError:
        # File is missing: fall back to example data.
        print(f"⚠️ Warning: {main_file_path} not found. Using dummy data.")
        sample_texts = ["JSON 파일 하나만 사용합니다.", "오늘의 학습 주제는 BERT입니다.", "데이터셋을 단일 파일로 구성했습니다."]
        sample_labels = [0, 1, 0]
        # Repeat the sample so every class has several rows; a stratified
        # train/validation split cannot be made from a class with one member.
        repeats = 4
        df_combined = pd.DataFrame({
            'text': sample_texts * repeats,
            'label': sample_labels * repeats
        })
    else:
        # Validate here, where it can actually fail, instead of letting a
        # missing column surface as a bare KeyError.
        if 'input_text' not in df_raw.columns:
            raise ValueError(
                f"{main_file_path} must contain an 'input_text' column; "
                f"found columns: {list(df_raw.columns)}"
            )
        df_combined = pd.DataFrame({
            'text': df_raw['input_text'],
            # Placeholder labels: 1 where the index value is even, else 0.
            'label': (df_raw.index % 2 == 0).astype('int64'),
        })

    # Insert further preprocessing (text cleaning, label encoding, ...) here if needed.

    print(f"Total data size (Single JSON): {len(df_combined)}")

    # Make sure the required columns (text, label) are present.
    if 'text' not in df_combined.columns or 'label' not in df_combined.columns:
        raise ValueError("DataFrame must contain 'text' and 'label' columns.")

    return df_combined

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw texts (each item is passed through ``str``).
        labels: Sequence of integer class labels, one per text.
        tokenizer: Callable tokenizer accepting the Hugging Face style keyword
            arguments used in ``__getitem__`` and returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise as lists so ``self.texts[i]`` is always positional, even
        # when a pandas Series with a non-default index is passed in.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized example at position ``item``.

        Returns:
            dict with the original ``text``, 1-D ``input_ids`` and
            ``attention_mask`` tensors, and ``labels`` as a ``torch.long`` scalar.
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__``, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            # flatten() collapses the tokenizer's tensors to 1-D for batching.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Prepare the data.
df = load_and_preprocess_data()

# Model settings (e.g. a Korean BERT-based model).
MODEL_NAME = "klue/bert-base"
MAX_LEN = 128
BATCH_SIZE = 16

# Initialise the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Split into train/validation sets.
# Adjust the stratify option if labels are missing or imbalanced.
df_train, df_val = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['label'],
)


def _make_text_dataset(frame):
    """Wrap the ``text``/``label`` columns of ``frame`` in a ``TextDataset``."""
    return TextDataset(
        texts=frame['text'].tolist(),
        labels=frame['label'].tolist(),
        tokenizer=tokenizer,
        max_len=MAX_LEN,
    )


# Create the Datasets and DataLoaders (the JSON-saving step is omitted).
train_dataset = _make_text_dataset(df_train)
val_dataset = _make_text_dataset(df_val)

# Only the training loader shuffles; validation keeps its order.
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
class AACommuModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output logits produced by the head.
        model_name: Identifier passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, n_classes, model_name):
        super().__init__()
        # Pretrained BERT encoder; return_dict=False makes it return a tuple.
        self.bert = BertModel.from_pretrained(model_name, return_dict=False)
        self.drop = nn.Dropout(p=0.3)
        # Output layer mapping the encoder's hidden size to class logits.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids."""
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        # Only the second element (the pooled output) feeds the classifier.
        _, pooled = encoder_outputs
        return self.out(self.drop(pooled))

# Build the classifier with one output per distinct label value and move it
# onto the selected device.
N_CLASSES = df['label'].nunique()
model = AACommuModel(N_CLASSES, MODEL_NAME).to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    """Run one training pass over ``data_loader`` and report accuracy and loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores of shape (batch, n_classes).
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to.
        n_examples: Denominator for the accuracy (expected to be the number of
            examples the loader yields in total).

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor and
        mean_loss is a Python float averaged over batches.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model = model.train()
    losses = []
    correct_predictions = 0

    # Clear gradients left over from an earlier (e.g. interrupted) run so they
    # cannot leak into the first update of this epoch.
    optimizer.zero_grad()

    for step, d in enumerate(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Predicted class = index of the highest score per row.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        # Gradient clipping.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

        if step % 100 == 0:
            print(f"  Step {step}: Loss {loss.item():.4f}")

    if not losses:
        # Without this guard an empty loader fails on ``int.double()``.
        raise ValueError("data_loader yielded no batches; cannot compute epoch metrics.")

    return correct_predictions.double() / n_examples, sum(losses) / len(losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores of shape (batch, n_classes).
        data_loader: Iterable of batches; each batch is a mapping with
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.
        n_examples: Denominator for the accuracy (expected to be the number of
            examples the loader yields in total).

    Returns:
        Tuple ``(accuracy, mean_loss)``: accuracy is a 0-dim float64 tensor and
        mean_loss is a Python float averaged over batches.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    # Puts the model in evaluation mode; it is left in that mode on return.
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted class = index of the highest score per row.
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    if not losses:
        # Without this guard an empty loader fails on ``int.double()``.
        raise ValueError("data_loader yielded no batches; cannot compute evaluation metrics.")

    return correct_predictions.double() / n_examples, sum(losses) / len(losses)

In [None]:
# Hyperparameters.
EPOCHS = 4
LEARNING_RATE = 2e-5

# Optimizer and loss function.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss().to(device)

# Per-epoch metric log and the best validation accuracy seen so far.
history = {key: [] for key in ('train_loss', 'train_acc', 'val_loss', 'val_acc')}
best_accuracy = 0

print("\n=== Start Model Training ===")

for epoch in range(EPOCHS):
    print(f'\n[Epoch {epoch + 1}/{EPOCHS}]')

    # Train for one epoch.
    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        n_examples=len(df_train),
    )
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}')

    # Validate on the held-out split.
    val_acc, val_loss = eval_model(
        model=model,
        data_loader=val_data_loader,
        loss_fn=loss_fn,
        device=device,
        n_examples=len(df_val),
    )
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}')

    # Record the metrics; accuracies are tensors, so store plain floats.
    for _metric, _value in (
        ('train_loss', train_loss),
        ('train_acc', train_acc.item()),
        ('val_loss', val_loss),
        ('val_acc', val_acc.item()),
    ):
        history[_metric].append(_value)

    # Keep the weights of the best model so far (by validation accuracy).
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'AACommu_model_best_weights.pt')
        best_accuracy = val_acc
        print("Model weights saved: AACommu_model_best_weights.pt")

print("\n=== Training Complete ===")