##### 0. 라이브러리 임포트

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import matplotlib.pyplot as plt
import time
import os


  from .autonotebook import tqdm as notebook_tqdm


##### 1. 데이터 준비

In [1]:

# Example dataset (placeholder sentences; replace with the real corpus).
texts = [
    "사업개요에 대한 문장들...",
    "과업범위에 대한 문장들...",
]
# Y_label values mapped to integer class ids, aligned index-by-index with texts.
labels = [
    0,
    1,
]

# Dataset class definition
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw input strings.
        labels: Sequence of integer class ids, aligned index-by-index with
            ``texts``.
        tokenizer: Object exposing ``encode_plus``; it is called with the
            keyword arguments shown in ``__getitem__``.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail fast: a length mismatch would otherwise surface later as an
        # IndexError while iterating, or silently ignore surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its model inputs and label.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (flattened to
            1-D) and ``'labels'`` (a ``torch.long`` scalar tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # With return_tensors='pt' the tokenizer presumably returns tensors
        # with a leading batch axis of size 1; flatten() drops it so the
        # DataLoader can stack samples into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


NameError: name 'Dataset' is not defined

##### 2. 데이터 어노테이션 (모델·토크나이저 로드 및 데이터셋 분할)

In [None]:

# Load the pretrained multilingual BERT tokenizer and classification model.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=10,
)

# Build the dataset.
dataset = CustomDataset(texts=texts, labels=labels, tokenizer=tokenizer, max_len=128)

# Split into training (70%) and validation (remainder) subsets.
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
)

# Define the DataLoaders; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

##### 3. 함수정의

In [None]:

# Optimizer and loss function.
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW.
# NOTE(review): eps=1e-6 is passed to keep what is believed to be the default
# of the transformers implementation (torch's default is 1e-8) -- confirm.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, eps=1e-6)
criterion = torch.nn.CrossEntropyLoss()

# Training function
def train_epoch(model, data_loader, criterion, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and report loss/accuracy.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer that updates the model parameters.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_batch_loss, accuracy)``: the per-batch loss averaged
        over the batches seen (float) and the fraction of correctly
        classified samples (0-dim float64 tensor).

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no batches.
    """
    model = model.train()

    total_loss = 0
    correct_predictions = 0
    # Count what was actually processed instead of trusting
    # len(data_loader.dataset): the two differ when the loader drops the last
    # batch or samples a subset, and plain iterables have no .dataset at all.
    num_batches = 0
    num_samples = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        total_loss += loss.item()

        preds = logits.argmax(dim=1)
        correct_predictions += torch.sum(preds == labels)
        num_batches += 1
        num_samples += labels.numel()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return total_loss / num_batches, correct_predictions.double() / num_samples

# Evaluation function
def eval_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar
            loss tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_batch_loss, accuracy)``: the per-batch loss averaged
        over the batches seen (float) and the fraction of correctly
        classified samples (0-dim float64 tensor).

    Raises:
        ZeroDivisionError: If ``data_loader`` yields no batches.
    """
    model = model.eval()

    total_loss = 0
    correct_predictions = 0
    # Count what was actually processed instead of trusting
    # len(data_loader.dataset): the two differ when the loader drops the last
    # batch or samples a subset, and plain iterables have no .dataset at all.
    num_batches = 0
    num_samples = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = logits.argmax(dim=1)
            correct_predictions += torch.sum(preds == labels)
            num_batches += 1
            num_samples += labels.numel()

    return total_loss / num_batches, correct_predictions.double() / num_samples

##### 4. 모델구현 및 훈련

In [None]:
# Training loop: fine-tune for a fixed number of epochs, record the metrics
# of every epoch, and checkpoint the weights whenever validation accuracy
# improves.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

epochs = 3
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

best_val_acc = 0.0
best_val_loss = float('inf')
best_model_path = './best_model.pth'

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')

    # Train for one epoch and log its metrics.
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    train_losses.append(train_loss)
    train_accuracies.append(train_acc.item())
    print(f'Train loss: {train_loss:.4f}, Train accuracy: {train_acc:.4f}')

    # Evaluate on the validation split and log its metrics.
    val_loss, val_acc = eval_model(model, val_loader, criterion, device)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc.item())
    print(f'Validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}')

    # Checkpoint only when validation accuracy strictly improves.
    if not (val_acc > best_val_acc):
        continue
    best_val_acc = val_acc
    torch.save(model.state_dict(), best_model_path)
    print(f"New best model saved with validation accuracy: {val_acc:.4f}")

##### 5. 훈련과정 시각화

In [None]:
# Visualization: loss and accuracy curves side by side.
def _draw_panel(position, train_series, val_series, train_label, val_label, y_label, title):
    """Plot one train-vs-validation curve pair in subplot slot ``position``."""
    epoch_axis = range(1, epochs + 1)
    plt.subplot(1, 2, position)
    plt.plot(epoch_axis, train_series, label=train_label)
    plt.plot(epoch_axis, val_series, label=val_label)
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()


plt.figure(figsize=(12, 5))

_draw_panel(
    1, train_losses, val_losses,
    'Train Loss', 'Validation Loss', 'Loss', 'Loss over Epochs',
)
_draw_panel(
    2, train_accuracies, val_accuracies,
    'Train Accuracy', 'Validation Accuracy', 'Accuracy', 'Accuracy over Epochs',
)

plt.show()

##### 6. 모델 용량 확인

In [None]:
# Report the in-memory size of the model parameters in MB.
# Each parameter's own element size is used rather than assuming 4 bytes, so
# the figure stays correct if the model is cast to fp16/bf16 or mixes dtypes.
# Only parameters are counted; registered buffers are not included.
model_size = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**2)
print(f'Model size: {model_size:.2f} MB')

##### 7. 샘플당 추론시간 측정

In [None]:
# Measure the average inference latency for a single sample.
sample_text = "이 문장은 샘플 테스트 문장입니다."
# Named `inputs` so the builtin input() is not shadowed.
inputs = tokenizer(sample_text, return_tensors='pt', padding=True, truncation=True, max_length=128)
inputs = {key: value.to(device) for key, value in inputs.items()}

# CUDA kernels launch asynchronously; without synchronising, the timer would
# stop before the GPU has finished the work.
on_cuda = torch.device(device).type == 'cuda'

model.eval()
with torch.no_grad():
    # Warm-up pass so one-off initialisation cost is not counted in the average.
    model(**inputs)
    if on_cuda:
        torch.cuda.synchronize()

    # perf_counter is a monotonic, high-resolution clock meant for timing.
    start_time = time.perf_counter()
    for _ in range(100):  # repeat 100 times and average
        outputs = model(**inputs)
    if on_cuda:
        torch.cuda.synchronize()
    avg_inference_time = (time.perf_counter() - start_time) / 100

print(f'Average inference time per sample: {avg_inference_time:.6f} seconds')