In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm.auto import tqdm
import math
import unicodedata
import re
import gc
import torch.nn as nn
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
import CustomBert
from CustomBert import CustomBertForMaskedLM
from CustomBert import CustomBertConfig
from WordPieceTokenizer import WordPieceTokenizer as Tokenizer
import os
import matplotlib.pyplot as plt
from datasets import load_dataset
from PretrainDataset import TokenizedDataset
from PretrainDataset import CustomDataCollatorForMLM

In [2]:
def group_texts(examples, tokenizer, MAX_SEQUENCE_LENGTH):
    import torch
    concatenated_text = " ".join(examples["text"])

    encoded_output = tokenizer.encode(
        concatenated_text,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        padding=True,
        add_special_tokens=True
    )
    
    return {
        "input_ids": [torch.tensor(encoded_output['input_ids'], dtype=torch.long)],
        "attention_mask": [torch.tensor(encoded_output['attention_mask'], dtype=torch.long)],
        "token_type_ids": [torch.tensor(encoded_output['token_type_ids'], dtype=torch.long)],
    }

In [3]:
tokenizer = Tokenizer(vocab_file_path="saves/vocab.txt",do_lower_case=False,strip_accents=False,clean_text=True)
VOCAB_SIZE = tokenizer.get_vocab_size()
MAX_SEQUENCE_LENGTH = 128
BATCH_SIZE = 16

num_cpu_cores = os.cpu_count()
num_processes_to_use = num_cpu_cores if num_cpu_cores is not None else 1

datasetsPath = 'datasets/'
PREPROCESSED_TEXT_DIR = f'{datasetsPath}preprocess_wiki_text'

text_files = [os.path.join(PREPROCESSED_TEXT_DIR, f) for f in os.listdir(PREPROCESSED_TEXT_DIR) if f.endswith('.txt')]

if not text_files:
    exit()
print(f'총 {len(text_files)}개의 텍스트 파일 로드 시작...')
raw_dataset = load_dataset("text", data_files={"train": text_files}, split="train")
print(f"원시 데이터셋 로드 완료. 총 {len(raw_dataset)}개의 샘플.")
    
print('데이터셋 토큰화 및 청킹 시작...')
tokenized_dataset = raw_dataset.map(
    group_texts,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
    fn_kwargs={"tokenizer":tokenizer,"MAX_SEQUENCE_LENGTH":MAX_SEQUENCE_LENGTH},
    desc=f"맵핑 데이터셋 (토큰화 및 청킹, 최대 길이 {MAX_SEQUENCE_LENGTH})"
)
print(len(tokenized_dataset))


총 1개의 텍스트 파일 로드 시작...


Loading dataset shards:   0%|          | 0/39 [00:00<?, ?it/s]

원시 데이터셋 로드 완료. 총 3025090개의 샘플.
데이터셋 토큰화 및 청킹 시작...
3028


In [4]:
print("토큰화된 데이터셋 저장 중...")
tokenized_dataset_path = "datasets/tokenized_dataset"
tokenized_dataset.save_to_disk(tokenized_dataset_path)
print(f"토큰화된 데이터셋이 '{tokenized_dataset_path}'에 저장되었습니다.")

토큰화된 데이터셋 저장 중...


Saving the dataset (0/1 shards):   0%|          | 0/3028 [00:00<?, ? examples/s]

토큰화된 데이터셋이 'datasets/tokenized_dataset'에 저장되었습니다.


In [5]:
full_dataset = TokenizedDataset(tokenized_dataset)

train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size,val_size])

data_collator = CustomDataCollatorForMLM(tokenizer=tokenizer, mlm_probability=0.15)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=0
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=0
)

print(f"PyTorch DataLoader 생성 완료. 배치 크기: {BATCH_SIZE}")
print(f"총 훈련 배치 수: {len(train_dataloader)}")
print(f"총 검증 배치 수: {len(val_dataloader)}")

PyTorch DataLoader 생성 완료. 배치 크기: 16
총 훈련 배치 수: 152
총 검증 배치 수: 38


In [6]:
print("--- 데이터셋 토큰화 및 디코딩 확인 ---")

for i, batch in enumerate(val_dataloader):
    if i >= 1:
        break

    print(f"\n--- Batch {i+1} ---")
    
    input_ids = batch["input_ids"]
    for j in range(input_ids.shape[0]):
        print(f"\nSample {j+1} Input IDs (Original):")
        print(input_ids[j].tolist())
        decoded_input = tokenizer.decode(input_ids[j].tolist())
        print(f"Decoded Input: {decoded_input}")

        labels = batch["labels"]
        print(f"Sample {j+1} Labels (Original):")
        print(labels[j].tolist())
        
        decoded_label = tokenizer.decode(labels[j].tolist())
        print(f"Decoded Labels (masked tokens only): {decoded_label}")
        
        print(f"--- Masked Token Check for Sample {j+1} ---")
        input_ids_list = input_ids[j].tolist()
        labels_list = labels[j].tolist()
        
        masked_positions_and_labels = []
        for k, (input_id, label_id) in enumerate(zip(input_ids_list, labels_list)):
            if label_id != -100:
                masked_token = tokenizer.decode([input_id])
                original_token = tokenizer.decode([label_id])
                masked_positions_and_labels.append(f"Pos {k}: '{masked_token}' was masked, should be '{original_token}'")
        
        if masked_positions_and_labels:
            for item in masked_positions_and_labels:
                print(item)
        else:
            print("No masked tokens with labels found in this sample (all -100).")

print("\n--- 디코딩 확인 완료 ---")

--- 데이터셋 토큰화 및 디코딩 확인 ---

--- Batch 1 ---

Sample 1 Input IDs (Original):
[2, 17633, 28309, 2080, 31312, 1175, 2743, 8166, 7, 4, 4716, 12702, 4, 3155, 9451, 11011, 1, 2280, 535, 4, 12205, 1, 3182, 3564, 8076, 2021, 297, 2166, 2135, 2492, 5434, 2870, 31312, 1175, 2743, 8166, 17633, 28309, 2080, 4, 4, 2743, 4, 4, 4, 230, 9, 4, 26920, 12702, 2198, 4734, 4647, 2080, 7, 1062, 4716, 12702, 7280, 3155, 9451, 1, 5671, 5744, 1140, 1, 26920, 12702, 26989, 2749, 5281, 4, 22542, 14356, 1, 983, 1028, 2124, 2266, 2351, 4, 4, 5656, 6779, 1123, 12087, 6108, 4737, 4191, 2000, 3698, 2003, 2198, 4, 1, 7473, 11011, 6978, 4737, 2172, 19427, 2172, 63, 4, 19165, 5415, 26920, 12702, 4, 7331, 4, 2074, 2280, 5626, 1, 7909, 4, 2217, 4737, 30346, 4737, 4191, 7909, 4, 2711, 4, 2080, 3]
Decoded Input: [CLS] 시리즈 탄소중립을 위한 바이오산업의 새로운 도전 '[MASK]이트바이오[MASK]협력 협의체 발족[UNK] 개최 생[MASK] 플라스틱[UNK] 규제개선 이루어질 지원 등 개발 연구 협력모델 발굴 바이오산업의 새로운 도전 시리즈 탄소중립을 위한[MASK][MASK] 새로운[MASK][MASK][MASK] 는 .[MASK] 화이트바이오 산업 경쟁력 강화를 위한 '화이트바이오 

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_SAVE_PATH = "saves/Pretrain.pt"

HIDDEN_SIZE = 768
NUM_HIDDEN_LAYERS = 12
NUM_ATTENTION_HEADS = 12
INTERMEDIATE_SIZE = 3072
TYPE_VOCAB_SIZE = 2
DROPOUT_PROB = 0.1

config = CustomBertConfig(
    VOCAB_SIZE=VOCAB_SIZE,
    HIDDEN_SIZE=HIDDEN_SIZE,
    NUM_HIDDEN_LAYERS=NUM_HIDDEN_LAYERS,
    NUM_ATTENTION_HEADS=NUM_ATTENTION_HEADS,
    INTERMEDIATE_SIZE=INTERMEDIATE_SIZE,
    MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH,
    TYPE_VOCAB_SIZE=TYPE_VOCAB_SIZE,
    DROPOUT_PROB=DROPOUT_PROB
)

model = CustomBertForMaskedLM(config)

if os.path.exists(MODEL_SAVE_PATH):
    print("모델 가중치 로드 중...")
    # 먼저 CPU에 로드한 후 모델에 로드합니다.
    loaded_state_dict = torch.load(MODEL_SAVE_PATH, map_location='cpu')
    model.load_state_dict(loaded_state_dict)
    print("모델 가중치 로드 완료.")
else:
    print("새로운 모델 초기화 완료. 저장된 가중치를 찾을 수 없습니다.")

model.to(device)

num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Custom Bert 모델 초기화 완료. 총 학습 가능 파라미터 수 : {num_params}')
print(f'모델이 담긴 장치 : {device}')

모델 가중치 로드 중...
모델 가중치 로드 완료.
Custom Bert 모델 초기화 완료. 총 학습 가능 파라미터 수 : 110946560
모델이 담긴 장치 : cuda


In [7]:
EPOCHS = 50
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.1
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_dataloader) * EPOCHS
WARMUP_STEPS = int(total_steps * 0.1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS,num_training_steps=total_steps)
print(f"총 학습 스텝 수: {total_steps}")
print(f"워밍업 스텝 수: {WARMUP_STEPS}")

총 학습 스텝 수: 7600
워밍업 스텝 수: 760


In [12]:
LOSS_PATH = f"saves/Pretrain_loss.txt"
BEST_ACCURACY_PATH = f"saves/Pretrain_best_accuracy.txt"
best_val_accuracy = 0.0
if os.path.exists(BEST_ACCURACY_PATH):
    with open(BEST_ACCURACY_PATH, 'r') as f:
        content = f.read()
        if content.strip():
            best_val_accuracy = float(content.strip())
            print(f"이전 최고 검증 정확도: {best_val_accuracy:.4f} 불러옴.")

if os.path.exists(LOSS_PATH):
    with open(LOSS_PATH,'r') as f:
        content = f.read()
        if content.strip():
            prev_loss = float(content.strip())
else:
    prev_loss = 100

train_losses = []
train_accuarcy = []
scaler = torch.cuda.amp.GradScaler()

print(f"\n<--- 학습 시작 ---> ({EPOCHS} 에폭)")

for e in range(EPOCHS):
    model.train()
    loss_sum = 0
    progress_bar = tqdm(train_dataloader, desc=f"Pre-train Epoch {e+1}")

    for step, batch in enumerate(progress_bar):
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
                labels=batch["labels"]
            )
            loss = outputs["loss"]

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        scheduler.step()

        loss_sum += loss.item()
        progress_bar.set_postfix({'loss': f"{(loss_sum/(step+1)):.4f}"})

        del outputs, loss
        if 'cuda' in str(device):
            torch.cuda.empty_cache()
        gc.collect()

    avg_train_loss = loss_sum / len(train_dataloader)
    train_losses.append(avg_train_loss)

    print(f"Pre-train Epoch {e+1} 완료. 평균 학습 손실: {avg_train_loss:.4f}")

    model.eval()
    total_val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    val_progressbar = tqdm(val_dataloader, desc=f"Pre-train Epoch {e+1} Valid")
    
    with torch.no_grad():
        for step, batch in enumerate(val_progressbar):        
            batch = {k: v.to(device) for k, v in batch.items()}

            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    labels=batch["labels"]
                )
                loss = outputs["loss"]
                logits = outputs["logits"]
                
            total_val_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)

            active_labels = batch["labels"].view(-1)
            active_predictions = predictions.view(-1)

            mask = (active_labels != -100)

            correct_predictions += (active_predictions[mask] == active_labels[mask]).sum().item()
            total_predictions += mask.sum().item()

            val_progressbar.set_postfix({'val_loss': f"{(total_val_loss/(step+1)):.4f}"})

            del outputs, loss, logits, predictions, active_labels, active_predictions, mask # 메모리 해제
            if 'cuda' in str(device):
                torch.cuda.empty_cache()
            gc.collect()
        avg_val_loss = total_val_loss / len(val_dataloader)
        val_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0.0
        if step % 100 == 0: 
            print(f"\n--- Batch {step} Debug Info ---")
            print("predictions sample (masked):", predictions.view(-1)[mask][:10]) # 예측값 샘플
            print("active_labels sample (masked):", active_labels[mask][:10])   # 실제 레이블 샘플
            print("total_predictions for this batch:", mask.sum().item()) # 이 배치에서 계산에 포함된 토큰 수
            print("---------------------------------")
        train_accuarcy.append(val_accuracy)
        print(f"Pre-train Epoch {e+1} 완료. 평균 검증 손실: {avg_val_loss:.4f}, 검증 정확도: {(val_accuracy*100):.2f}%")

        if val_accuracy > best_val_accuracy:
            torch.save(model.state_dict(), MODEL_SAVE_PATH)
            best_val_accuracy = val_accuracy
            print(f"새로운 최고 검증 정확도 {(best_val_accuracy*100):.2f}% 달성! 모델 가중치 '{MODEL_SAVE_PATH}' 저장 완료.")
            
            with open(BEST_ACCURACY_PATH, 'w') as f:
                f.write(str(best_val_accuracy))
            print(f"최고 검증 정확도 '{BEST_ACCURACY_PATH}' 저장 완료.")
        else:
            print(f"현재 검증 정확도 ({(val_accuracy*100):.2f}%)는 최고 정확도 ({(best_val_accuracy*100):.2f}%)보다 낮습니다. 모델을 저장하지 않습니다.")

print("\n<--- 학습 완료 --->")

이전 최고 검증 정확도: 0.0137 불러옴.

<--- 학습 시작 ---> (50 에폭)


  scaler = torch.cuda.amp.GradScaler()


Pre-train Epoch 1:   0%|          | 0/152 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():


KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(range(1, EPOCHS + 1), train_losses, marker='o', linestyle='-', color='b')
plt.title('Pre-training Learning Curve')
plt.xlabel('Epoch')
plt.ylabel('Average Training Loss')
plt.grid(True)
plt.xticks(range(1, EPOCHS + 1)) # x축 눈금 에폭 수에 맞춰 표시
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(range(1, EPOCHS + 1), train_accuarcy, marker='o', linestyle='-', color='b')
plt.title('Pre-training Learning Curve')
plt.xlabel('Epoch')
plt.ylabel('Average Training Accuarcy')
plt.grid(True)
plt.xticks(range(1, EPOCHS + 1)) # x축 눈금 에폭 수에 맞춰 표시
plt.show()

In [None]:
print(train_losses)

In [None]:
print(train_accuarcy)