In [33]:
import glob
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW, get_linear_schedule_with_warmup

In [34]:
# Seed-fixing helper
def fix_seed(seed):
    """Seed PyTorch's random number generators and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to the CPU generator and to every CUDA
            device generator.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    # Also cover the non-current GPUs; this is safe to call without CUDA.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # Benchmark mode autotunes kernels per run, which can select different
    # algorithms between runs and defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False  # type: ignore

In [35]:
# Memory control
# The allocator option is read from the process environment; assigning it to a
# plain Python variable (as before) has no effect. The variable is kept so the
# name stays available to later cells.
# NOTE(review): this must be in the environment before the CUDA caching
# allocator is first used — confirm it is honoured on your PyTorch version,
# since torch is already imported at this point.
PYTORCH_CUDA_ALLOC_CONF = 'max_split_size_mb:128'
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', PYTORCH_CUDA_ALLOC_CONF)
torch.cuda.empty_cache()
# Hyperparameters
CFG = {
    'seed': 0,
    # Use the GPU when one is available, otherwise fall back to the CPU
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Preprocessing: fraction of samples held out for validation
    'valid_size': 0.05,
    # Training / validation
    'batch_size': 1,
    'LR': 2e-5,  # Learning rate
    'EPOCHS': 10,  # Number of training epochs
}
fix_seed(CFG['seed'])

## 전처리

In [56]:
# Load the KoGPT2 tokenizer and register its special tokens, so that the copy
# saved below carries the eos/pad/unk definitions.
# NOTE(review): token strings follow the skt/kogpt2-base-v2 model card — confirm.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    'skt/kogpt2-base-v2',
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)


class Corpus_Dataset(Dataset):
    """Dataset of tokenized text samples for causal language-model training.

    Each input string is encoded with the module-level ``tokenizer``; samples
    that are too short are dropped, and overly long samples are truncated.

    Args:
        data: Iterable of raw text strings. ``None`` (the default) yields an
            empty dataset.
        max_length: Maximum number of tokens kept per sample. The default of
            1024 is assumed to match the model's context size — TODO confirm
            against ``model.config.n_positions``.
        min_tokens: Samples with this many tokens or fewer are discarded.
    """

    def __init__(self, data=None, max_length=1024, min_tokens=10):
        # Avoid a shared mutable default argument.
        if data is None:
            data = []
        # Format and tokenize the data
        formatted_data = []
        for input_text in data:
            input_ids = tokenizer.encode(
                input_text,
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
            )[0]
            if len(input_ids) > min_tokens:
                formatted_data.append(input_ids)
        print('Done.')
        self.data = formatted_data
        # Persist the tokenizer next to where the fine-tuned model is saved.
        tokenizer.save_pretrained("./hansoldeco-kogpt2")

    def __len__(self):
        """Return the number of retained samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the 1-D tensor of token ids for sample ``idx``."""
        return self.data[idx]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


## DataLoader

In [57]:
# Load the data
# glob returns paths in an OS-dependent order; sorting keeps the sample order
# (and therefore the seeded train/validation split) the same across machines.
txts = sorted(glob.glob('./ref_corpus/*data.txt'))
data = []
for txt in txts:
    with open(txt, 'r', encoding='utf-8') as f:
        sentences = f.readlines()
    # Each line of a corpus file is one training sample.
    data.extend(sentences)
    print(txt, len(sentences))
dataset = Corpus_Dataset(data)
print(len(dataset))

./ref_corpus\1 data.txt 242
./ref_corpus\2 data.txt 100
./ref_corpus\3 data.txt 338
./ref_corpus\4 data.txt 277
./ref_corpus\5 data.txt 696
./ref_corpus\6 data.txt 571
./ref_corpus\7 data.txt 98
./ref_corpus\8 data.txt 571
./ref_corpus\9 data.txt 189
Done.
2651


In [59]:
# Hold out a fraction of the samples for validation, using the configured seed.
train_dataset, valid_dataset = train_test_split(
    dataset,
    test_size=CFG['valid_size'],
    random_state=CFG['seed'],
)
print(f'train: {len(train_dataset)}, valid: {len(valid_dataset)}')

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(
    train_dataset,
    batch_size=CFG['batch_size'],
    shuffle=True,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=CFG['batch_size'],
    shuffle=False,
)

train: 2518, valid: 133


## Model

In [31]:
# Load the pretrained KoGPT2 causal language model.
model = GPT2LMHeadModel.from_pretrained(
    'skt/kogpt2-base-v2'
)
# Size the embedding matrix to the tokenizer's vocabulary; leaving the call as
# the last expression displays the resulting embedding layer as cell output.
model.resize_token_embeddings(
    len(tokenizer)
)

Embedding(51200, 768)

## Train

In [25]:
def validation(model, valid_loader, epoch):
    """Evaluate ``model`` on ``valid_loader`` and return the mean per-batch loss.

    Args:
        model: Model called as ``model(batch, labels=batch)`` whose output
            exposes a ``.loss`` attribute.
        valid_loader: Iterable of batches; each batch is moved to
            ``CFG['device']`` before the forward pass.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        The summed batch losses divided by the number of batches processed,
        or ``float('inf')`` when the loader yields no batches, so that an
        empty validation run is never treated as an improvement.
    """
    model.eval()
    valid_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        # Wrap the loader itself (not enumerate) so tqdm can show the total.
        progress_bar = tqdm(valid_loader)
        for batch_idx, batch in enumerate(progress_bar):
            # Move the batch to the configured device
            batch = batch.to(CFG['device'])
            outputs = model(batch, labels=batch)
            loss = outputs.loss

            valid_loss += loss.item()
            num_batches = batch_idx + 1

            # Show the running average loss in the progress bar
            progress_bar.set_description(f"<Validation> Epoch {epoch+1} - Avg Loss: {valid_loss / num_batches:.4f}")
    if num_batches == 0:
        return float('inf')
    # Average over the batches actually seen on this loader, rather than over
    # a module-level dataset that may not be the one passed in.
    return valid_loss / num_batches

In [84]:
def train(model, train_loader, valid_loader):
    """Fine-tune ``model`` and save the checkpoint with the lowest validation loss.

    Runs ``CFG['EPOCHS']`` epochs over ``train_loader`` with AdamW, gradient
    clipping and a linear learning-rate decay, validating after each epoch.

    Args:
        model: Model called as ``model(batch, labels=batch)`` whose output
            exposes ``.loss``; it is moved to ``CFG['device']``.
        train_loader: Loader of training batches (must support ``len``).
        valid_loader: Loader passed to ``validation`` after every epoch.
    """
    model.to(CFG['device'])  # Move the model to the configured device
    optimizer = AdamW(model.parameters(), lr=CFG['LR'])
    # The schedule is sized in optimizer steps (epochs * batches per epoch),
    # so it has to be advanced once per batch, not once per epoch.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=CFG['EPOCHS'] * len(train_loader),
    )
    best_score = float('inf')
    # Training loop
    for epoch in range(CFG['EPOCHS']):
        model.train()
        total_loss = 0.0
        num_batches = 0
        # Wrap the loader itself (not enumerate) so tqdm can show the total.
        progress_bar = tqdm(train_loader)
        for batch_idx, batch in enumerate(progress_bar):
            # Move the batch to the configured device
            batch = batch.to(CFG['device'])
            outputs = model(batch, labels=batch)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Advance the linear decay after every optimizer step. This
            # schedule takes no metric; passing the validation loss would be
            # interpreted as a step index.
            scheduler.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            num_batches = batch_idx + 1

            # Show the running average loss in the progress bar
            progress_bar.set_description(f"<Train> Epoch {epoch+1} - Avg Loss: {total_loss / num_batches:.4f}")

        # Validation
        val_loss = validation(model, valid_loader, epoch)
        # Report the epoch's average losses (per batch of the loader used)
        avg_train_loss = total_loss / max(num_batches, 1)
        print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Train Loss: {avg_train_loss:.4f}, Valid Loss: {val_loss:.4f}")
        if val_loss < best_score:
            # Remember the new best, so later and worse epochs do not
            # overwrite the saved checkpoint.
            best_score = val_loss
            # Save the model
            print('New Minimum Valid Loss!')
            print("..save current best model..")
            model.save_pretrained("./hansoldeco-kogpt2")


In [None]:
# Run fine-tuning with the model and data loaders built in the cells above.
train(model, train_loader,valid_loader)