In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AdamW, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed-fixing helper
def fix_seed(seed):
    """Seed every RNG this notebook draws from and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy's global RNG
            and PyTorch (CPU and every CUDA device).
    """
    random.seed(seed)
    # pandas' ``sample`` and scikit-learn's ``train_test_split`` fall back to
    # NumPy's global RNG, so it has to be seeded as well.
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark=True lets cuDNN auto-select kernels per run, which defeats the
    # deterministic setting above; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore

In [3]:
# Memory control
# The CUDA caching allocator takes its options from the PYTORCH_CUDA_ALLOC_CONF
# environment variable; binding a Python variable of that name alone has no
# effect, so the value is exported to the process environment here.
# NOTE(review): presumably this must happen before the first CUDA allocation
# in the kernel — confirm, and restart the kernel if CUDA was already in use.
PYTORCH_CUDA_ALLOC_CONF='max_split_size_mb:128'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = PYTORCH_CUDA_ALLOC_CONF
torch.cuda.empty_cache()
# Hyperparameter
CFG = {
    'seed':0,
    # Use CUDA when it is available, otherwise fall back to the CPU
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    # Preprocessing
    'valid_size':0.05,
    # Training / validation
    'batch_size':1,
    'LR' : 2e-5, # Learning Rate
    'EPOCHS' : 10, # Number of training epochs
}
fix_seed(CFG['seed'])

## 전처리

In [3]:
# NOTE: earlier dataset version, kept commented out for reference only.
# The notebook instantiates GPTDataset_v2 instead of this class.
# tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2', eos_token='</s>')
# class GPTDataset(Dataset):
#     def __init__(self,csv=pd.DataFrame):
#         # Format the data and tokenize it
#         formatted_data = []
#         for _, row in tqdm(csv.iterrows()):
#             for q_col in ['질문_1', '질문_2']:
#                 for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
#                     # Join each question/answer pair with the </s> token
#                     input_text = row[q_col] + tokenizer.eos_token +row[a_col]
#                     input_ids = tokenizer.encode(input_text, return_tensors='pt')[0].to(torch.int64)
#                     formatted_data.append(input_ids)
#         print('Done.')
#         self.data = formatted_data
#         tokenizer.save_pretrained("./hansoldeco-kogpt2")
#     def __len__(self):
#         return len(self.data)
#     def __getitem__(self,idx):
#         return self.data[idx]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [4]:
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2', eos_token='</s>')
class GPTDataset_v2(Dataset):
    """Tokenized question/answer sequences built from the training frame.

    For every row and every answer column one training text is produced:
    ``질문_2 + </s> + answer``. With probability 0.4 the question is first
    joined (via a connective from ``cont_word``) with the ``질문_2`` of another
    randomly chosen row of the same ``category``, and that row's answer from
    the same answer column is appended after the original answer.

    Args:
        csv: DataFrame providing the columns ``질문_2``, ``category`` and
            ``답변_1`` .. ``답변_5``.

    Raises:
        TypeError: If no DataFrame is supplied.

    Side effects:
        Saves the tokenizer to ``./hansoldeco-kogpt2``.
    """
    def __init__(self, csv: pd.DataFrame = None):
        if csv is None:
            raise TypeError("GPTDataset_v2 requires a pandas DataFrame as `csv`")
        cont_word = [' 또한, ', ' 그리고 ']
        answer_cols = ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']

        # Positional row numbers grouped by category, computed once instead of
        # re-filtering the whole frame for every (row, answer column) pair.
        rows_by_category = {}
        for pos, category in enumerate(csv['category'].tolist()):
            rows_by_category.setdefault(category, []).append(pos)

        # Format the data and tokenize it
        formatted_data = []
        for idx,(_, row) in tqdm(enumerate(csv.iterrows()), total=len(csv)):
            # Other rows of the same category; empty when the row is the only
            # member of its category (or the category is missing).
            partners = [pos for pos in rows_by_category.get(row['category'], []) if pos != idx]
            for a_col in answer_cols:
                # Join question and answer with the </s> token
                input_text = row['질문_2']
                # Chain a second question with probability 0.4 (randint in 1..4).
                # Python's seeded `random` picks the partner so that the
                # augmentation is reproducible under fix_seed.
                if partners and random.randint(1,10)<5:
                    next_q = csv.iloc[random.choice(partners)]
                    input_text += (random.choice(cont_word)+next_q['질문_2']+tokenizer.eos_token +row[a_col]+' '+next_q[a_col])
                else:
                    input_text+= (tokenizer.eos_token +row[a_col])
                input_ids = tokenizer.encode(input_text, return_tensors='pt')[0].to(torch.int64)
                formatted_data.append(input_ids)
        print('Done.')
        self.data = formatted_data
        tokenizer.save_pretrained("./hansoldeco-kogpt2")
    def __len__(self):
        """Return the number of tokenized sequences."""
        return len(self.data)
    def __getitem__(self,idx):
        """Return the 1-D int64 token-id tensor stored at position ``idx``."""
        return self.data[idx]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


## DataLoader

In [5]:
# Load the training data
data = pd.read_csv('./train.csv')
# Constructing the dataset tokenizes every question/answer text up front
dataset = GPTDataset_v2(data)

0it [00:00, ?it/s]

644it [00:04, 156.95it/s]

Done.





In [6]:
# Pass the configured seed so the train/validation split is identical on every
# run; without random_state the split changes each time the cell is executed.
train_dataset,valid_dataset = train_test_split(dataset,test_size=CFG['valid_size'],random_state=CFG['seed'])
print(f'train: {len(train_dataset)}, valid: {len(valid_dataset)}')
train_loader = DataLoader(train_dataset,batch_size=CFG['batch_size'], shuffle=True)
valid_loader = DataLoader(valid_dataset,batch_size=CFG['batch_size'], shuffle=False)

train: 3059, valid: 161


## Model

In [7]:
# Load the pretrained 'skt/kogpt2-base-v2' checkpoint
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
# Resize the token embeddings to the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

  return self.fget.__get__(instance, owner)()


Embedding(51200, 768)

## Train

In [8]:
def validation(model, valid_loader,epoch):
    """Evaluate ``model`` on ``valid_loader`` and return the mean batch loss.

    Args:
        model: Model called as ``model(batch, labels=batch)`` whose output
            exposes a ``loss`` attribute.
        valid_loader: Iterable yielding token-id tensors.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        The sum of the per-batch losses divided by the number of batches that
        were actually iterated, or ``nan`` if the loader yielded nothing.
    """
    model.eval()
    valid_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        progress_bar = tqdm(enumerate(valid_loader))
        for batch_idx, batch in progress_bar:
            # Move the batch to the configured device
            batch = batch.to(CFG['device'])
            outputs = model(batch, labels=batch)
            loss = outputs.loss

            valid_loss += loss.item()
            n_batches = batch_idx + 1

            # Show the running average loss on the progress bar
            progress_bar.set_description(f"<Validation> Epoch {epoch+1} - Avg Loss: {valid_loss / (batch_idx+1):.4f}")
    if n_batches == 0:
        # nan never compares as "better", so an empty loader cannot trigger a
        # checkpoint save in the caller.
        return float('nan')
    # Each accumulated value is already a per-batch mean, so the divisor is the
    # batch count, not the sample count of a global dataset.
    return valid_loss / n_batches

In [9]:
def train(model,train_loader,valid_loader):
    """Fine-tune ``model`` and keep the checkpoint with the lowest validation loss.

    Runs ``CFG['EPOCHS']`` epochs of language-model training (the input ids
    double as labels), validates after every epoch, decays the learning rate
    linearly once per epoch, and saves the model to ``./hansoldeco-kogpt2``
    whenever the validation loss improves on the best value seen so far.

    Args:
        model: Model to train; moved to ``CFG['device']``.
        train_loader: Iterable yielding training token-id tensors.
        valid_loader: Iterable passed to ``validation`` after each epoch.
    """
    model.to(CFG['device']) # Move the model to the configured device
    optimizer = AdamW(model.parameters(), lr=CFG['LR'])
    # Linear decay with one scheduler step per epoch, hence
    # num_training_steps is the epoch count.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = CFG['EPOCHS'])
    best_score = float('inf')
    # Training loop
    for epoch in range(CFG['EPOCHS']):
        model.train()
        total_loss = 0.0
        n_batches = 0
        progress_bar = tqdm(enumerate(train_loader))
        for batch_idx, batch in progress_bar:
            # Move the batch to the configured device
            batch = batch.to(CFG['device'])
            outputs = model(batch, labels=batch)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            n_batches = batch_idx + 1

            # Show the running average loss on the progress bar
            progress_bar.set_description(f"<Train> Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

        # Validation
        val_loss = validation(model,valid_loader, epoch)
        # Scheduler step: the linear schedule takes no metric. Passing the
        # loss here would be interpreted as an epoch index and corrupt the LR.
        scheduler.step()
        # Report the epoch's average losses (per-batch mean over iterated batches)
        print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Train Loss: {total_loss / max(n_batches, 1):.4f}, Valid Loss: {val_loss:.4f}")
        if val_loss<best_score:
            # Remember the new best score so later, worse epochs do not
            # overwrite the saved checkpoint.
            best_score = val_loss
            # Save the model
            print('New Minimum Valid Loss!')
            print("..save current best model..")
            model.save_pretrained("./hansoldeco-kogpt2")


In [None]:
# Run fine-tuning; train() writes model checkpoints to ./hansoldeco-kogpt2
train(model, train_loader,valid_loader)

## Inference

In [None]:
def inference(model_dir="./hansoldeco-kogpt2", test_csv='./test.csv',
              output_csv='./ref_v1+train_texts_150.csv',
              question_prefix='', answer_prefix=''):
    """Generate one answer per test question with the fine-tuned model.

    The prompt is ``question_prefix + question + </s> + answer_prefix``. The
    default empty prefixes reproduce the ``question</s>answer`` layout that
    ``GPTDataset_v2`` trains on; pass e.g. ``'[Q] '`` / ``'[A] '`` only for a
    checkpoint trained with such tags.

    Args:
        model_dir: Directory holding the saved model and tokenizer (the
            location ``train`` and the dataset class save to).
        test_csv: CSV file with a ``질문`` column.
        output_csv: Where the (prompt, answer) pairs are written for inspection.
        question_prefix: Text placed before each question.
        answer_prefix: Text placed after the ``</s>`` separator.

    Returns:
        List of generated answers (newlines replaced by spaces), one per
        question in ``test_csv``.
    """
    # Load the saved fine-tuned model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    model.to(CFG['device'])
    model.eval()
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir,eos_token='</s>')
    model.resize_token_embeddings(len(tokenizer))

    # Load the test questions
    test = pd.read_csv(test_csv)

    # Answers to return, and (prompt, answer) pairs to dump for inspection
    preds = []
    gen_texts=[]

    # Generate an answer for every question in the '질문' column
    for test_question in tqdm(test['질문']):
        # Tokenize the prompt into model input
        prompt = question_prefix + test_question + tokenizer.eos_token + answer_prefix
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to(CFG['device'])
        prompt_len = input_ids.shape[-1]
        # Generate the answer.
        # NOTE: top_k=1 keeps only the most likely token, so despite
        # do_sample=True the decoding is effectively greedy.
        # NOTE: max_length counts the prompt tokens as well.
        output_sequences = model.generate(
            input_ids=input_ids,
            max_length=150,
            temperature=0.9,
            top_k=1,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True,
            num_return_sequences=1,
            # Explicit pad id; generate() otherwise falls back to EOS with a warning
            pad_token_id=tokenizer.eos_token_id,
        )

        # Store the generated text
        for generated_sequence in output_sequences:
            # Split on the token boundary of the prompt rather than on a
            # hard-coded character offset, and drop special tokens so that a
            # trailing </s> never ends up inside the answer.
            prompt_text = tokenizer.decode(generated_sequence[:prompt_len], skip_special_tokens=False)
            answer_only = tokenizer.decode(generated_sequence[prompt_len:], skip_special_tokens=True).strip()
            gen_texts.append([prompt_text,answer_only])
            answer_only = answer_only.replace('\n', ' ')
            preds.append(answer_only)
    pd.DataFrame(gen_texts).to_csv(output_csv, index=False,encoding='utf-8-sig')
    return preds

In [None]:
# Generate the answers for the test questions; preds is a list of strings
preds = inference()

In [None]:
# Extract a 512-dimensional embedding vector from the answer to every test question.
# The evaluation uses the 'distiluse-base-multilingual-cased-v1' model for the
# embeddings, so make sure this exact model is used.
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the embedding model (distiluse-base-multilingual-cased-v1).
# It gets its own name so the global `model` (the GPT-2 network) is not
# overwritten and earlier cells stay re-runnable.
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Embed every generated answer
pred_embeddings = embedding_model.encode(preds)

# Build the submission file
submit = pd.read_csv('./sample_submission.csv')
# Fail early with a clear message if the answers do not line up with the template rows
assert len(pred_embeddings) == len(submit), (
    f"{len(pred_embeddings)} embeddings for {len(submit)} submission rows"
)
# Fill the sample_submission.csv template with the embedding vectors
submit.iloc[:,1:] = pred_embeddings
# Write the csv file for the leaderboard submission
submit.to_csv('./baseline_submit.csv', index=False)