In [20]:
import os
os.getcwd()

'/home/ubuntu/Project/SJ'

In [1]:
import pandas as pd
data = pd.read_excel('/home/ubuntu/Project/SJ/Data9.xlsx')

In [2]:
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2", bos_token='</s>', eos_token='</s>',
    unk_token='<unk>', pad_token='<pad>', mask_token='<mask>')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [3]:
from torch.utils.data import Dataset
class ChatDataset(Dataset):
    def __init__(self, conversations, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.inputs = []
        self.attention_masks = []

        for conv in conversations:
            # 질문과 답변을 결합
            combined_text = f"{conv['question']} {tokenizer.eos_token} {conv['answer']} {tokenizer.eos_token}"

            # 토큰화
            tokenized_text = tokenizer(combined_text, truncation=True, max_length=max_length, padding="max_length")

            self.inputs.append(torch.tensor(tokenized_text['input_ids']))
            self.attention_masks.append(torch.tensor(tokenized_text['attention_mask']))
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.attention_masks[idx]

In [4]:
# 데이터셋 변환
def build_dataset(data, tokenizer):
    conversations = []
    for i in range(len(data) - 1):
        if data.iloc[i]['QA여부'] == 'q' and data.iloc[i+1]['QA여부'] == 'a':
            conversations.append({
                'question': data.iloc[i]['발화문'],
                'answer': data.iloc[i+1]['발화문']
            })
    return conversations



In [6]:
conversations = build_dataset(data, tokenizer)

In [7]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# 전체 데이터셋을 학습 데이터셋과 검증 데이터셋으로 분리
train_conversations, val_conversations = train_test_split(conversations, test_size=0.1)

# 학습 및 검증 데이터셋을 위한 DataLoader 생성
train_dataset = ChatDataset(train_conversations, tokenizer)
val_dataset = ChatDataset(val_conversations, tokenizer)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [8]:
from transformers import AdamW
from tqdm import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)

# 학습 설정
epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [9]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        # 배치 데이터를 GPU로 옮김 (필요한 경우)
        inputs, masks = [x.to(device) for x in batch]

        # 그레이디언트 초기화
        optimizer.zero_grad()

        # 모델의 결과를 얻음
        outputs = model(input_ids=inputs, attention_mask=masks, labels=inputs)
        loss = outputs.loss

        # 역전파 수행
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    # 에포크마다 평균 손실을 출력
    print(f"Epoch {epoch + 1}/{epochs} completed. Average loss: {total_loss / len(train_loader)}")

100%|██████████| 3358/3358 [21:56<00:00,  2.55it/s]


Epoch 1/5 completed. Average loss: 0.10562355213528449


100%|██████████| 3358/3358 [21:55<00:00,  2.55it/s]


Epoch 2/5 completed. Average loss: 0.07641309816246057


  1%|▏         | 45/3358 [00:17<21:43,  2.54it/s]


KeyboardInterrupt: 

In [10]:
model.eval()
total_eval_loss = 0
for batch in tqdm(validation_loader):  # 검증 데이터 로더
    inputs, masks = [x.to(device) for x in batch]

    with torch.no_grad():
        outputs = model(input_ids=inputs, attention_mask=masks, labels=inputs)
        loss = outputs.loss
        total_eval_loss += loss.item()

100%|██████████| 374/374 [00:50<00:00,  7.41it/s]


In [11]:
# 평균 검증 손실 계산
avg_val_loss = total_eval_loss / len(validation_loader)
print(f"Validation Loss: {avg_val_loss}")

Validation Loss: 0.07331350381999092


In [17]:
def generate_response(sentence, model, tokenizer, device):
    # 입력 문장 토큰화
    input_ids = tokenizer.encode(sentence + tokenizer.eos_token, return_tensors='pt')
    input_ids = input_ids.to(device)

    # 모델이 응답 생성
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=512, num_return_sequences=1)
    
    # 생성된 토큰을 문자열로 변환
    full_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # 입력 문장 이후의 부분만 반환
    # 가정: 응답은 '입력 문장 응답' 형식으로 구성됨
    response_start = full_response.find(sentence) + len(sentence)
    if response_start != -1:
        return full_response[response_start:].strip()
    else:
        return full_response

In [18]:
# 예시 문장으로 응답 생성
sentence = input()
response = generate_response(sentence, model, tokenizer, device)
print(response)

 지금 이 시간엔 짜장면 빨리 갖다 주죠?


네, 이 시간엔 빨리 갖다 드릴게요.


In [19]:
# 모델 저장
model_path = "/home/ubuntu/Project/SJ/saved_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/home/ubuntu/Project/SJ/saved_model/tokenizer_config.json',
 '/home/ubuntu/Project/SJ/saved_model/special_tokens_map.json',
 '/home/ubuntu/Project/SJ/saved_model/tokenizer.json')