In [2]:
!pip install transformers



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [5]:
!pip install torch



In [6]:
import pandas as pd
from konlpy.tag import Okt
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch
from tqdm import tqdm

In [7]:
df = pd.read_excel('/content/drive/MyDrive/Project/Data3.xlsx', engine='openpyxl')
df['개체명'] = df['개체명'].fillna('NaN')

In [8]:
# 형태소 분석기 인스턴스 생성
okt = Okt()

# 정제와 토큰화를 위한 함수 정의
def clean_and_tokenize(sentence):
    # 정제: 불필요한 공백, 특수 문자 제거
    sentence = re.sub(r'[^가-힣\s]', '', sentence)
    sentence = sentence.strip()

    # 토큰화: 형태소 분석기를 사용하여 토큰화
    tokens = okt.morphs(sentence)
    return tokens

In [9]:
# 토크나이저 로드
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [10]:
tokenizer.pad_token = tokenizer.eos_token

encoded_data = []

for index, row in df.iterrows():
    # 질문과 대답을 인코딩
    encoding = tokenizer.encode_plus(row['SENTENCE'], str(row['개체명']), max_length=512, truncation=True, padding='max_length')

    # 'input_ids'와 'attention_mask'를 입력으로 사용
    input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']

    # '개체명'이 포함된 부분을 레이블로 사용
    labels = [-100 if i==0 else input_id for i, input_id in zip(attention_mask, input_ids)]

    # 인코딩된 데이터를 리스트에 추가
    encoded_data.append((input_ids, labels))

In [11]:
# 데이터셋 분할 및 DataLoader 설정
train_inputs, val_inputs, train_labels, val_labels = train_test_split([item[0] for item in encoded_data], [item[1] for item in encoded_data], test_size=0.2)
train_encodings = {'input_ids': train_inputs, 'labels': train_labels}
val_encodings = {'input_ids': val_inputs, 'labels': val_labels}

In [12]:
# GPT-2 모델 로드
model = GPT2LMHeadModel.from_pretrained('gpt2')

# 옵티마이저 설정
optimizer = AdamW(model.parameters(), lr=5e-5)

# 스케줄러 설정 (선택적)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=-1)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [13]:
class GPT2Dataset(Dataset):
    def __init__(self, encodings):
        self.input_ids = encodings['input_ids']
        self.labels = encodings['labels']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
                'labels': torch.tensor(self.labels[idx], dtype=torch.long)}

# 데이터셋 인스턴스 생성
train_dataset = GPT2Dataset(train_encodings)
val_dataset = GPT2Dataset(val_encodings)

In [14]:
# 패딩을 위한 함수 정의
def pad_collate(batch):
    # 입력과 레이블의 최대 길이를 결정합니다.
    max_length = max(max(len(item['input_ids']) for item in batch),
                     max(len(item['labels']) for item in batch))

    # 입력과 레이블을 max_length에 맞추어 패딩합니다.
    input_ids = pad_sequence([item['input_ids'] for item in batch],
                             batch_first=True, padding_value=0)
    labels = pad_sequence([item['labels'] for item in batch],
                          batch_first=True, padding_value=-100)

    # 원하는 길이로 잘라내지 않고, 패딩을 추가하여 모든 시퀀스를 같은 길이로 만듭니다.
    if max_length > input_ids.size(1):
        input_ids = F.pad(input_ids, (0, max_length - input_ids.size(1)))
    if max_length > labels.size(1):
        labels = F.pad(labels, (0, max_length - labels.size(1)), value=-100)

    return {'input_ids': input_ids, 'labels': labels}

# DataLoader에 pad_collate 함수 적용
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=pad_collate)


In [15]:
print(len(train_dataset), len(val_dataset))
print(len(train_dataset[0]['input_ids']), len(train_dataset[0]['labels']))


12580 3146
512 512


In [16]:
# GPU 사용 설정
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# 학습 루프
epochs = 2

for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_loader):
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # 스케줄러 업데이트

    model.eval()
    with torch.no_grad():
        total_eval_loss = 0
        for batch in val_loader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(inputs, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(val_loader)
        print(f'Validation Loss: {avg_val_loss}')

100%|██████████| 1573/1573 [30:12<00:00,  1.15s/it]


Validation Loss: 1.2774562920410621


 19%|█▉        | 304/1573 [05:50<24:23,  1.15s/it]


OutOfMemoryError: ignored