In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the training data and model paths
# used in later cells live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install datasets

In [None]:
import pandas as pd
import numpy as np

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast

def load_dataset(file_path, tokenizer, block_size = 128):
    """Wrap a text file in a `TextDataset` for language-model training.

    Args:
        file_path: Path of the text file handed to `TextDataset`.
        tokenizer: Tokenizer handed to `TextDataset`.
        block_size: Block size handed to `TextDataset` (defaults to 128).

    Returns:
        The constructed `TextDataset` instance.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def load_data_collator(tokenizer, mlm = False):
    """Create the `DataCollatorForLanguageModeling` used to batch samples.

    Args:
        tokenizer: Tokenizer forwarded to the collator.
        mlm: Forwarded as the collator's `mlm` flag; defaults to False.

    Returns:
        The constructed `DataCollatorForLanguageModeling` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a GPT-2 language model on a plain-text file and save it.

    The tokenizer and the pretrained model are written to `output_dir`
    before training starts; `trainer.save_model()` is called once training
    finishes.

    Args:
        train_file_path: Text file passed to `load_dataset`.
        model_name: Name or path given to the tokenizer's and model's
            `from_pretrained`.
        output_dir: Directory for the tokenizer, the model and the
            `TrainingArguments` output.
        overwrite_output_dir: Forwarded to `TrainingArguments`.
        per_device_train_batch_size: Per-device batch size used for training.
        num_train_epochs: Number of training epochs.
        save_steps: Forwarded to `TrainingArguments` as `save_steps`.
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        model_name,
        bos_token = '</s>', eos_token = '</s>', unk_token = '<unk>',
        pad_token = '<pad>', mask_token = '<mask>',
    )
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Persist the tokenizer and the starting weights next to the checkpoints
    # so output_dir can be loaded with from_pretrained on its own.
    tokenizer.save_pretrained(output_dir, legacy_format = False)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir = output_dir,
        overwrite_output_dir = overwrite_output_dir,
        # The caller's batch size is the *training* batch size; no evaluation
        # dataset is given to the Trainer, so an eval batch size is not set.
        per_device_train_batch_size = per_device_train_batch_size,
        num_train_epochs = num_train_epochs,
        # Honour the caller's save_steps instead of silently dropping it.
        save_steps = save_steps,
    )

    trainer = Trainer(
        model = model,
        args = training_args,
        data_collator = data_collator,
        train_dataset = train_dataset,
    )

    trainer.train()
    trainer.save_model()

# --- Fine-tuning configuration (all paths live on the mounted Drive) ---
train_file_path = '/content/drive/MyDrive/Colab Notebooks/팀프로젝트/빅데이터 지능형 서비스과정(최종프로젝트)/KoGPT2_FineTunning/Branch(여행_통합)/Data/df_trip.txt'
output_dir = '/content/drive/MyDrive/Colab Notebooks/팀프로젝트/빅데이터 지능형 서비스과정(최종프로젝트)/KoGPT2_FineTunning/Branch(여행_통합)/Model'
model_name = 'skt/kogpt2-base-v2'

# --- Trainer hyper-parameters ---
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 500

# Kick off fine-tuning with the settings above.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
from tqdm.notebook import tqdm

def load_model(model_path):
  """Return the `GPT2LMHeadModel` loaded from `model_path`."""
  return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
  """Return the `PreTrainedTokenizerFast` loaded from `tokenizer_path`."""
  return PreTrainedTokenizerFast.from_pretrained(tokenizer_path)

def generate_text(sequence, max_length):
  """Sample a continuation of `sequence` from the fine-tuned model.

  The model and tokenizer are loaded from the fine-tuned model directory on
  every call. The prompt fed to the model is `sequence` followed by a comma.

  Args:
    sequence: Prompt text.
    max_length: Forwarded to `model.generate` as `max_length`.

  Returns:
    The first generated sequence decoded to text with special tokens
    skipped (the decoded text includes the prompt tokens).
  """
  model_path = '/content/drive/MyDrive/Colab Notebooks/팀프로젝트/빅데이터 지능형 서비스과정(최종프로젝트)/KoGPT2_FineTunning/Branch(여행_통합)/Model'
  model = load_model(model_path)
  tokenizer = load_tokenizer(model_path)
  ids = tokenizer.encode(f'{sequence},', return_tensors = 'pt')
  final_outputs = model.generate(
      ids,
      do_sample = True,
      max_length = max_length,
      pad_token_id = model.config.pad_token_id,
      # Must be spelled `top_k`: a misspelled keyword is not a sampling
      # option, so the top-k limit would not be applied.
      top_k = 5,
      top_p = 0.90,
      no_repeat_ngram_size = 3,
      repetition_penalty = 2.0,
  )
  return tokenizer.decode(final_outputs[0], skip_special_tokens = True)



In [None]:
# sequence = '고기 고기가 올라가 있는 피자 한 판'
# max_length = 64
# sentence_list = []
# # print('input : ' + sequence + ' ' + refer)
# # for i in tqdm(range(5)):
# sentence_list.append(generate_text(sequence, max_length))
# sentence = generate_text(sequence, max_length)
# ch_sentence = sentence.split(sequence + ', ')[1:]
# print(f'입력 값 : {sequence}')
# ch_sentence = ch_sentence[0].replace('\n', ' ')
# ch_sentence = ch_sentence.replace('.', '. ')
# ch_sentence = ch_sentence.replace('"', '')
# ch_sentence = ch_sentence.replace('<unk>', '')
# ch_sentence = ch_sentence.replace('?', '? ')
# ch_sentence = ch_sentence.replace('!', '! ')
# ch_sentence = ch_sentence.replace('  ', ' ')
# ch_sentence = ch_sentence.replace('  ', ' ')
# ch_sentence = ch_sentence.replace('  ', ' ')
# ch_sentence = ch_sentence.replace('다. ', '다.\n')
# print(ch_sentence)

In [None]:
!pip install git+https://github.com/ssut/py-hanspell.git

In [None]:
from hanspell import spell_checker

In [None]:
def spell_check(sequence):
    """Run `sequence` through `spell_checker.check` and return the `checked` text."""
    return spell_checker.check(sequence).checked

def result_sequence(sequence, max_length):
    """Generate text for a prompt and return its complete, spell-checked sentences.

    Steps: generate text with `generate_text`, drop the echoed prompt,
    spell-check the continuation, collapse double spaces, and keep only the
    sentences that end with ". " (the trailing fragment after the last
    ". " is discarded).

    Args:
        sequence: Prompt text passed to `generate_text`.
        max_length: Forwarded to `generate_text`.

    Returns:
        The kept sentences joined by single spaces; an empty string when the
        spell-checked text contains no ". " at all.
    """
    generated = generate_text(sequence, max_length)

    # The prompt is echoed back as "<sequence>, " at the start of the output.
    pieces = generated.split(f'{sequence}, ')
    if len(pieces) > 1:
        continuation = pieces[1]
    else:
        # The decoded text did not contain "<sequence>, " (e.g. no space was
        # produced after the comma). Strip the bare prompt if it is there
        # instead of failing with IndexError.
        continuation = generated
        prompt = f'{sequence},'
        if continuation.startswith(prompt):
            continuation = continuation[len(prompt):]
        continuation = continuation.lstrip()

    checked = spell_check(continuation).replace('  ', ' ')

    # Doubling the period before splitting on ". " lets every kept sentence
    # retain its own final "."; [:-1] drops the unfinished tail.
    sentences = checked.replace('. ', '.. ').split('. ')[:-1]

    # A second spell_check pass used to run here, but its result was never
    # returned; it is omitted so no extra spell-check request is made.
    return ' '.join(sentences)

In [None]:
from tqdm.notebook import tqdm

# Prompt fed to the fine-tuned model.
sentence = '고기 고기가 올라가 있는 피자 한 판'

# Generate two independent samples and collect the cleaned-up text of each.
sequence_list = []
for _ in tqdm(range(2)):
    sequence_list.append(result_sequence(sentence, 64))

# Join the samples into one passage; the bare name displays it as cell output.
sequence = ' '.join(sequence_list)
sequence

  0%|          | 0/2 [00:00<?, ?it/s]

loading configuration file /content/drive/MyDrive/Colab Notebooks/팀프로젝트/빅데이터 지능형 서비스과정(최종프로젝트)/KoGPT2_FineTunning/Branch(여행_통합)/Model/config.json
Model config GPT2Config {
  "_name_or_path": "skt/kogpt2-base-v2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "created_date": "2021-04-28",
  "embd_pdrop": 0.1,
  "eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "license": "CC-BY-NC-SA 4.0",
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 3,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights"

'그리고 마지막으로 저녁을 먹을 식당은 어디쯤에 있는지 맥주 마시기 좋은 바는 또 어떤지 눈여겨 봐두고 숙소에서 가장 가까운 슈퍼와 인상 깊은 주인이 과일 주스를 갈아주는 노점도 익혀둔다. 그리고 다른 종류의 소스인 유부 가락국수. 고등어라고 하면 한국에서도 최근 들어 조금씩 대중화되고는 있지만 여전히 어른들이 가지고 노는 비싼 장난감 정도로 치부를 하고 있다. 가끔은 너무 비싸므로 사지 않는 것이 나을지도 모른다.'