In [None]:
# Mount Google Drive at /content/drive. The training data path and the model
# output directory used by later cells both live under this mount point.
# Requires the Colab runtime (the `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install datasets

In [None]:
import pandas as pd
import numpy as np

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast

def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` for the text file at ``file_path``.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` unchanged (default 128);
            presumably the token length of each example - confirm against
            the Transformers documentation.

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded to the collator's ``mlm`` argument; defaults to
            ``False``.

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a pretrained GPT-2 language model on a plain-text file.

    The tokenizer and the not-yet-fine-tuned model are written to
    ``output_dir`` before training starts, and ``trainer.save_model()`` is
    called once training has finished.

    Args:
        train_file_path: Path of the text file used to build the training
            dataset (via ``load_dataset``).
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            trainer output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Training batch size per device.
        num_train_epochs: Number of training epochs.
        save_steps: Forwarded to ``TrainingArguments(save_steps=...)``.
    """
    # Special tokens are set explicitly when loading the tokenizer.
    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        model_name,
        bos_token='</s>', eos_token='</s>', unk_token='<unk>',
        pad_token='<pad>', mask_token='<mask>',
    )
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The tokenizer is not handed to the Trainer, so it is saved here to make
    # output_dir loadable as both a model and a tokenizer directory.
    tokenizer.save_pretrained(output_dir, legacy_format=False)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        # Previously passed as `per_device_eval_batch_size`, which left the
        # training batch size at the library default.
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by this function but never forwarded.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

# --- Fine-tuning configuration ------------------------------------------
# Base checkpoint to start from.
model_name = 'skt/kogpt2-base-v2'

# Training corpus and output location, both on the mounted Google Drive.
train_file_path = '/content/drive/MyDrive/Colab Notebooks/팀프로젝트/빅데이터 지능형 서비스과정(최종프로젝트)/KoGPT2_FineTunning/WishBeen(여행매거진)/Data/df_trip_wishbeen.txt'
output_dir = '/content/drive/MyDrive/Colab Notebooks/팀프로젝트/빅데이터 지능형 서비스과정(최종프로젝트)/KoGPT2_FineTunning/WishBeen(여행매거진)/Model'
overwrite_output_dir = False

# Training schedule.
per_device_train_batch_size = 8
num_train_epochs = 50.0
save_steps = 500

# --- Run fine-tuning -----------------------------------------------------
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel
from tqdm.notebook import tqdm

def load_model(model_path):
  """Load a ``GPT2LMHeadModel`` from ``model_path`` and return it."""
  return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
  """Load a ``PreTrainedTokenizerFast`` from ``tokenizer_path`` and return it."""
  return PreTrainedTokenizerFast.from_pretrained(tokenizer_path)

def generate_text(sequence, max_lenth, model_path = None):
  """Sample a continuation of ``sequence`` from the fine-tuned model.

  The model and tokenizer are loaded from ``model_path`` on every call.
  The prompt is encoded as ``"<sequence>,"`` (with a trailing comma).

  Args:
    sequence: Prompt text.
    max_lenth: Value passed to ``model.generate`` as ``max_length``. The
      misspelled parameter name is kept so existing keyword callers
      continue to work.
    model_path: Directory holding the saved model and tokenizer. Defaults
      to the project's fine-tuned model directory on Google Drive.

  Returns:
    The decoded text of the first generated sequence. Special tokens are
    not stripped.
  """
  if model_path is None:
    model_path = '/content/drive/MyDrive/Colab Notebooks/팀프로젝트/빅데이터 지능형 서비스과정(최종프로젝트)/KoGPT2_FineTunning/WishBeen(여행매거진)/Model'
  model = load_model(model_path)
  tokenizer = load_tokenizer(model_path)
  ids = tokenizer.encode(f'{sequence},', return_tensors = 'pt')
  final_outputs = model.generate(
      ids,
      do_sample = True,
      # Use the argument; the body previously read a global `max_length`,
      # so the parameter was silently ignored.
      max_length = max_lenth,
      pad_token_id = model.config.pad_token_id,
      # Was misspelled `tok_k`, so top-k filtering was never configured.
      top_k = 5,
      top_p = 0.90,
      repetition_penalty = 2.0,
  )
  return tokenizer.decode(final_outputs[0])

# Prompt and generation length for the sample run.
sequence = '코끼리 앞에서 남자가 사진을 찍고있다.'
max_length = 128

# Generate once and reuse the result. Each generate_text call reloads the
# model and tokenizer, so calling it a second time just to fill `sentence`
# doubled the load and generation cost.
sentence = generate_text(sequence, max_length)
sentence_list = [sentence]

In [None]:
print(f'입력 값 : {sequence}')

# The prompt was encoded as "<sequence>,", so the decoded text is expected to
# contain "<sequence>, " before the generated part. Keep the text that follows
# its first occurrence; if the prefix is missing, fall back to the whole text
# instead of raising IndexError.
segments = sentence.split(sequence + ', ')
ch_sentence = segments[1] if len(segments) > 1 else sentence

# Applied in this exact order: flatten newlines, add a space after sentence
# punctuation, and drop quotes and <unk> tokens.
for old, new in (
    ('\n', ' '),
    ('.', '. '),
    ('"', ''),
    ('<unk>', ''),
    ('?', '? '),
    ('!', '! '),
):
    ch_sentence = ch_sentence.replace(old, new)

# Collapse every run of spaces to a single space (the previous three fixed
# passes left doubles behind for runs longer than eight spaces).
while '  ' in ch_sentence:
    ch_sentence = ch_sentence.replace('  ', ' ')

# Start a new line after each sentence ending in '다.'.
ch_sentence = ch_sentence.replace('다. ', '다.\n')
print(ch_sentence)

입력 값 : 코끼리 앞에서 남자가 사진을 찍고있다.
Miss India라는 노래. 원래 이 땅에는 수많은 소수민족이 모여 살고 있다.
마쭈와 윈난성 (Myanmarang) 두 개의 나라 수단은 한때 아프리카에서 가장 강성했던 곳이었다.
그런데 그 힘은 이리도 독보적이었던 것. 오랜 내전 끝에 2011년 말에 패전드가 됐고, 남수단이 독립하면서두만 남게 되었다.
그때 베이루트가 그토록 바라던 민족이 다시 뭉친 것이다.
여기는 난민과 혼자인 모습이 너무나 똑같기 때문이다.
배낭을 메는 여행자들이 넘나들썩한 옷깃의 끈을 붙잡고서라도 자신이 살던 곳을 그리려고 잔뜩
