Author: Jun Seob Shim

Creation Date: 29 Nov, 2023

Last Update Date: 19 Dec, 2023

# **Model Training**

In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
# load dataset
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over a text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset`` (defaults to 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )


def load_data_collator(tokenizer, mlm = False):
    """Create the language-modeling data collator used for training.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded as the collator's ``mlm`` flag; ``False`` by default.

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

# load pre-trained GPT-2 model, and fine tune
def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pre-trained GPT-2 checkpoint on a text file and save it.

    Args:
        train_file_path: Path of the training text file.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model (e.g. ``'gpt2-large'``).
        output_dir: Directory that receives the tokenizer, the model and the
            training checkpoints.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``; number of update
            steps between checkpoint saves.

    Returns:
        None. The fine-tuned model is written to ``output_dir``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer in output_dir so inference can load both the
    # tokenizer and the model from the same directory.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Written before training starts; trainer.save_model() below saves the
    # fine-tuned model to the same directory afterwards.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously this argument was accepted but never used, so the
        # caller's checkpoint interval was silently ignored.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # save model for loading later (for inference)
    trainer.save_model()

In [None]:
# model parameters
# Base checkpoint to fine-tune, and the training text file.
model_name = "gpt2-large"
train_file_path = "Sentences.txt"

# Directory for the saved tokenizer/model; the inference cells read it too.
output_dir = "gpt2-large_result"
overwrite_output_dir = False

# Values handed to train() in the fine-tuning cell.
num_train_epochs = 5.0
per_device_train_batch_size = 8
save_steps = 500

In [None]:
# model fine tuning
# Runs train() with the values defined in the 'model parameters' cell; every
# argument is passed by keyword under the same name as its variable.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,2.8579
1000,2.3537
1500,2.2353
2000,2.1562
2500,1.9059
3000,1.8122
3500,1.8112
4000,1.8235
4500,1.6247
5000,1.4955


# **Inference and Evaluation**

**If you are only loading (and not training) the model, you must still run the 'model parameters' cell in the 'Model Training' section above: it sets the GPT-2 base model size and defines output_dir, the directory the saved model is loaded from. No other cells from that section need to be run before running the inference and evaluation cells below.**

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

## **Generating text from fine tuned model**

In [None]:
# functions to load model and prepare for text generation
def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(sequence, max_length):
    """Print a beam-search continuation of ``sequence`` from the saved model.

    The model and tokenizer are loaded from the global ``output_dir`` on
    every call. No sampling options are passed to ``generate``.

    Args:
        sequence: Prompt; it is formatted into a string before encoding.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        None. The decoded first output sequence is printed.
    """
    checkpoint_dir = output_dir
    lm = load_model(checkpoint_dir)
    tok = load_tokenizer(checkpoint_dir)

    prompt_ids = tok.encode(f'{sequence}', return_tensors='pt')

    generation_options = {
        "max_length": max_length,
        "num_beams": 5,
        "no_repeat_ngram_size": 5,
        # The EOS id is supplied as the padding id.
        "pad_token_id": lm.config.eos_token_id,
        "early_stopping": True,
    }
    sequences = lm.generate(prompt_ids, **generation_options)

    decoded = tok.decode(sequences[0], skip_special_tokens=True)
    print(decoded)

In [None]:
# Token-length limit passed as max_length to generate() by the generation
# functions in this notebook.
max_length = 500

# Load the saved model and tokenizer from output_dir (set in the
# 'model parameters' cell).
model_path = output_dir
model = load_model(model_path)
tokenizer = load_tokenizer(model_path)

# function to generate output based on string input (fine tuned model)
def finetune_generate(sequence):
    """Return the fine-tuned model's beam-search continuation of ``sequence``.

    Uses the module-level ``model``, ``tokenizer`` and ``max_length``.

    Args:
        sequence: Prompt string to encode and continue.

    Returns:
        The first generated sequence decoded to text, special tokens skipped.
    """
    prompt_ids = tokenizer.encode(sequence, return_tensors='pt')
    generation_options = {
        "max_length": max_length,
        "num_beams": 5,
        "no_repeat_ngram_size": 5,
        # The EOS id is supplied as the padding id.
        "pad_token_id": model.config.eos_token_id,
        "early_stopping": True,
    }
    sequences = model.generate(prompt_ids, **generation_options)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

In [None]:
# Read the prompts into a list, one prompt per line of the file.
# Currently set to 'short_prompts.txt'; change to 'output.txt' for the longer
# prompts curated with prompt engineering (see the BERT prompt-engineering
# notebook). readlines() keeps each line's trailing newline, so prompts are
# passed to the model with it, exactly as before.
with open('short_prompts.txt', 'r') as prompts_file:
    prompts_list = prompts_file.readlines()

# Write one generated output per line. The with-block closes (and flushes)
# the file even if generation raises part-way through the prompt list.
with open('generated_finetuned.txt', 'w') as generated_finetune:
    for item in prompts_list:
        finetuned_text = finetune_generate(item)
        # remove newlines inside the output, then add a space after each
        # sentence-ending punctuation mark
        spaced_finetuned = finetuned_text.replace("\n", "").replace(".", ". ").replace("?", "? ").replace("!", "! ")
        # add newline to separate each output
        generated_finetune.write(spaced_finetuned + "\n")

## **Generating text from standard base GPT-2 model**

In [None]:
# Use the same pretrained model size that was fine-tuned above
# (currently set to 'gpt2-large').
gpt_size = model_name

standard_tokenizer = GPT2Tokenizer.from_pretrained(gpt_size)

# The tokenizer's EOS id is supplied as the model's padding id.
standard_model = GPT2LMHeadModel.from_pretrained(
    gpt_size,
    pad_token_id=standard_tokenizer.eos_token_id,
)

# Decodes the EOS id; the result is neither assigned nor the last expression
# in the cell, so nothing is stored or displayed.
standard_tokenizer.decode(standard_tokenizer.eos_token_id)

def standard_generate(sequence):
    """Return the base model's beam-search continuation of ``sequence``.

    Uses the module-level ``standard_model``, ``standard_tokenizer`` and
    ``max_length``.

    Args:
        sequence: Prompt string to encode and continue.

    Returns:
        The first generated sequence decoded to text, special tokens skipped.
    """
    prompt_ids = standard_tokenizer.encode(sequence, return_tensors='pt')
    sequences = standard_model.generate(
        prompt_ids,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=5,
        early_stopping=True,
    )
    return standard_tokenizer.decode(sequences[0], skip_special_tokens=True)

In [None]:
# Save the base-model outputs in a separate file for comparison, one output
# per line. The with-block closes (and flushes) the file even if generation
# raises part-way through the prompt list.
with open('generated_base.txt', 'w') as generated_base:
    for item in prompts_list:
        base_text = standard_generate(item)
        # remove newlines inside the output, then add a space after each
        # sentence-ending punctuation mark
        spaced_base = base_text.replace("\n", "").replace(".", ". ").replace("?", "? ").replace("!", "! ")
        # add newline to separate each output
        generated_base.write(spaced_base + "\n")