In [2]:

from google.colab import drive
drive.mount('/content/drive')

!pip install transformers
# !pip install wandb
# import wafndb
# wandb.login()

# Optional: log both gradients and parameters
# %env WANDB_WATCH=all


import ipywidgets
from IPython import display
import os

import torch

from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import GPT2Model, GPT2Config
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import Trainer, TrainingArguments

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the collator used for fine-tuning.

    Args:
        train_path: Path of the text file used for training.
        test_path: Path of the text file used for evaluation.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: ``block_size`` forwarded to each ``TextDataset``.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    def _build_text_dataset(file_path):
        # Both splits share the tokenizer and block size; only the file differs.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )

    train_dataset = _build_text_dataset(train_path)
    test_dataset = _build_text_dataset(test_path)

    # mlm=False turns off masked-language-model masking in the collator.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator


display.clear_output()
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9

if gpu_info.find('failed') >= 0 and ram_gb < 30:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')
else:
  try:
      if gpu_info.find('failed') < 0:
         print(gpu_info)
  except:
    display.clear_output()
  finally:
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
    print('You are using a high-RAM runtime!')


Your runtime has 38.0 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
# --- Configuration -----------------------------------------------------------
parent_directory = '/content/drive/My Drive/CSCI 470 Project/Epic/'
# NOTE(review): not referenced by the cells visible here, and it names
# "gpt2_medium" while output_path below uses "gpt2_small" -- confirm intent.
checkpoint_path = 'Models/gpt2_medium_Stephen_King/checkpoint-000'
epochs = 3
batch_size=5
# wandb.init(config={"epochs": epochs, "batch_size": batch_size})

# --- Tokenizer and model -----------------------------------------------------
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# Load the pretrained 'gpt2' model, using the tokenizer's EOS token id as the
# pad token id. (Previously a randomly initialised GPT2Model was built first
# and then immediately replaced by this one.)
model = AutoModelWithLMHead.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)
# Accessing the model configuration
configuration = model.config
# print(configuration)


# tokenizer = AutoTokenizer.from_pretrained("pranavpsv/gpt2-genre-story-generator")

# model = AutoModelWithLMHead.from_pretrained("pranavpsv/gpt2-genre-story-generator")
# # Example of usage
# from transformers import pipeline

# story_gen = pipeline("text-generation", "pranavpsv/gpt2-genre-story-generator")
# print(story_gen("<BOS> <superhero> Batman"))

# --- Data and output locations -----------------------------------------------
train_path = parent_directory + 'Data/Cleaned_UTF8/merged_Stephen_King_train.txt'
test_path = parent_directory + 'Data/Cleaned_UTF8/merged_Stephen_King_test.txt'
output_path = parent_directory + 'Models/gpt2_small_Stephen_King'

train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)
training_args = TrainingArguments(
  run_name="gpt2-small-generator_initial_Stephen_King_texts",
  output_dir=output_path, #The output directory
  overwrite_output_dir=True, #overwrite the content of the output directory, set to true if continuing training
  num_train_epochs=epochs, # number of training epochs
  per_device_train_batch_size=batch_size, # batch size for training
  # Half the training batch size, but never 0 (int(batch_size/2) is 0 for batch_size=1).
  per_device_eval_batch_size=max(1, batch_size // 2),  # batch size for evaluation
  eval_steps = 100, # Number of update steps between two evaluations.
  save_steps = 200, # after # steps model is saved
  warmup_steps=200,# number of warmup steps for learning rate scheduler
  evaluation_strategy="steps",
  logging_steps = 200,
#   logging_dir=parent_directory+"Models/gpt2_small_Stephen_King/logs",
  do_predict=True
  )

trainer = Trainer(
  model=model,
  args=training_args,
  data_collator=data_collator,
  train_dataset=train_dataset,
  eval_dataset=test_dataset
)

# --- Train -------------------------------------------------------------------
try:
  display.clear_output()
  print("Train Path:\t", train_path)
  print("Test Path:\t", test_path)
  print("Output Directory Path:\t", output_path)
  trainer.train()

finally:
  # Runs whether training finished, failed, or was interrupted.
  print("Exiting Training")
  try:
    trainer.save_model()
  finally:
    # Flush and unmount Drive even if saving the model raised.
    drive.flush_and_unmount()
    print('All changes made in this colab session should now be visible in Drive.')


Train Path:	 /content/drive/My Drive/CSCI 470 Project/Epic/Data/Cleaned_UTF8/merged_Stephen_King_train.txt
Test Path:	 /content/drive/My Drive/CSCI 470 Project/Epic/Data/Cleaned_UTF8/merged_Stephen_King_test.txt
Output Directory Path:	 /content/drive/My Drive/CSCI 470 Project/Epic/Models/gpt2_small_Stephen_King


Step,Training Loss,Validation Loss


In [None]:

from transformers import pipeline

# The training cell ends with drive.flush_and_unmount(), so Drive has to be
# mounted again before the saved model can be read from output_path.
drive.mount('/content/drive')

# NOTE(review): "checkpoint-000" looks like a placeholder -- point this at a
# checkpoint directory that actually exists under output_path.
generator = pipeline('text-generation', model=output_path+"/checkpoint-000", tokenizer=tokenizer)

# max_length is a generation setting, so it is passed on the call; it was
# previously passed as a dict through pipeline()'s `config` argument, which
# expects a model configuration rather than generation settings.
result = generator('Input Prompt', max_length=800)[0]['generated_text']