In [None]:
# Install the notebook's dependencies (each `!` line runs as a shell command).
# transformers is taken from the GitHub `main` branch rather than a tagged release.
!pip install git+https://github.com/huggingface/transformers@main
!pip install accelerate
# NOTE(review): presumably the source of the `shared_code` package imported by this notebook -- confirm.
!pip install git+https://github.com/AJStangl/gpt-model-finetuning@master
# simpletransformers is pinned to an exact version.
!pip install  simpletransformers==0.63.3

In [None]:
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer
import gc
import os
import logging
import pandas
import torch
import gc
from shared_code.fine_tuning.datasets.reddit_dataset import RedditDataset
from shared_code.fine_tuning.tensor_encoding.tensor_encoding import TokenizerAdapter
from shared_code.fine_tuning.datasets.reddit_dataset import RedditDataset
import pandas
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer
import logging
import pandas
from simpletransformers.language_modeling import LanguageModelingModel

In [None]:
# Run-mode switches read by the cells of this notebook.

# NOTE(review): not referenced by any cell visible in this notebook -- confirm
# whether it is still needed.
write_text_file = False

# True  -> fine-tune GPT2LMHeadModel with the Hugging Face Trainer.
# False -> fine-tune through simpletransformers using train.txt / eval.txt.
use_head_model = False

In [None]:
# Name of the fine-tuned model; doubles as the name of its output sub-folder.
model_name = "yuli-bot"

# Root folder for inputs and outputs.
# NOTE(review): the path looks like a Colab Google Drive mount -- confirm.
parent_directory = "/content/drive/MyDrive/RawData"

# The training CSV sits directly under the root folder.
training_data_path = parent_directory + "/training.csv"

# Model artifacts go to <root>/<model_name>; the tokenizer path is the same folder.
model_output_dir = parent_directory + "/" + model_name
tokenizer_path = model_output_dir

In [None]:
# GPT-2 medium tokenizer with explicit BOS / EOS / PAD special tokens.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2-medium',
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)

# Adapter used when filtering samples by token length. It is built once here;
# rebuilding it with the same tokenizer inside the branch below was redundant.
tokenizer_adapter = TokenizerAdapter(tokenizer)

# `model` stays None unless the head-model path is selected.
model = None
if use_head_model:
    # Persist the tokenizer (including the added special tokens) next to the
    # model output, then load the base model onto the GPU.
    tokenizer.save_pretrained(model_output_dir)
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium').cuda()

In [None]:
def has_valid_line(input: str) -> bool:
    """Tell whether a training string is free of black-listed phrases.

    Args:
        input: The text to inspect.

    Returns:
        False as soon as any black-listed phrase occurs in ``input`` (a notice
        naming the phrase is printed); True when none of them occurs.
    """
    black_list = ["**NO SIGN**", "**Image Stats:**", "**INCOMPLETE MEAT TUBE**", "[removed]", "[deleted]", 'Unfortunately, your post was removed for the following reason(s)']
    for phrase in black_list:
        if phrase in input:
            print(f":: Line contains word {phrase}... Skipping")
            return False
    # Only accept the text after every phrase has been checked; returning
    # inside the loop would stop after the first black-list entry.
    return True

In [None]:
# Read the training CSV and take every value of its 'TrainingString' column.
df = pandas.read_csv(training_data_path)
conversations = list(df['TrainingString'])

# Keep a conversation only if it passes the adapter's token-length check and,
# after that, the black-list check (same short-circuit order as before).
valid_lines = [
    text
    for text in conversations
    if tokenizer_adapter.token_length_appropriate(text) and has_valid_line(text)
]

In [None]:
def _format_training_line(text):
    """Render one sample as a single line of the plain-text training file.

    repr() escapes newlines and other control characters so the sample stays
    on one physical line; [1:-1] strips the quotes repr() wraps around it.
    """
    return repr(text)[1:-1] + "<|endoftext|>" + "\n"


# Seeded generator so the random train/eval split is repeatable.
generator = torch.Generator()
generator.manual_seed(0)

logging.info(f":: Total Number Of Samples {len(valid_lines)}")

if use_head_model and model:
    # Head-model path: build a RedditDataset sized to the longest encoded sample.
    max_length = max(len(tokenizer.encode(prompt)) for prompt in valid_lines)

    # The tokenizer was created with extra special tokens, so the embedding
    # matrix is resized to the tokenizer's length.
    model.resize_token_embeddings(len(tokenizer))

    logging.info(f":: Max Length Of Sample {max_length}")

    dataset = RedditDataset(valid_lines, tokenizer, max_length=max_length)

    # 90% train / remainder eval.
    train_size = int(0.9 * len(dataset))
    eval_size = len(dataset) - train_size
    train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size], generator=generator)

else:
    # File path: split the raw strings 90/10 and write them to train.txt / eval.txt.
    train_size = int(0.9 * len(valid_lines))
    eval_size = len(valid_lines) - train_size
    train_dataset, eval_dataset = random_split(list(valid_lines), [train_size, eval_size], generator=generator)

    with open("train.txt", 'w', encoding="utf-8") as train_out, open("eval.txt", "w", encoding="utf-8") as eval_out:
        train_out.writelines(map(_format_training_line, train_dataset))
        eval_out.writelines(map(_format_training_line, eval_dataset))

In [None]:
# Free memory before training: run a Python garbage-collection pass, then ask
# PyTorch to release the unused memory held by its CUDA caching allocator.
gc.collect()
torch.cuda.empty_cache()

In [None]:
if use_head_model:
    # Hugging Face Trainer configuration. The values are passed to the
    # constructor (instead of being assigned afterwards) so that
    # TrainingArguments' own initialisation sees and validates them.
    training_args = TrainingArguments(
        output_dir=model_output_dir,
        num_train_epochs=5,
        logging_steps=100,
        save_steps=1000,
        weight_decay=0.05,
        logging_dir='./logs',
        fp16=True,
        auto_find_batch_size=True,
        gradient_accumulation_steps=50,
        learning_rate=1e-4,
    )
else:
    # simpletransformers configuration for the plain-text (train.txt / eval.txt) path.
    training_args = {
        "overwrite_output_dir": True,
        "learning_rate": 1e-4,
        "gradient_accumulation_steps": 100,
        "dataset_type": "simple",
        "sliding_window": True,
        "max_seq_length": 1024,
        "mlm": False, # has to be false for gpt-2
        "evaluate_during_training": True,
        "use_cached_eval_features": True,
        "evaluate_during_training_verbose": True,
        "save_optimizer_and_scheduler": False,
        "save_eval_checkpoints": True,
        "save_model_every_epoch": False,
        "save_steps": -1,
        "train_batch_size":3,
        "num_train_epochs":12,
        "output_dir": f"{model_output_dir}/",
        "best_model_dir": f"{model_output_dir}/best_model",
    }
    # Training belongs to this branch only: run unconditionally it would
    # replace the GPT2LMHeadModel held in `model` on the head-model path and
    # try to read train.txt / eval.txt, which that path never writes.
    model = LanguageModelingModel("gpt2", "gpt2-medium", args=training_args)
    model.train_model(train_file="train.txt", eval_file="eval.txt", args=training_args, verbose=True)

In [None]:
if use_head_model:
    def _collate_batch(data):
        """Stack a list of samples into the batch dict handed to the model.

        Element 0 of each sample is stacked as 'input_ids' and again as
        'labels'; element 1 is stacked as 'attention_mask'.
        """
        input_ids = torch.stack([sample[0] for sample in data])
        attention_mask = torch.stack([sample[1] for sample in data])
        labels = torch.stack([sample[0] for sample in data])
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

    # Fine-tune with the Hugging Face Trainer, then save the resulting model.
    trainer: Trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=_collate_batch,
    )
    trainer.train()
    trainer.save_model()

In [None]:
import re

# Pattern for "<|...|>" style tags. A raw string yields exactly the same
# pattern text as before without relying on invalid escape sequences.
tag_match = r"\<\|(.*)\|\>"
pattern = re.compile(tag_match)
expected_prompt = "I love these bizarro eras of weirdness in AI development after it becomes possible to do but before it's perfected. Image synthesis itself was in the same place for several years back in the mid to late 2010s."

prompt = "<|soss r/dalle2|><|sot|>Detailed scientific diagram depicting the anatomy of a tomato, full colour, realistic<|sost|>https://i.imgur.com/7adBOXn.jpg<|sor u/AsterJ|>It's going to be sad day when it learns to properly spell.  I feel like this era is a fleeting moment in AI history.  We must cherish it.<|eor|><|sor"

# On the head-model path `model` is the Hugging Face model itself. On the
# simpletransformers path it is a LanguageModelingModel wrapper, which has no
# .generate(); the underlying model and its matching tokenizer are unwrapped.
if use_head_model:
    generation_model, generation_tokenizer = model, tokenizer
else:
    generation_model, generation_tokenizer = model.model, model.tokenizer

# NOTE(review): the "<|startoftext|>" prefix is only a registered special token
# for the notebook-level tokenizer -- confirm it is wanted on the
# simpletransformers path, whose training file does not add it.
generated = generation_tokenizer(f"<|startoftext|> {prompt}", return_tensors="pt")

# Sample ten continuations. Generation stops at the tokenizer's EOS token;
# `stop_token` is not a generate() argument, so eos_token_id is used instead.
sample_outputs = generation_model.generate(
    inputs=generated.input_ids.cuda(),
    attention_mask=generated['attention_mask'].cuda(),
    do_sample=True,
    top_k=40,
    max_length=1024,
    top_p=0.8,
    temperature=0.8,
    num_return_sequences=10,
    repetition_penalty=1.08,
    eos_token_id=generation_tokenizer.eos_token_id,
)

# Print each sample with the prompt text removed.
for i, sample_output in enumerate(sample_outputs):
    result = generation_tokenizer.decode(sample_output, skip_special_tokens=True)
    print("{}: {}".format(i, result.replace(prompt, "")))