In [None]:
%%capture
!git clone https://github.com/Xirider/finetune-gpt2xl.git
!chmod -R 777 finetune-gpt2xl/
!pip install transformers
!pip install wandb
!pip install transformers[deepspeed]
!pip install datasets

In [None]:
import random
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, AutoModelForCausalLM
from tqdm import tqdm
import csv
import os

In [None]:
# Optional: set your Weights & Biases API key below to enable wandb logging (do not commit the key)
# os.environ["WANDB_API_KEY"] = ""

In [None]:
def sort_files_by_name(target_dir):
    """Return the "raw_training" files of ``target_dir``, highest numeric suffix first.

    A file is selected when its name contains ``"raw_training"``. The sort key is
    the number found between the last ``_`` of the name and ``.txt``
    (``raw_training_12.txt`` -> ``12.0``), compared as a float in descending order.

    Args:
        target_dir: Directory to scan (not recursive).

    Returns:
        A list of paths, each ``target_dir`` joined with a selected file name.
        Empty when no file name matches.

    Raises:
        ValueError: If a selected file name has no numeric suffix before ``.txt``.
    """
    def _numeric_suffix(file_name):
        # "raw_training_2.5.txt" -> "2.5.txt" -> "2.5"
        suffix = file_name.split('_')[-1].split('.txt')[0]
        try:
            return float(suffix)
        except ValueError as exc:
            # Same exception type as a bare float() failure, but naming the file.
            raise ValueError(
                f"Cannot sort {file_name!r}: expected a numeric suffix before '.txt', got {suffix!r}"
            ) from exc

    candidates = [name for name in os.listdir(target_dir) if "raw_training" in name]
    ordered = sorted(candidates, key=_numeric_suffix, reverse=True)
    return [os.path.join(target_dir, name) for name in ordered]

In [None]:
# Directory used as --output_dir by the training command and read back later
# to load the tokenizer and model for generation.
out_model = "" # Adjust this path to reflect the directory where you want to save your model
# Stop here while the placeholder above is still empty.
assert out_model != "", "Please set the out_model variable to a valid path"

In [None]:
# Adjust this path to reflect the directory that holds your raw_training_*.txt files
target_dir = "/content/drive"

# Paths of every raw training file, in the order produced by sort_files_by_name
target_file = sort_files_by_name(target_dir)

print(target_file)

# Every line of every training file, file after file
d_lines = []
for t_file in target_file:
    with open(t_file, 'rb') as f:
        # The files are read as bytes and each line is decoded with the
        # 'unicode_escape' codec, which turns literal backslash escapes into
        # the characters they stand for.
        # NOTE(review): that codec presumably reads non-ASCII bytes as Latin-1,
        # so UTF-8 text outside ASCII may come out garbled — confirm against the data.
        d_lines.extend(raw_line.decode('unicode_escape') for raw_line in f)

In [None]:
__tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Samples that encode to more tokens than this are dropped.
MAX_TOKENS = 1024
END_OF_TEXT = "<|endoftext|>"

# Process the longest raw lines first.
sorted_lines = sorted(d_lines, key=len, reverse=True)
data_lines = []      # cleaned text samples that passed every filter
encoded_tokens = []  # token ids of each kept sample, parallel to data_lines
for line in tqdm(sorted_lines):
    # Keep only the text before the first end-of-text marker, then terminate
    # the sample with exactly one marker.
    line = line.strip().split(END_OF_TEXT)[0] + END_OF_TEXT
    # Skip samples containing the '[deleted]' / '[removed]' placeholders.
    if '[deleted]' in line or '[removed]' in line:
        continue
    encoded = __tokenizer.encode(line)
    if len(encoded) > MAX_TOKENS:
        continue
    data_lines.append(line)
    encoded_tokens.append(encoded)

display(data_lines[:10])

In [None]:
import torch
from torch.utils.data import random_split

# Fixed seed so the train/eval split is the same on every run.
generator = torch.Generator()
generator.manual_seed(0)

# 80% of the samples go to training, the remainder to evaluation.
train_size = int(0.8 * len(data_lines))
eval_size = len(data_lines) - train_size

# random_split assigns samples to the two subsets at random using `generator`.
# It receives a copy of the list, so the subsets do not alias data_lines.
train_dataset_file, eval_dataset_file = random_split(
    list(data_lines), [train_size, eval_size], generator=generator
)

# data_lines is deliberately left untouched: shuffling it in place (unseeded)
# would not affect the split above, but would make a re-run of this cell
# produce a different split despite the fixed seed.
assert len(train_dataset_file) + len(eval_dataset_file) == len(data_lines)

In [None]:
# Report the size of each split and their combined total
train_count = len(train_dataset_file)
eval_count = len(eval_dataset_file)
print(f"Train: {train_count}")
print(f"Eval: {eval_count}")
print(f"Total: {train_count + eval_count}")

In [None]:
# Write the training split as a one-column CSV (header "text"), the file
# passed to the training command as --train_file.
# newline='' hands control of line endings to the csv module, as its
# documentation requires, so the platform never rewrites them.
with open('/content/finetune-gpt2xl/train.csv', mode='w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=['text'])
    writer.writeheader()
    writer.writerows({'text': line} for line in train_dataset_file)

In [None]:
%%bash

# Preview the first five lines of the training CSV
head -n 5 /content/finetune-gpt2xl/train.csv

In [None]:
# Write the evaluation split as a one-column CSV (header "text"), the file
# passed to the training command as --validation_file.
# newline='' hands control of line endings to the csv module, as its
# documentation requires, so the platform never rewrites them.
with open('/content/finetune-gpt2xl/validation.csv', mode='w', encoding='utf-8', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=['text'])
    writer.writeheader()
    writer.writerows({'text': line} for line in eval_dataset_file)

In [None]:
%%bash

# Preview the first five lines of the validation CSV
head -n 5 /content/finetune-gpt2xl/validation.csv

In [None]:
!deepspeed --num_gpus=1 /content/finetune-gpt2xl/run_clm.py \
--deepspeed /content/finetune-gpt2xl/ds_config.json \
--model_name_or_path gpt2-xl \
--train_file /content/finetune-gpt2xl/train.csv \
--validation_file /content/finetune-gpt2xl/validation.csv \
--do_train \
--do_eval \
--fp16 \
--overwrite_cache \
--evaluation_strategy "steps" \
--output_dir {out_model} \
--eval_steps 500 \
--num_train_epochs 1 \
--gradient_accumulation_steps 2 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--save_total_limit=2

In [None]:
# To continue an interrupted training run, uncomment the command below and set --resume_from_checkpoint to the checkpoint directory you want to resume from (the path shown, /content/drive/MyDrive/RawData/gpt/big-bot-2/checkpoint-2000, is only an example)
# !deepspeed --num_gpus=1 /content/finetune-gpt2xl/run_clm.py \
# --deepspeed /content/finetune-gpt2xl/ds_config.json \
# --model_name_or_path gpt2-xl \
# --resume_from_checkpoint /content/drive/MyDrive/RawData/gpt/big-bot-2/checkpoint-2000 \
# --train_file /content/finetune-gpt2xl/train.csv \
# --validation_file /content/finetune-gpt2xl/validation.csv \
# --do_train \
# --do_eval \
# --fp16 \
# --overwrite_cache \
# --evaluation_strategy "steps" \
# --output_dir {out_model} \
# --eval_steps 500 \
# --num_train_epochs 1 \
# --gradient_accumulation_steps 2 \
# --per_device_train_batch_size 8 \
# --per_device_eval_batch_size 8 \
# --save_total_limit=2

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the tokenizer and the model from the training output directory.
tokenizer = GPT2Tokenizer.from_pretrained(out_model)
# Pad on the left so that every prompt in a padded batch ends at the last position.
tokenizer.padding_side = "left"
# Reuse the end-of-text token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(out_model)

# Marker tokens to register with the tokenizer as special tokens.
special_tokens_dict = {
    "bos_token": "<|startoftext|>",
    "eos_token": "<|endoftext|>",
    "additional_special_tokens": [
        "<|endoftext|>",
        "<|startoftext|>",
        "<|subreddit|>",
        "<|title|>",
        "<|text|>",
        "<|context_level|>",
        "<|comment|>"
    ]
}
# Write the loaded model back to out_model. This runs before the embedding
# resize below, so the saved copy is the model as it was loaded.
# NOTE(review): the purpose of this re-save is not evident from this cell — confirm it is needed.
model.save_pretrained(out_model)
# NOTE(review): run_clm.py is not visible here. If training did not register
# these markers as tokens, the rows added by the resize below were never
# trained — confirm.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Resize the model's token embeddings to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

print("model loaded")

In [None]:
# One input batch of 9 prompts: the three prompt prefixes, repeated three times
texts = ["<|startoftext|>", "<|startoftext|><|subreddit|>", "<|startoftext|><|subreddit|>AskReddit<|title|>"] * 3

# Tokenise all prompts as one padded batch and move the tensors to the model's device
encoding = tokenizer(texts, padding=True, return_tensors='pt').to(device)

inputs = encoding['input_ids']
attention_mask = encoding['attention_mask']

# Inference only, so gradients are not tracked
with torch.no_grad():
    generated_ids = model.generate(
        inputs=inputs,
        attention_mask=attention_mask,
        max_length=1024,
        repetition_penalty=1.1,
        num_return_sequences=1,
    )
    generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)

# One sequence is returned per prompt; print them in prompt order, special tokens included
for generated_text in generated_texts:
    print(generated_text)

In [None]:
# Final step of the notebook.
# NOTE(review): judging by its name, unassign() releases the Colab runtime so it
# does not keep running once the notebook is done — confirm against the
# google.colab documentation.
from google.colab import runtime
runtime.unassign()