In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, Trainer, TrainingArguments
import pandas as pd
from tqdm.auto import tqdm
from torch.utils.data import Dataset, random_split
import torch

In [None]:
# # load pretrained
# model_id = "roneneldan/TinyStories-1M"
# model = AutoModelForCausalLM.from_pretrained(model_id)
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# prompt = "Once upon a time there was"
# input_ids = tokenizer.encode(prompt, return_tensors="pt")
# output = model.generate(input_ids, max_length=100, num_beams=1)
# output_text = tokenizer.decode(output[0], skip_special_tokens=True)
# print(output_text)

In [None]:
# load untrained
model_id = "roneneldan/TinyStories-1M"
config = AutoConfig.from_pretrained(model_id, local_files_only=True)
model = AutoModelForCausalLM.from_config(config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = "Once upon a time there was"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_length=100, num_beams=1)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)

In [None]:
def clean_carr_ret(poem):
    return poem.replace("\r", "")
poems = pd.read_csv("data/PoetryFoundationData.csv")["Poem"].apply(clean_carr_ret)
poems.head()

In [None]:
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token = tokenizer.eos_token

In [None]:
max_length = min(max([len(tokenizer.encode(p)) for p in tqdm(poems)]), 2048)
print(f"{max_length = }")

In [None]:
class PoemDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in tqdm(txt_list):
            encodings_dict = tokenizer(
                "<|startoftext|>" + txt + "<|endoftext|>",
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            self.input_ids.append(torch.tensor(encodings_dict["input_ids"]))
            self.attn_masks.append(torch.tensor(encodings_dict["attention_mask"]))
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
dataset = PoemDataset(poems, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [None]:
for d in dataset:
    if len(d) > 10:
        print(len(d))

In [None]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 10,
    logging_steps=2,
    save_steps = 4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 1,
    warmup_steps=1,
    weight_decay=0.05,
    logging_dir = "./logs",
    report_to = "none",
)

In [14]:
Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = lambda data: {
        "input_ids": torch.stack([f[0] for f in data]),
        "attention_mask": torch.stack([f[1] for f in data]),
        "labels": torch.stack([f[0] for f in data])
    }
).train()

  0%|          | 0/1950 [01:36<?, ?it/s]
  0%|          | 0/15590 [00:00<?, ?it/s]

RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2147483648 bytes.

In [None]:
model = torch.load("./model.pth")
generated = tokenizer("<|startoftext|>", return_tensors="pt").input_ids

In [None]:
sample_outputs = model.generate(
    generated,
    do_sample=True,
    top_k=5,
    max_length=50,
    top_p=0.95,
    temperature=1,
    num_return_sequences=2000,
)

for i, sample_output in enumerate(sample_outputs):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")

In [None]:
torch.save(model, "model.pt")