In [None]:
import datasets
from datasets import load_dataset, load_from_disk
import accelerate
import torch

In [None]:
!mkdir /mnt/data/ml-data -p

In [None]:
# `config` is first assigned in the model-setup cell further down, so a bare
# `config` here raises NameError under Restart Kernel -> Run All. Look it up
# defensively instead: the cell still displays the config when it already
# exists and shows nothing on a fresh kernel.
globals().get("config")

In [None]:
from transformers import AutoConfig, AutoModel, GPTNeoForCausalLM
# Fall back to the CPU so the notebook still runs on machines without a GPU
# instead of failing at the first `.to("cuda")`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_checkpoint = "roneneldan/TinyStories-1M"
# Only the architecture description is taken from the checkpoint here.
config = AutoConfig.from_pretrained(model_checkpoint)
# Building the model from `config` (rather than `from_pretrained`) means no
# pretrained weights are loaded: the network starts from fresh initial weights
# and is trained by the cells below.
model = GPTNeoForCausalLM(config).to(device)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Reuse the end-of-sequence token as the padding token so that batches of
# different-length stories can be padded.
tokenizer.pad_token = tokenizer.eos_token
def get_n_params(model):
    """Return the total number of scalar parameters in ``model``.

    Args:
        model: Object exposing ``parameters()`` that yields tensors
            (for example a ``torch.nn.Module``).

    Returns:
        int: Sum of the element counts of all parameter tensors; ``0`` when
        the model has no parameters.
    """
    # The tensors themselves are deliberately not printed: doing so dumps every
    # weight of the model into the cell output.
    return sum(p.numel() for p in model.parameters())
# Last expression of the cell, so the returned parameter count becomes the
# cell's displayed output.
get_n_params(model)

In [None]:
import os
if os.path.exists("/mnt/data/ml-data/tinystory-tokenized/"):
    # A tokenized copy was saved by an earlier run: reuse it.
    tokenized_dataset = load_from_disk("/mnt/data/ml-data/tinystory-tokenized")
else:
    # First run: fetch the raw dataset, tokenize the train split and save the
    # result so later runs take the branch above.
    dataset = load_dataset('roneneldan/TinyStories', cache_dir="/mnt/data/ml-data")

    def transform(examples):
        """Tokenize a batch of examples, truncating each text to 512 tokens."""
        return tokenizer(examples['text'], truncation=True, max_length=512)

    tokenized_dataset = (
        dataset["train"]
        .map(transform, batched=True)
        .remove_columns("text")
    )
    tokenized_dataset.save_to_disk("/mnt/data/ml-data/tinystory-tokenized")

In [None]:
from transformers import DataCollatorWithPadding, DataCollatorForLanguageModeling
# Alternative (not used): DataCollatorWithPadding(tokenizer, padding='max_length', pad_to_multiple_of=8)
# mlm=False selects plain (causal) language modelling instead of masked-LM;
# each batch is padded up to a length that is a multiple of 8.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

In [None]:
import math
import torch
from transformers import TrainingArguments, Trainer, get_cosine_schedule_with_warmup
batch_size=8
# Number of optimizer steps per epoch. The last, possibly smaller, batch still
# costs one step, hence ceil rather than true division (which produced a float
# and a fractional total step count for the scheduler).
# NOTE(review): assumes a single device and no gradient accumulation - confirm.
steps_per_epoch = math.ceil(len(tokenized_dataset) / batch_size)
num_train_epochs = 30
training_args = TrainingArguments(
    output_dir="/mnt/data/tiny-bert",
    num_train_epochs=num_train_epochs,
    # NOTE(review): a hand-built optimizer is passed to the Trainer below, so
    # this value is presumably never applied (torch.optim.Adam is created
    # without weight decay) - confirm whether decay is intended.
    weight_decay=0.01,
    logging_steps=100,
    logging_dir='/mnt/data/tiny-bert/logs',
    per_device_train_batch_size=batch_size,
)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, betas=(0.9,0.99))
warmup_iters = 2000 # not super necessary potentially
# Linear warm-up for `warmup_iters` steps, then cosine decay over the remaining
# steps of the whole run.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_iters,
    num_training_steps=num_train_epochs * steps_per_epoch,
)

# No evaluation dataset or metrics are configured; this run only trains.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    optimizers=(optimizer, scheduler)
)
trainer.train()

In [None]:
import gradio as gr
from transformers import pipeline
# Move the model to the shared `device` (the same one the tokenized prompts are
# sent to) instead of a second hard-coded "cuda", and switch it to evaluation
# mode so that training-only layers such as dropout are disabled while
# generating text.
model = model.to(device).eval()
def generator(text, max_length=30, num_return_sequences=1):
    """Continue ``text`` with the model and return the decoded result.

    Args:
        text: Prompt to continue. An empty prompt is replaced by the
            tokenizer's end-of-sequence token so that generation has at least
            one token to start from.
        max_length: Forwarded to ``model.generate`` as the upper bound on the
            length of the generated sequence in tokens.
        num_return_sequences: Accepted for interface compatibility only; a
            single sequence is generated and returned.

    Returns:
        str: The first generated sequence, decoded with special tokens removed.
    """
    # An empty string tokenizes to zero tokens; seed with EOS instead. The
    # seed is stripped again by `skip_special_tokens=True` below.
    prompt = text if text else tokenizer.eos_token
    model_inputs = tokenizer(prompt, return_tensors='pt').to(device)
    # Beam search (5 beams) combined with sampling; top_k=0 disables top-k
    # filtering and no 2-gram may repeat within the output.
    output_ids = model.generate(
        **model_inputs,
        max_length=max_length,
        num_beams=5,
        do_sample=True,
        no_repeat_ngram_size=2,
        top_k=0,
        temperature=1,
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


def generate(text):
    """Gradio callback: return the model's continuation of ``text``."""
    # 160 is the length limit the demo previously had hard-coded inside
    # `generator`; it is passed explicitly now that `max_length` is honoured.
    return generator(text, max_length=160, num_return_sequences=1)

# Text-in / text-out web UI around `generate`, with two example prompts.
prompt_box = gr.Textbox(lines=5, label="Input Text")
result_box = gr.Textbox(label="Generated Text")
demo = gr.Interface(
    fn=generate,
    inputs=prompt_box,
    outputs=result_box,
    examples=["Once upon a time", ""],
)

demo.launch()


In [None]:
# Display the model's `device` attribute as the cell output (quick check of
# where the model currently lives).
model.device