<a href="https://colab.research.google.com/github/Fjallripa/TinyStories/blob/main/1M_replication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset cache from it
# and write the trained model to it.
# Alternative: mount through the Colab UI instead (folder button in the left sidebar,
# then the Drive button). That way you are not asked for permissions on every execution.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# install dataset
import os

!pip install datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.18.0 dill-0.3.8 multiprocess-0.70.16


In [None]:
# Only when the COLAB_TPU_ADDR environment variable is set (the same check the
# notebook later uses to select the TPU code path): upgrade jax/jaxlib and
# install torch 2.2.x together with the matching torch_xla TPU build.
if "COLAB_TPU_ADDR" in os.environ:
    !pip install --upgrade jax jaxlib
    !pip install torch~=2.2.0 torch_xla[tpu]~=2.2.0 -f https://storage.googleapis.com/libtpu-releases/index.html

In [None]:
from transformers import GPTNeoConfig, GPTNeoForCausalLM, get_scheduler, DataCollatorForSeq2Seq, AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM
from datasets import load_dataset, DatasetDict
import torch
from torch.utils.data import DataLoader
from datetime import datetime
from copy import deepcopy
from tqdm.auto import tqdm

# Choose the training device: TPU if the runtime advertises one, else CUDA GPU, else CPU.
is_tpu = "COLAB_TPU_ADDR" in os.environ

if is_tpu:
    # The torch_xla modules are imported only on the TPU path.
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pl
    import torch_xla.distributed.xla_multiprocessing as xmp
    device = "xla"
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Selected device:", device)


Selected device: cuda


In [None]:
# Configuration of the GPT-Neo model that is trained from scratch below.
config = GPTNeoConfig(
    # --- identification fields ---
    model_type="gpt_neo",
    architectures=["GPTNeoForCausalLM"],
    transformers_version="4.28.0",
    torch_dtype="float32",

    # --- vocabulary and special tokens ---
    vocab_size=50257,
    bos_token_id=50256,
    eos_token_id=50256,

    # --- model dimensions ---
    hidden_size=64,
    intermediate_size=None,
    num_layers=8,
    num_heads=16,
    max_position_embeddings=2048,
    context_length=512,

    # --- attention pattern: 8 layers alternating global / local ---
    attention_types=[[["global", "local"], 4]],
    attention_layers=["global", "local"] * 4,
    window_size=256,

    # --- activation, normalization, initialization ---
    activation_function="gelu_new",
    layer_norm_epsilon=1e-05,
    initializer_range=0.02,

    # --- dropout ---
    attention_dropout=0,
    embed_dropout=0,
    resid_dropout=0,

    # --- summary settings ---
    summary_type="cls_index",
    summary_use_proj=True,
    summary_activation=None,
    summary_proj_to_labels=True,
    summary_first_dropout=0.1,

    # --- runtime flags ---
    use_cache=True,
    gradient_checkpointing=False,
)

# Build a freshly initialized model from the configuration above.
model = GPTNeoForCausalLM(config)
# Alternative: start from pretrained weights instead.
#model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

In [None]:
# Load the GPT-Neo tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
tokenizer.pad_token = tokenizer.eos_token

# Batch collator for causal language modeling (mlm=False turns masked-LM mode off).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [None]:
# Location on Google Drive where the tokenized dataset is cached between sessions.
dataset_directory = "/content/drive/My Drive/genAI project/dataset_prepared"

if not os.path.exists(dataset_directory):
    # No cache yet: download TinyStories, tokenize it, and store the result on disk.
    raw_dataset = load_dataset("roneneldan/TinyStories")

    # Every example is truncated / padded to exactly 1024 tokens; the raw text column is dropped.
    tokenized_datasets = raw_dataset.map(
        lambda examples: tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=1024,
        ),
        batched=True,
    ).remove_columns(["text"])
    tokenized_datasets.set_format("torch")

    tokenized_datasets.save_to_disk(dataset_directory)
else:
    # Cache found: load the previously tokenized dataset.
    tokenized_datasets = DatasetDict.load_from_disk(dataset_directory)


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

In [None]:
# Number of examples per dataloader batch (alternative kept for reference: batch_size = 4).
batch_size = 16

# Training batches are shuffled. To train on the first 700 examples only, swap in
# tokenized_datasets["train"].select(range(700)) as the dataset.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
# Validation batches keep the dataset order.
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Pull a single training batch and display the shape of each of its tensors.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([16, 1024]),
 'attention_mask': torch.Size([16, 1024]),
 'labels': torch.Size([16, 1024])}

In [None]:
# Training-length settings. `num_training_steps` counts dataloader batches, not optimizer updates.
original_batch_size = 80
num_epochs = 1
num_training_steps = len(train_dataloader) * num_epochs

# Number of dataloader batches whose gradients are accumulated before one optimizer update.
gradient_accumulation_steps = (original_batch_size // batch_size) * 16

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-4,
    betas=(0.9, 0.95),
    weight_decay=0.1,
)

# Constant learning-rate schedule without warmup.
lr_scheduler = get_scheduler(
    "constant",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Show how many dataloader batches are accumulated per optimizer update.
print(gradient_accumulation_steps)

80


In [None]:

model_size = sum(t.numel() for t in model.parameters())
print(f"Training GPT-Neo type model with {model_size/1000**2:.1f}M parameters")

# Loss of every micro-batch on the CUDA/CPU path, stored as plain Python floats.
# Each value is the loss after division by `gradient_accumulation_steps`.
losses = []

if not is_tpu:
    # CUDA and CPU training loop
    progress_bar = tqdm(range(num_training_steps))
    model.train().to(device)
    for epoch in range(num_epochs):
        for index, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Scale the loss so the gradients summed over one accumulation window
            # correspond to the mean loss of that window.
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()
            # Store a float, not the tensor: keeping the tensor would keep its
            # autograd history (and device memory) alive for every step.
            losses.append(loss.item())

            # Apply one optimizer update per full accumulation window.
            if (index + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            progress_bar.update(1)
else:
    # TPU training loop
    def _mp_fn(index):
        """Train the model on the XLA device with gradient accumulation.

        Args:
            index: Process index supplied by `xmp.spawn`. It is not used for
                step counting; the batch counter of the loop below is.
        """
        progress_bar = tqdm(range(num_training_steps))
        device = xm.xla_device()
        mp_device_loader = pl.MpDeviceLoader(train_dataloader, device)
        # NOTE(review): `optimizer` was created from model.parameters() before this
        # move to the XLA device - confirm it still references the moved parameters.
        this_proc_model = model.train().to(device)
        for epoch in range(num_epochs):
            for step, example in enumerate(mp_device_loader):
                output = this_proc_model(
                    input_ids=example["input_ids"],
                    attention_mask=example["attention_mask"],
                    labels=example["labels"],
                )
                # Same loss scaling as the CUDA/CPU loop; gradients are NOT cleared
                # per batch so that they accumulate across the window.
                loss = output.loss / gradient_accumulation_steps
                loss.backward()

                # Count batches (not the process index) to decide when to update.
                if (step + 1) % gradient_accumulation_steps == 0:
                    xm.optimizer_step(optimizer)
                    lr_scheduler.step()
                    optimizer.zero_grad()

                # One tick per batch, matching the bar's total of num_training_steps.
                progress_bar.update(1)

    if __name__ == '__main__':
        xmp.spawn(_mp_fn, args=(), nprocs=1)
# Save the trained model to Google Drive under a timestamped directory name.
model.save_pretrained(f"/content/drive/My Drive/genAI project/model-{datetime.now().isoformat()}")

Training GPT-Neo type model with 3.7M parameters


  0%|          | 0/132483 [00:00<?, ?it/s]

In [None]:

# Encode a story opening and move it to the device selected earlier in the notebook.
input_ids = tokenizer.encode(
    "Once upon a time Tom ate apples and",
    return_tensors="pt",
).to(device)
# Every prompt token is real input, so the attention mask is all ones.
attention_mask = torch.ones(input_ids.shape, device=device)

# Generate a continuation of at most 1024 tokens in total with a single beam.
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=1024,
    num_beams=1,
    pad_token_id=50256,
)

# Decode the first (only) generated sequence, dropping special tokens, and print it.
output_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)