<a href="https://colab.research.google.com/github/Fjallripa/TinyStories/blob/main/1M_german_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so that the dataset and model paths under /content/drive resolve.
# Use the Colab UI instead. Select the folder button on the left and then the drive button.
# This way you will not be asked every execution for permissions
from google.colab import drive
drive.mount('/content/drive')

# Install the Hugging Face `datasets` package quietly (-q); it is imported in the next cell.
!pip install -q datasets

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from transformers import GPTNeoConfig, GPTNeoForCausalLM, get_scheduler, DataCollatorForSeq2Seq, AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoConfig
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets, Sequence, Value
import torch
from torch.utils.data import DataLoader
from datetime import datetime
from copy import deepcopy
from tqdm.auto import tqdm
import numpy as np
import os
import matplotlib.pyplot as plt
import re

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# for debugging
#os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

print("Selected device:", device)

# Language of the stories; it is interpolated into every dataset and model path.
lang = "de" # Can be "en" or "de"
print(f"Selected language: {lang}")


Selected device: cuda
Selected language: de


### Tokenizing dataset

In [None]:
# Tokenize the raw stories once and cache the result on Drive; later runs only
# load the cached copy.
tokenized_datasets_path = f"/content/drive/My Drive/genAI project/tiny-stories-{lang}-tokenized"

# Load the tokenizer here: it is needed for tokenizing below and for the EOS id in
# the chunking step, and must not depend on a later cell having been run already.
# NOTE(review): the German GPT-2 tokenizer is used regardless of `lang`, mirroring
# the DataLoader cell -- confirm this is intended for lang == "en".
tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")

if os.path.exists(tokenized_datasets_path):
    tokenized_datasets = DatasetDict.load_from_disk(tokenized_datasets_path)
else:
    if lang == "en":
        raw_dataset = load_dataset("roneneldan/TinyStories")
    elif lang == "de":
        # NOTE(review): this path spells the folder "MyDrive" while every other path
        # uses "My Drive" -- presumably both resolve on Colab; confirm.
        raw_dataset = DatasetDict.load_from_disk("/content/drive/MyDrive/genAI project/tiny-stories-de-raw")
    else:
        raise NotImplementedError(f"Language unknown: {lang}")


    # Tokenize text
    tokenized_datasets = raw_dataset.map(
        lambda examples: tokenizer(examples["text"]),
        batched=True)
    # Only the token columns are needed from here on.
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
    tokenized_datasets.set_format("torch")
    tokenized_datasets.save_to_disk(tokenized_datasets_path)



### Preprocessing tokenized dataset into chunks

In [None]:
# Build fixed-length training rows: each row holds 1024 token ids, starts with a
# given story and is filled up with the stories that follow it.
# Every story should be separated from the next one by an eos token.
# The finished, concatenated dataset is cached at this path.
preprocessed_datasets_path = f"/content/drive/My Drive/genAI project/tiny-stories-{lang}-preprocessed"

def split_dataset_generator(chunks):
    """Yield fixed-length training rows built from consecutive stories.

    For every story index in ``chunks`` one row of 1024 token ids is produced.
    The row starts with that story and is filled up with the stories that follow
    it, wrapping around to the first story at the end of the split. Consecutive
    stories are separated by the tokenizer's EOS token; the last story of a row
    is truncated when it does not fit.

    Reads the module-level names ``tokenized_datasets``, ``split`` and
    ``tokenizer``; they must be set before the generator is consumed.

    Args:
        chunks: Iterable of iterables of row indices into
            ``tokenized_datasets[split]``.

    Yields:
        dict with the keys ``"input_ids"`` and ``"labels"``. Both hold the same
        1024 values (``labels`` is a clone of ``input_ids``).
    """
    row_length = 1024
    # Resolve the split once instead of on every story access.
    stories = tokenized_datasets[split]
    story_count = len(stories)

    for chunk in chunks:
        for row_id in chunk:
            # NOTE(review): ids are stored as float32 (the training loop casts them
            # to int64). The dtype is kept unchanged so newly generated chunks keep
            # the same schema as chunks that may already be on disk.
            batched_input_ids = torch.zeros(row_length, dtype=torch.float32)
            batch_write_start = 0
            batch_row_id = row_id

            # Loop while there is still room to be filled in this row
            while batch_write_start < row_length:
                story_ids = stories[batch_row_id]["input_ids"]
                this_row_read_count = min(row_length - batch_write_start, len(story_ids))
                batched_input_ids[batch_write_start:batch_write_start + this_row_read_count] = story_ids[:this_row_read_count]
                batch_write_start += this_row_read_count
                # Wrap around at the end of the split; the next row restarts from its own row_id.
                batch_row_id = (batch_row_id + 1) % story_count

                # Separate stories with EOS whenever there is still room left.
                # (The previous `>` comparison could never be true, so no EOS was written.)
                if batch_write_start < row_length:
                    batched_input_ids[batch_write_start] = tokenizer.eos_token_id
                    batch_write_start += 1

            yield {
                "input_ids": list(batched_input_ids),
                "labels": list(batched_input_ids.clone())
            }


# Directory in which the intermediate per-checkpoint chunk datasets are saved.
preprocessed_datasets_chunks_path = f"/content/drive/My Drive/genAI project/tiny-stories-{lang}-preprocessed-chunks"
def get_chunks_on_disk():
    """List the chunk datasets that are already saved in the chunk directory.

    Scans ``preprocessed_datasets_chunks_path`` for entries whose name matches the
    module-level ``chunk_pattern`` (two groups: split name and start index).
    A missing directory is treated as "no chunks yet" so a first run does not fail.

    Returns:
        Tuple ``(start_ids, filenames)`` of dicts keyed by split name ("train" and
        "validation" are always present). ``start_ids[split]`` is the ascending
        list of start indices and ``filenames[split]`` the matching entry names in
        the same order.

    Raises:
        KeyError: If a matching entry names a split other than "train" or
            "validation".
    """
    chunk_map = {
        "train": {},
        "validation": {}
    }

    # Nothing has been written on a first run, so the directory may not exist yet.
    if os.path.isdir(preprocessed_datasets_chunks_path):
        for filename in os.listdir(preprocessed_datasets_chunks_path):
            matches = chunk_pattern.match(filename)
            if not matches:
                continue
            split, chunk_start = matches.groups()
            chunk_map[split][int(chunk_start)] = filename

    start_ids = {}
    filenames = {}
    for split, names_by_start in chunk_map.items():
        # Sort numerically so that e.g. 100000 comes after 2000.
        start_ids[split] = sorted(names_by_start)
        filenames[split] = [names_by_start[start_id] for start_id in start_ids[split]]

    return start_ids, filenames


# Reuse the finished preprocessed dataset when it is cached on disk. Otherwise build
# it in checkpointed pieces (so an interrupted run can resume) and concatenate them.
if os.path.exists(preprocessed_datasets_path):
    preprocessed_datasets = DatasetDict.load_from_disk(preprocessed_datasets_path)
else:
    # Number of chunks that are generated and saved together as one checkpoint dataset.
    checkpoint_frequency = 100
    # Number of story indices per chunk.
    chunk_size = 1000

    # Matches the checkpoint directory names written below; the groups are the
    # split name and the first story index of the checkpoint.
    chunk_pattern = re.compile(f"tiny-stories-{lang}-preprocessed-chunk-(.+)-(.+)")
    start_ids_on_disk, _ = get_chunks_on_disk()
    # Per split: the highest checkpoint start index found on disk, or 0 if none.
    resume_point = {}
    for split in start_ids_on_disk.keys():
        if start_ids_on_disk[split]:
            resume_point[split] = max(start_ids_on_disk[split])
        else:
            resume_point[split] = 0
    print(f"resume points: {resume_point}")

    # NOTE: `split` must stay a module-level loop variable with exactly this name;
    # split_dataset_generator reads it to pick the split it works on.
    for split in tokenized_datasets.keys():
        print(f"working on split {split}")
        dataset_size = len(tokenized_datasets[split])

        # Consecutive index ranges of at most chunk_size stories covering the whole split.
        chunks = [range(start, min(start + chunk_size, dataset_size)) for start in range(0, dataset_size, chunk_size)]
        # Find the chunk that contains the resume point. Work restarts there, so the
        # checkpoint with the highest start index on disk is generated again.
        start_chunk_index = None
        for chunk_index, chunk in enumerate(chunks):
            if resume_point[split] in chunk:
                start_chunk_index = chunk_index
                break
        if start_chunk_index is None:
            # No chunk contains the resume point (e.g. an empty split): nothing to do.
            continue
        # Walk over the chunks in strides of checkpoint_frequency; every stride is
        # generated and saved as one checkpoint dataset.
        for checkpoint_chunk_start_id in range(start_chunk_index, len(chunks), checkpoint_frequency):
            checkpoint_chunks = chunks[checkpoint_chunk_start_id:checkpoint_chunk_start_id + checkpoint_frequency]
            print(f"Working on checkpoint beginning with: {checkpoint_chunks[0][0]}")
            chunk_dataset = Dataset.from_generator(split_dataset_generator, gen_kwargs={"chunks": checkpoint_chunks}, num_proc=4)
            # The directory name carries the split and the first story index so that
            # chunk_pattern can recover both.
            chunk_dataset.save_to_disk(f"{preprocessed_datasets_chunks_path}/tiny-stories-{lang}-preprocessed-chunk-{split}-{checkpoint_chunks[0][0]}")

    # Assemble the final dataset: per split, concatenate all checkpoint datasets in
    # ascending order of their start index, then cache the result.
    preprocessed_datasets = DatasetDict({})
    _, sorted_filenames_on_disk = get_chunks_on_disk()
    for split, filenames in sorted_filenames_on_disk.items():
        datasets_to_concatinate = []
        for filename in filenames:
            datasets_to_concatinate.append(Dataset.load_from_disk(f"{preprocessed_datasets_chunks_path}/{filename}"))
        preprocessed_datasets[split] = concatenate_datasets(datasets_to_concatinate)
    preprocessed_datasets.save_to_disk(preprocessed_datasets_path)

Loading dataset from disk:   0%|          | 0/35 [00:00<?, ?it/s]

### Creating DataLoader

In [None]:
# Number of rows per forward/backward pass; pick the value matching the hardware.
# 12GB RAM CPU
#batch_size = 4

# V100:
#batch_size = 8

# A100
batch_size = 20

# Loading tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")

# Loading training data
preprocessed_datasets_path = f"/content/drive/My Drive/genAI project/tiny-stories-{lang}-preprocessed"
if os.path.exists(preprocessed_datasets_path):
    preprocessed_datasets = DatasetDict.load_from_disk(preprocessed_datasets_path)
    print(f"Preprocessed datasets loaded from {preprocessed_datasets_path}")
elif "preprocessed_datasets" not in globals():
    # Fail with a clear message instead of a NameError further down.
    raise FileNotFoundError(
        f"No preprocessed dataset found at {preprocessed_datasets_path}; "
        "run the preprocessing cell first."
    )

# Request torch tensors in every case, also for a dataset that is still in memory
# from the preprocessing cell, so the DataLoader batches are tensors.
preprocessed_datasets.set_format("torch")

# Creating dataloader
train_dataloader = DataLoader(preprocessed_datasets["train"], batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(preprocessed_datasets["validation"], batch_size=batch_size)

# Sanity check: fetch one batch and display the shape of every column.
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

Loading dataset from disk:   0%|          | 0/35 [00:00<?, ?it/s]

Preprocessed datasets loaded from /content/drive/My Drive/genAI project/tiny-stories-de-preprocessed


{'input_ids': torch.Size([20, 1024]), 'labels': torch.Size([20, 1024])}

In [None]:
# Print both columns of the first training row for a quick visual comparison.
for column in ("input_ids", "labels"):
    print(preprocessed_datasets["train"][0][column])

tensor([14693.,  3585.,  2961.,  ...,  1608.,   444.,   418.])
tensor([14693.,  3585.,  2961.,  ...,  1608.,   444.,   418.])


## Model setup

In [None]:
# The network layout is taken from the published TinyStories-1M configuration.
architecture = "roneneldan/TinyStories-1M"
config = AutoConfig.from_pretrained(architecture)

# A model saved at this location takes precedence over a freshly initialised one.
model_directory = f"/content/drive/My Drive/genAI project/model_1m_{lang}"

if not os.path.exists(model_directory):
    # The embedding table has to match the tokenizer, so adopt its vocabulary size.
    vocab_sizes_differ = tokenizer.vocab_size != config.vocab_size
    if vocab_sizes_differ:
        print(f"Note: Tokenizer vocabulary size ({tokenizer.vocab_size}) differs from the Model's ({config.vocab_size}). Will change that of the Model.")
        config.vocab_size = tokenizer.vocab_size
        print(f"Model vocabulary size now {config.vocab_size}")
    # Randomly initialised weights; only the architecture comes from the config.
    model = GPTNeoForCausalLM(config)
    print(f"Creating fresh model from {architecture} architecture")
else:
    model = GPTNeoForCausalLM.from_pretrained(model_directory)
    print(f"Taking existing model from {model_directory}.")

Note: Tokenizer vocabulary size (50265) differs from the Model's (50257). Will change that of the Model.
Model vocabulary size now 50265
Creating fresh model from roneneldan/TinyStories-1M architecture


In [None]:
# Training hyper-parameters.
num_epochs = 1
original_batch_size = 80
# A checkpoint is written every this many batches.
training_checkpoint_frequency = 1_000

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-4,
    betas=(0.9, 0.95),
    weight_decay=0.1,
)

# Counted in batches of `batch_size` rows, i.e. before gradient accumulation.
num_training_steps = len(train_dataloader) * num_epochs
# Accumulate 16 batches of the reference size; scale up for the smaller batch used here.
gradient_accumulation_steps = (original_batch_size // batch_size) * 16

# Constant learning rate without warm-up.
lr_scheduler = get_scheduler(
    "constant",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [None]:
model_size = sum(t.numel() for t in model.parameters())
print(f"Training GPT-Neo type model with {model_size/1000**2:.1f}M parameters")


def _save_training_checkpoint():
    """Save the current model weights and the loss history to Drive.

    Both artifacts get the same timestamp so they can be matched up afterwards.
    """
    timestamp = datetime.now().isoformat()
    model.save_pretrained(f"/content/drive/My Drive/genAI project/model-1m-{lang}-{timestamp}")
    torch.save(np.array(losses), f"/content/drive/My Drive/genAI project/losses_tensor-1m-{lang}-{timestamp}.pt")


# One entry per batch; see the note on scaling inside the loop.
losses = []

progress_bar = tqdm(range(num_training_steps))
model.train().to(device)
batches_per_epoch = len(train_dataloader)
for epoch in range(num_epochs):
    for index, batch in enumerate(train_dataloader):
        #with torch.autograd.detect_anomaly():
        # The stored token ids are floats; the model needs int64 indices.
        batch = {k: v.type(torch.int64).to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Scale the loss so the accumulated gradient is the mean over the
        # accumulated batches rather than their sum.
        loss = outputs.loss / gradient_accumulation_steps
        loss.backward()
        # NOTE: the recorded value is the scaled loss (divided by
        # gradient_accumulation_steps), exactly as before.
        losses.append(loss.detach().cpu().numpy())

        # Step after every full accumulation window, and also on the last batch of
        # the epoch so the gradients of a trailing partial window are not dropped
        # (or leaked into the next epoch).
        is_last_batch = (index + 1) == batches_per_epoch
        if (index + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        progress_bar.update(1)

        if (index + 1) % training_checkpoint_frequency == 0:
            _save_training_checkpoint()

# Final state after the last epoch.
_save_training_checkpoint()

Training GPT-Neo type model with 3.7M parameters


  0%|          | 0/105677 [00:00<?, ?it/s]