<a href="https://colab.research.google.com/github/Aditya-Shandilya1182/coom_experiments/blob/main/NEMO_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nemo_toolkit['all']

In [None]:
# Sanity check: make sure the NeMo package imports and report which version is in use.
import nemo
print(nemo.__version__)

In [None]:
!git clone https://github.com/NVIDIA/NeMo.git

In [None]:
import os
# Show where the notebook is currently running from.
print(os.getcwd())
# Step into the NeMo checkout; the path is relative, so this expects the
# `NeMo` directory to exist directly under the current working directory.
os.chdir('NeMo')

/kaggle/working


In [None]:
!pip install omegaconf

In [None]:
import torch
from nemo.collections.nlp.modules.common.megatron.transformer import ParallelTransformerLayer
from nemo.collections.nlp.modules.common.megatron.utils import init_method_normal, scaled_init_method_normal, get_linear_layer
from nemo.collections.nlp.modules.common.transformer.transformer_modules import TransformerEmbedding
from datasets import load_dataset
import tiktoken
from tiktoken import get_encoding
from omegaconf import OmegaConf

In [None]:
# Step back out of the NeMo checkout entered earlier. Going one level up
# (instead of jumping to a hard-coded '/kaggle/working') gives the same result
# on Kaggle and also works on hosts where that directory does not exist.
# The guard makes the cell safe to re-run: it is a no-op once we have left.
if os.path.basename(os.getcwd()) == 'NeMo':
    os.chdir(os.pardir)

In [None]:
# Confirm the working directory after the directory change above.
print(os.getcwd())

In [None]:
class NemoGPT(torch.nn.Module):
    """Small GPT-style language model stitched together from NeMo modules.

    Data flow: ``TransformerEmbedding`` -> ``num_layers`` x
    ``ParallelTransformerLayer`` -> ``LayerNorm`` -> linear head that maps
    ``hidden_size`` features to ``vocab_size`` logits.
    """

    def __init__(self, vocab_size, hidden_size=256, num_layers=4, num_heads=4, max_seq_len=128):
        """Build the embedding, the transformer stack, the final norm and the LM head.

        Args:
            vocab_size: Size of the token vocabulary (embedding rows / logit columns).
            hidden_size: Width of the hidden representation.
            num_layers: Number of transformer layers to stack.
            num_heads: Number of attention heads per layer.
            max_seq_len: Maximum sequence length handed to the embedding module.
        """
        super().__init__()

        self.embedding = TransformerEmbedding(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            max_sequence_length=max_seq_len,
        )

        self.layers = torch.nn.ModuleList()
        for layer_idx in range(num_layers):
            # A fresh config object is created for every layer.
            block_cfg = OmegaConf.create(
                {
                    "apply_query_key_layer_scaling": True,
                    "bias_activation_fusion": False,
                    "openai_gelu": False,
                    "onnx_safe": False,
                    "use_cpu_initialization": False,
                    "apply_residual_connection_post_layernorm": False,
                    "precision": 16,
                    "activation": "gelu",
                    "normalization": "layernorm",
                }
            )
            self.layers.append(
                ParallelTransformerLayer(
                    hidden_size=hidden_size,
                    ffn_hidden_size=4 * hidden_size,  # feed-forward width is 4x the hidden width
                    num_attention_heads=num_heads,
                    init_method=init_method_normal(0.02),
                    output_layer_init_method=scaled_init_method_normal(0.02, num_layers),
                    layer_number=layer_idx,
                    config=block_cfg,
                    attention_dropout=0.1,
                    hidden_dropout=0.1,
                )
            )

        self.ln_f = torch.nn.LayerNorm(hidden_size)
        self.lm_head = get_linear_layer(
            hidden_size,
            vocab_size,
            init_method=init_method_normal(0.02),
        )

    def forward(self, input_ids, labels=None):
        """Run the model and optionally compute the cross-entropy loss.

        Args:
            input_ids: Token ids fed to the embedding module.
            labels: Optional target token ids; when given, they are flattened and
                compared against the flattened logits.

        Returns:
            ``logits`` when ``labels`` is None, otherwise the tuple ``(logits, loss)``.
        """
        hidden = self.embedding(input_ids=input_ids)
        _, seq_len, _ = hidden.size()

        # Lower-triangular matrix of ones, reshaped to (1, 1, seq_len, seq_len).
        # NOTE(review): confirm this float mask matches the mask convention
        # ParallelTransformerLayer expects.
        ones = torch.ones(seq_len, seq_len, device=hidden.device)
        attention_mask = torch.tril(ones).view(1, 1, seq_len, seq_len)

        for block in self.layers:
            # NOTE(review): hidden_states is passed as the tuple (hidden, hidden) and
            # only element [0] of the result is kept -- confirm against the layer's
            # forward signature.
            hidden = block(hidden_states=(hidden, hidden), attention_mask=attention_mask)[0]

        logits = self.lm_head(self.ln_f(hidden))

        if labels is None:
            return logits

        flat_logits = logits.view(-1, logits.size(-1))
        flat_labels = labels.view(-1)
        loss = torch.nn.functional.cross_entropy(flat_logits, flat_labels)
        return logits, loss


In [None]:
# Load the TinyStories dataset; the tokenization cell below reads its
# 'train' and 'validation' splits and the 'text' field of every sample.
dataset = load_dataset("roneneldan/TinyStories")

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")


def _encode_split(split, stories_per_flush=10000):
    """Tokenize every sample of ``split`` into one flat 1-D ``torch.long`` tensor.

    Tokens are concatenated in sample order, exactly as if all per-sample token
    lists had been flattened into one list. To keep peak memory down, the
    Python token buffer is converted to a tensor every ``stories_per_flush``
    samples instead of holding every token of the split as a Python int.

    Args:
        split: Iterable of samples, each exposing its text under the 'text' key.
        stories_per_flush: Number of samples to buffer before converting the
            buffered tokens to a tensor.

    Returns:
        A 1-D ``torch.long`` tensor with the tokens of all samples (empty if
        the split yields no tokens).
    """
    pieces = []
    pending = []
    for count, sample in enumerate(split, start=1):
        pending.extend(tokenizer.encode(sample['text']))
        if count % stories_per_flush == 0:
            pieces.append(torch.tensor(pending, dtype=torch.long))
            pending = []
    # Flush the remainder; this also guarantees `pieces` is never empty for torch.cat.
    pieces.append(torch.tensor(pending, dtype=torch.long))
    return torch.cat(pieces)


train_encoded = _encode_split(dataset['train'])
val_encoded = _encode_split(dataset['validation'])

In [None]:
# Report the token counts of the train and validation tensors, one per line.
print(len(train_encoded), len(val_encoded), sep="\n")

In [None]:
device = "cuda"
# Tensor.to() returns a new tensor rather than moving the original in place,
# so the result has to be bound back to the name; otherwise the copy is
# discarded and the data stays where it was.
train_encoded = train_encoded.to(device)
val_encoded = val_encoded.to(device)

In [None]:
# The embedding table and the output head are both sized from the tokenizer vocabulary.
vocab_size = tokenizer.n_vocab

# Model hyper-parameters: 4 layers, 4 heads, 256-wide hidden state, 128-token context.
model = NemoGPT(vocab_size=vocab_size, hidden_size=256, num_layers=4, num_heads=4, max_seq_len=128)

In [None]:
# Number of sequences sampled per batch.
batch_size = 8
# Length (in tokens) of every sampled training/eval window. Kept equal to the
# max_seq_len=128 the model is constructed with, so sampled sequences never
# exceed the sequence length the embedding module was configured for.
block_size = 128

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation token streams.

    For each split, ``eval_iters`` random batches of ``batch_size`` windows of
    ``block_size`` tokens are drawn; the targets are the same windows shifted
    by one token. Reads the module-level ``model``, ``train_encoded``,
    ``val_encoded``, ``block_size``, ``batch_size``, ``eval_iters`` and
    ``device``.

    Returns:
        dict with keys 'train' and 'val' mapping to the mean loss (float).

    Raises:
        ValueError: if a split holds no more than ``block_size`` tokens.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            data = train_encoded if split == 'train' else val_encoded

            if data.size(0) <= block_size:
                raise ValueError(f"{split.capitalize()} dataset size is too small for the requested block size.")

            losses = torch.zeros(eval_iters)

            for k in range(eval_iters):
                # Upper bound leaves room for the one-token-shifted target window.
                ix = torch.randint(0, data.size(0) - block_size, (batch_size,))
                x = torch.stack([data[i:i+block_size] for i in ix])
                y = torch.stack([data[i+1:i+block_size+1] for i in ix])

                x, y = x.to(device), y.to(device)
                _, loss = model(x, y)
                losses[k] = loss.item()

            out[split] = losses.mean().item()
    finally:
        # Always switch back to training mode, even if evaluation raised, so an
        # interrupted evaluation cannot leave the model in eval mode.
        model.train()
    return out

In [None]:
# Training hyper-parameters.
max_iters = 100
gradient_accumulation_steps = 5
eval_iters = 10  # batches per evaluation; also reused below as the evaluation interval

# Move the model to the target device before building the optimizer over its parameters.
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

for step in range(max_iters):
    print(step)
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # Sample a random batch of windows; targets are the inputs shifted by one token.
    ix = torch.randint(len(train_encoded) - block_size, (batch_size,))
    x = torch.stack([train_encoded[i:i+block_size] for i in ix])
    y = torch.stack([train_encoded[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)

    # Call the module itself rather than .forward() so registered hooks run.
    logits, loss = model(x, y)

    # Scale so the gradients accumulated over several micro-batches average out.
    loss = loss / gradient_accumulation_steps
    loss.backward()

    # Apply an optimizer update only once every `gradient_accumulation_steps` micro-batches.
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        # Undo the scaling so the printed value is the loss of the last micro-batch.
        print(f"Loss at step {step + 1}: {loss.item() * gradient_accumulation_steps:.3f}")