<a target="_blank" href="https://colab.research.google.com/github/Blaizzy/Coding-LLMs-from-scratch/blob/main/Llama-2/Part 3/BabyLLaMA.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

 # BabyLLaMA

Coding the LLaMA-2 research paper from scratch to create models with sizes 100M, 250M and 500M params.

## Model Arch

Decoder only: composed of `n_layers` identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple position-wise fully connected FFN. We employ a residual connection around each of the sub-layers, followed by layer normalization. That is:
LayerNorm(x + Sublayer(x))
 -- A Vaswani et al., 2017.

In [None]:
!pip install -U -q accelerate transformers[torch] datasets huggingface_hub

In [None]:
def flush():
    """Run the Python garbage collector, then empty PyTorch's CUDA cache."""
    # Collect first: `empty_cache` is called only after unreachable Python
    # objects have been reclaimed.
    gc.collect()
    torch.cuda.empty_cache()

def count_parameters(model):
    """Return a human-readable count of the trainable parameters of `model`.

    Only parameters with `requires_grad=True` are counted; the total is
    reported in millions with two decimals.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    millions = trainable / 10 ** 6
    return f"BabyLlama size: {millions:.2f}M parameters"

In [None]:
import gc
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from math import sqrt
from transformers import PretrainedConfig
import math
from typing import Tuple, Optional, List
from transformers import logging, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast


logger = logging.get_logger(__name__)


In [None]:
class BabyLlamaConfig(PretrainedConfig):
    """Configuration for the BabyLlama decoder-only language model.

    Every constructor argument is stored as an attribute of the same name.
    The model code visible in this notebook reads `vocab_size`, `hidden_size`,
    `intermediate_size`, `num_hidden_layers`, `num_attention_heads`,
    `num_key_value_heads`, `max_position_embeddings`, `initializer_range`,
    `rms_norm_eps`, `rope_theta`, `use_bias`, `lm_head_bias` and `device`;
    the remaining fields are only stored.

    Args:
        vocab_size: Number of token ids (size of the embedding table / LM head).
        hidden_size: Width of the residual stream.
        intermediate_size: Width of the MLP hidden layer.
        num_hidden_layers: Number of decoder blocks.
        num_attention_heads: Number of query heads.
        num_key_value_heads: Number of key/value heads (grouped-query
            attention). `None` falls back to `num_attention_heads`.
        max_position_embeddings: Length of the pre-computed RoPE table.
        initializer_range: Std of the normal distribution used for weight init.
        rms_norm_eps: Epsilon added inside RMSNorm.
        pad_token_id, bos_token_id, eos_token_id: Special token ids, forwarded
            to `PretrainedConfig`.
        tie_word_embeddings: Forwarded to `PretrainedConfig`.
        rope_theta: Base of the RoPE frequencies.
        use_bias: Whether the attention projections use a bias term.
        lm_head_bias: Whether the LM head uses a bias term.
        device: Device string used when building RoPE buffers.
        **kwargs: Passed through to `PretrainedConfig.__init__`.
    """

    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=1024, # 2048 Tiny LLaMA
        intermediate_size=2048,
        num_hidden_layers=6,
        num_attention_heads=16, # 32 Tiny LLaMA
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=False,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        use_bias=False,
        lm_head_bias=False,
        residual_dropout=0.0,
        device='cpu',
        **kwargs,
    ):
        # Without an explicit KV-head count, use one KV head per query head
        # (plain multi-head attention) instead of failing later on `None`.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.use_bias = use_bias
        self.lm_head_bias = lm_head_bias
        self.device = device

        # `pad_token_id` and `tie_word_embeddings` are named parameters here,
        # so they never reach `**kwargs`; they must be forwarded explicitly or
        # the base class falls back to its own defaults and the caller's
        # values are silently dropped.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

### MHA
<img src="https://data-science-blog.com/wp-content/uploads/2022/01/mha_img_original.png" width=500>

- MQA
- GQA

In [None]:
def build_mask_cache(max_seq_length: int, device: Optional[torch.device] = None) -> torch.Tensor:
    """Build a boolean causal mask of shape (1, 1, max_seq_length, max_seq_length).

    Entry [0, 0, i, j] is True when j <= i (lower triangle including the
    diagonal) and False otherwise.
    """
    full = torch.ones((max_seq_length, max_seq_length), dtype=torch.bool, device=device)
    lower_triangle = full.tril()
    # Two leading singleton axes so the mask broadcasts over batch and heads.
    return lower_triangle[None, None]

def repeat_kv(hidden_states: torch.Tensor, n_repeats: int) -> torch.Tensor:
    """Repeat every key/value head `n_repeats` times along the head axis.

    Used for grouped-query attention so that the key/value tensors end up with
    as many heads as the queries. Copies of the same head are adjacent, i.e.
    the result equals `torch.repeat_interleave(hidden_states, n_repeats, dim=1)`.

    Args:
        hidden_states: Tensor of shape (batch, n_kv_heads, seq_len, head_dim).
        n_repeats: Number of copies of each head; must be >= 1.

    Returns:
        Tensor of shape (batch, n_kv_heads * n_repeats, seq_len, head_dim).
        When `n_repeats == 1` the input tensor itself is returned.

    Raises:
        ValueError: If `n_repeats` is smaller than 1.
    """
    batch, n_kv_heads, seq_len, head_dim = hidden_states.shape
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be >= 1 (got {n_repeats}).")
    if n_repeats == 1:
        return hidden_states
    # (B, n_kv, T, hs) -> (B, n_kv, 1, T, hs) -> (B, n_kv, n_repeats, T, hs)
    expanded = hidden_states.unsqueeze(2).expand(batch, n_kv_heads, n_repeats, seq_len, head_dim)
    # Merge the two head axes: (B, n_kv * n_repeats, T, hs)
    return expanded.reshape(batch, n_kv_heads * n_repeats, seq_len, head_dim)


class RotaryPositionalEmbeddings(nn.Module):
    """Rotary position embeddings (RoPE) backed by a cached cos/sin table.

    The table is stored in the non-persistent buffers `cos_cached` and
    `sin_cached`, each of shape (max_seq_len_cached, dim). It is built once in
    the constructor and rebuilt by `forward` when a longer sequence is
    requested.

    Args:
        dim: Size of the head dimension the rotation is applied to.
        max_position_embeddings: Number of positions pre-computed up front.
        base: Base of the geometric frequency progression.
        device: Device on which `inv_freq` (and the initial table) is created.
        scaling_factor: Positions are divided by this value before the
            frequencies are computed.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.device = device
        self.scaling_factor = scaling_factor
        self.base = base
        # One inverse frequency per pair of channels: base ** (-2i / dim).
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin buffers for positions 0 .. seq_len - 1."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
        t = t / self.scaling_factor
        freqs = torch.outer(t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    @torch.no_grad()
    def forward(self, x, seq_len=None):
        """Return the `(cos, sin)` tables for the first `seq_len` positions.

        Args:
            x: Tensor of shape [bs, num_heads, seq_len, head_size]; only its
                dtype, device and (when `seq_len` is None) its second-to-last
                dimension are used.
            seq_len: Number of positions to return. Defaults to `x.shape[-2]`.

        Returns:
            Two tensors of shape (seq_len, dim) cast to `x.dtype`.
        """
        if seq_len is None:
            # Comparing None with an int would raise; derive it from the input.
            seq_len = x.shape[-2]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )

    def apply_rope(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, position_ids, unsqueeze_dim=1) -> torch.Tensor:
        """Rotate `x` by the angles selected through `position_ids`.

        Args:
            x: Tensor of shape (B, nh, T, hs).
            cos, sin: Tables of shape (positions, hs) as returned by `forward`.
            position_ids: Integer tensor of shape (B, T) or (1, T) indexing
                into the tables, or None to use the table rows in order (the
                tables must then have exactly T rows).
            unsqueeze_dim: Axis inserted into cos/sin so they broadcast over
                the head axis of `x`.

        Returns:
            Tensor with the same shape and dtype as `x`.
        """
        if position_ids is None:
            # Use rows 0..T-1 as they are and add a broadcastable batch axis.
            cos = cos.unsqueeze(0)
            sin = sin.unsqueeze(0)
        else:
            cos = cos[position_ids]
            sin = sin[position_ids]
        cos = cos.unsqueeze(unsqueeze_dim)
        sin = sin.unsqueeze(unsqueeze_dim)
        x1 = x[..., : x.shape[-1] // 2] # (B, nh, T, hs/2)
        x2 = x[..., x.shape[-1] // 2 :] # (B, nh, T, hs/2)
        rotated = torch.cat((-x2, x1), dim=-1) # (B, nh, T, hs)
        roped = (x * cos) + (rotated * sin)
        return roped.to(dtype=x.dtype)


class KVCache(nn.Module):
    """Pre-allocated key/value cache held in two non-persistent buffers.

    Args:
        k_shape: Shape of the key buffer.
        v_shape: Shape of the value buffer.
        device: Device on which both buffers are allocated.
        dtype: Initial dtype of both buffers.
    """

    def __init__(
        self,
        k_shape: Tuple[int, int, int, int],
        v_shape: Tuple[int, int, int, int],
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        super().__init__()
        empty_keys = torch.zeros(k_shape, device=device, dtype=dtype)
        self.register_buffer("k", empty_keys, persistent=False)
        empty_values = torch.zeros(v_shape, device=device, dtype=dtype)
        self.register_buffer("v", empty_values, persistent=False)

    def forward(self, input_pos: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Write `k` and `v` into the cache along dim 2 and return the full buffers.

        Args:
            input_pos: Indices along dim 2 at which the new slices are stored.
            k: New keys; the key buffer is cast to this tensor's dtype first.
            v: New values; the value buffer is cast to this tensor's dtype first.

        Returns:
            The updated key buffer and the updated value buffer.
        """
        # Follow the activation dtype (e.g. under AMP) before copying in place.
        self.k = self.k.to(k.dtype)
        self.v = self.v.to(v.dtype)
        cached_keys = self.k.index_copy_(2, input_pos, k)
        cached_values = self.v.index_copy_(2, input_pos, v)
        return cached_keys, cached_values

    def reset_parameters(self) -> None:
        """Zero both buffers in place."""
        for buffer in (self.k, self.v):
            torch.nn.init.zeros_(buffer)

In [None]:
# KV caching demo: write freshly computed keys into a pre-allocated cache.
batch_size = 2
seq_len = 2
head_dim = 3
# The head axis is left out to keep the example small, so the layout is
# (B, seq_len, head_dim) and the sequence axis is dim 1 (not dim 2 as in the
# 4-D (B, n_heads, seq_len, head_dim) layout).
k_cache = torch.zeros([batch_size, seq_len, head_dim])
k = torch.rand([batch_size, seq_len, head_dim])
position_ids = torch.arange(k.size(1), dtype=torch.long)  # one index per token position

k_cache.index_copy_(1, position_ids, k) # copies k values to k_cache at those positions

tensor([[[0.2160, 0.3963, 0.2684],
         [0.0642, 0.0370, 0.8778]],

        [[0.7536, 0.7330, 0.5991],
         [0.6476, 0.2874, 0.9562]]])

In [None]:
class BabyLlamaAttention(nn.Module):
    """Causal self-attention with grouped-query key/value heads and RoPE.

    Queries use `config.num_attention_heads` heads, keys and values use
    `config.num_key_value_heads` heads; each key/value head is repeated so it
    is shared by `n_heads // n_kv_heads` query heads.

    Args:
        config: Object exposing `hidden_size`, `num_attention_heads`,
            `num_key_value_heads`, `use_bias`, `max_position_embeddings`,
            `rope_theta` and `device`.

    Raises:
        ValueError: If `hidden_size` is not divisible by the number of heads,
            or the number of heads is not divisible by the number of KV heads.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_dim = hidden_dim = config.hidden_size
        self.n_heads = n_heads = config.num_attention_heads
        self.n_kv_heads = n_kv_heads = config.num_key_value_heads
        self.head_dim = head_dim = config.hidden_size // n_heads
        use_bias = config.use_bias

        if (head_dim * n_heads) != self.hidden_dim:
            raise ValueError(
                f"hidden_dim must be divisible by num_heads (got `hidden_dim`: {self.hidden_dim}"
                f" and `num_heads`: {self.n_heads})."
            )
        # Floor division below would otherwise silently produce too few
        # repeated KV heads and fail later inside the attention kernel.
        if n_heads % n_kv_heads != 0:
            raise ValueError(
                f"num_heads must be divisible by num_kv_heads (got `num_heads`: {n_heads}"
                f" and `num_kv_heads`: {n_kv_heads})."
            )

        self.repeats = n_heads // n_kv_heads # q_per_kv

        self.q_proj = nn.Linear(hidden_dim, n_heads * head_dim, bias=use_bias)
        self.k_proj = nn.Linear(hidden_dim, n_kv_heads * head_dim, bias=use_bias)
        self.v_proj = nn.Linear(hidden_dim, n_kv_heads * head_dim, bias=use_bias)
        self.o_proj = nn.Linear(n_heads * head_dim, hidden_dim, bias=use_bias)

        self.rotary_emb = RotaryPositionalEmbeddings(
            head_dim,
            max_position_embeddings=config.max_position_embeddings,
            device=config.device,
            base=config.rope_theta,
        )

        # Placeholder; no cache is created or used by this class yet.
        self.kv_cache: Optional[KVCache] = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """Apply causal self-attention.

        Args:
            hidden_states: Tensor of shape (B, T, hidden_dim).
            mask: Accepted for interface compatibility. NOTE: it is not applied
                to the attention scores; attention is always purely causal.
            position_ids: Positions used to select the RoPE angles, or None.

        Returns:
            Tensor of shape (B, T, hidden_dim).
        """
        B, T, _ = hidden_states.size() # bsz, seq_len, embed_dim

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        # Split the channel axis into heads and move heads before time.
        queries = queries.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)  # (B, n_heads, T, head_dim)
        keys = keys.view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)  # (B, n_kv_heads, T, head_dim)
        values = values.view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)  # (B, n_kv_heads, T, head_dim)

        kv_seq_len = keys.shape[-2]
        cos, sin = self.rotary_emb(values, seq_len=kv_seq_len)

        # Only queries and keys are rotated; values stay untouched.
        queries = self.rotary_emb.apply_rope(queries, cos, sin, position_ids)
        keys = self.rotary_emb.apply_rope(keys, cos, sin, position_ids)

        # TODO: KV caching
        keys = repeat_kv(keys, self.repeats)
        values = repeat_kv(values, self.repeats)

        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
        # Reference: https://github.com/pytorch/pytorch/issues/112577.
        if queries.device.type == "cuda" and mask is not None:
            queries = queries.contiguous()
            keys = keys.contiguous()
            values = values.contiguous()

        y = self.scaled_dot_product_attention(queries, keys, values, mask) # (B, T, n_heads, head_dim)

        y = y.reshape(B, T, self.hidden_dim) # (B, T, hidden_dim)

        return self.o_proj(y)

    def scaled_dot_product_attention(
        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Run causal scaled dot-product attention.

        Args:
            q, k, v: Tensors of shape (B, n_heads, T, head_dim).
            mask: Currently unused; the kernel is called with
                `attn_mask=None` and `is_causal=True`.

        Returns:
            Contiguous tensor of shape (B, T, n_heads, head_dim).
        """
        scale = 1.0 / math.sqrt(self.head_dim)
        y = torch.nn.functional.scaled_dot_product_attention(
            q, k, v, attn_mask=None, dropout_p=0.0, scale=scale, is_causal=True
        )
        return y.transpose(1, 2).contiguous()


In [None]:
config = BabyLlamaConfig()
# Generate random input data
d_model = config.hidden_size
sequence_length = config.max_position_embeddings  # number of tokens
batch_size = 5
input_data = torch.rand((batch_size, sequence_length, d_model), device=config.device)  # [bs, sequence_length, d_model]
position_ids = torch.arange(sequence_length, dtype=torch.long, device=config.device).unsqueeze(0)  # [1, sequence_length]


# `attn` is inspected by the cells that follow, so it has to be created here.
attn = BabyLlamaAttention(config).to(config.device)
attn(input_data, position_ids=position_ids).shape

In [None]:
attn

BabyLlamaAttention(
  (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
  (k_proj): Linear(in_features=1024, out_features=128, bias=False)
  (v_proj): Linear(in_features=1024, out_features=128, bias=False)
  (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
  (rotary_emb): RotaryPositionalEmbeddings()
)

In [None]:
count_parameters(attn)

'BabyLlama size: 2.36M parameters'

In [None]:
class LLaMAMLP(nn.Module):
    """Gated feed-forward network: out_proj(SiLU(linear_1(x)) * linear_2(x)).

    Args:
        hidden_dim: Size of the input and output features.
        intermediate_dim: Size of the two parallel up-projections.
    """

    def __init__(self, hidden_dim, intermediate_dim):
        super().__init__()
        # Two parallel hidden -> intermediate projections: the first one is
        # passed through the activation and gates the second one.
        self.linear_1 = nn.Linear(hidden_dim, intermediate_dim)
        self.linear_2 = nn.Linear(hidden_dim, intermediate_dim)
        self.activation_fn = nn.SiLU()
        # Projects the gated product back: intermediate -> hidden.
        self.out_proj = nn.Linear(intermediate_dim, hidden_dim)

    def forward(self, hidden_states):
        """Apply the gated MLP to the last dimension of `hidden_states`."""
        gate = self.activation_fn(self.linear_1(hidden_states))
        up = self.linear_2(hidden_states)
        return self.out_proj(gate * up)

In [None]:
# Smoke-test the MLP on the random activations created for the attention demo.
d_model=config.hidden_size
intermediate_dim = config.intermediate_size
mlp = LLaMAMLP(d_model, intermediate_dim)
# The displayed shape should match the shape of `input_data`.
mlp(input_data).shape

torch.Size([5, 2048, 1024])

In [None]:
count_parameters(mlp)

'BabyLlama size: 6.30M parameters'

In [None]:
class LlamaRMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learnable per-channel scale.

    Args:
        hidden_size: Size of the last dimension of the inputs.
        eps: Constant added to the mean square before taking the reciprocal
            square root.
    """

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalise the last dimension of `hidden_states` and apply the scale."""
        original_dtype = hidden_states.dtype
        # The statistics are computed in float32 regardless of the input dtype.
        x = hidden_states.to(torch.float32)
        mean_square = x.pow(2).mean(-1, keepdim=True)  # (1/n) * sum(x_i^2)
        normalised = x * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalised.to(original_dtype)

class Block(nn.Module):
    """One decoder layer: pre-norm attention and pre-norm MLP, each with a residual add.

    Args:
        config: Model configuration; `hidden_size`, `intermediate_size` and
            `rms_norm_eps` are read here, and the whole object is handed to
            the attention module.
    """

    def __init__(self, config: BabyLlamaConfig):
        super().__init__()
        self.hidden_dim = config.hidden_size
        self.intermediate_dim = config.intermediate_size

        self.attn = BabyLlamaAttention(config)

        self.mlp = LLaMAMLP(self.hidden_dim, self.intermediate_dim)
        self.input_layernorm = LlamaRMSNorm(self.hidden_dim, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(self.hidden_dim, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states,
        mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """Run the layer and return a tensor shaped like `hidden_states`.

        `mask` and `position_ids` are passed through to the attention module.
        """
        # Attention sub-layer: normalise, attend, add back to the residual stream.
        attn_out = self.attn(self.input_layernorm(hidden_states), mask, position_ids)
        after_attn = hidden_states + attn_out
        # MLP sub-layer: same pattern on top of the attention result.
        mlp_out = self.mlp(self.post_attention_layernorm(after_attn))
        return after_attn + mlp_out


In [None]:
# Smoke-test one decoder block on the random activations from the attention demo.
block = Block(config)
# `mask` and `position_ids` are left at their default of None here.
block(input_data).shape




torch.Size([5, 2048, 1024])

In [None]:
count_parameters(block)

'BabyLlama size: 8.66M parameters'

In [None]:
class BabyLlamaModel(nn.Module):
    """Token embedding, a stack of decoder blocks and a final RMSNorm.

    Args:
        config: Model configuration; `hidden_size`, `vocab_size`,
            `num_hidden_layers` and `rms_norm_eps` are read here, and the
            whole object is handed to every `Block`.
    """

    def __init__(self, config):
        super(BabyLlamaModel, self).__init__()
        self.config = config
        self.hidden_dim = hidden_dim = config.hidden_size
        self.vocab_size = vocab_size = config.vocab_size
        assert self.vocab_size > 0
        self.num_hidden_layers = num_hidden_layers = config.num_hidden_layers

        self.embed_tokens = nn.Embedding(vocab_size, hidden_dim)
        self.blocks = nn.ModuleList(
            [Block(config) for _ in range(num_hidden_layers)]
        )
        self.norm = LlamaRMSNorm(hidden_dim, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        """Embed token ids and run them through every block.

        Args:
            hidden_states: Despite the name, these are the integer token ids
                fed to the embedding table; dimension 1 is the sequence axis.
            mask: Passed unchanged to every block.
            position_ids: Positions for RoPE. When None, `0 .. seq_len - 1`
                is used for every sequence.

        Returns:
            The normalised output of the last block.
        """
        x = self.embed_tokens(hidden_states)

        seq_len = hidden_states.size(1)
        if position_ids is None:
            # Follow the input's device rather than `config.device`, which
            # goes stale as soon as the model is moved (e.g. by the Trainer).
            position_ids = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device).unsqueeze(0)

        for b in self.blocks:
            x = b(x, mask, position_ids)

        return self.norm(x)


class BabyLlamaPreTrainedModel(PreTrainedModel):
    """Base class that ties BabyLlama models to `BabyLlamaConfig` and defines weight init."""

    config_class = BabyLlamaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialise `nn.Linear` and `nn.Embedding` weights from N(0, initializer_range).

        Linear biases and the embedding row at `padding_idx` (when set) are
        zeroed. Other module types are left untouched.
        """
        std = self.config.initializer_range
        is_linear = isinstance(module, nn.Linear)
        if not is_linear and not isinstance(module, nn.Embedding):
            return

        module.weight.data.normal_(mean=0.0, std=std)
        if is_linear:
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()


class BabyLlamaForCausalLM(BabyLlamaPreTrainedModel):
    """BabyLlama backbone with a linear language-modelling head."""

    def __init__(self, config):
        super().__init__(config)
        self.model = BabyLlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, self.vocab_size, bias=config.lm_head_bias)
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
    ):
        """Compute next-token logits and, when `labels` is given, the LM loss.

        Args:
            input_ids: Token ids handed to the backbone.
            attention_mask: Forwarded to the backbone as `mask`.
            position_ids: Forwarded to the backbone.
            labels: Target token ids aligned with `input_ids`; they are
                shifted by one position inside this method.

        Returns:
            `CausalLMOutputWithPast` with `loss` (None without labels) and
            float32 `logits`.
        """
        features = self.model(
            hidden_states=input_ids,
            mask=attention_mask,
            position_ids=position_ids,
        )
        logits = self.lm_head(features).float()

        if labels is None:
            loss = None
        else:
            # Position t predicts token t + 1: drop the last logit and the
            # first label, then flatten batch and time together.
            predictions = logits[..., :-1, :].contiguous().view(-1, self.config.vocab_size)
            targets = labels[..., 1:].contiguous().view(-1)
            # Keep both tensors on the same device.
            targets = targets.to(predictions.device)
            criterion = CrossEntropyLoss()
            loss = criterion(predictions, targets)

        return CausalLMOutputWithPast(loss=loss, logits=logits)


In [None]:
# Build the full model with default hyper-parameters on the chosen device.
device="cpu"
config = BabyLlamaConfig(device=device)
llm = BabyLlamaForCausalLM(config).to(config.device)
# Random token ids drawn from [1, vocab_size); `batch_size` and
# `sequence_length` are the globals set in the attention demo cell.
input_ids = torch.randint(1, config.vocab_size, (batch_size, sequence_length), device=config.device)

In [None]:
outputs = llm(input_ids, labels=input_ids)

In [None]:
outputs

CausalLMOutputWithPast(loss=tensor(10.5440, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.5685,  0.4991,  0.0887,  ..., -0.3191, -0.7050,  0.2773],
         [ 0.0144, -0.3414,  1.0155,  ...,  0.1460,  0.8013, -0.5059],
         [-0.0986,  0.8761,  1.0045,  ...,  0.3690,  0.3854, -0.3163],
         ...,
         [-0.1034,  0.1237, -0.2352,  ..., -0.5905,  0.5109,  0.5246],
         [ 0.9357, -0.0770, -0.0780,  ..., -0.4440,  0.4297,  0.0879],
         [ 1.5495,  0.1727, -0.3187,  ...,  0.5463, -0.1798, -0.7975]],

        [[-0.4585, -0.8214,  0.5855,  ..., -0.4665,  0.3416, -0.8313],
         [-0.8473, -0.7340,  0.0093,  ...,  0.0965,  0.2206, -0.3870],
         [-0.7174, -0.8436,  0.1930,  ..., -0.1607,  0.4220,  0.2594],
         ...,
         [-0.1197, -0.1746,  0.0941,  ..., -0.2939,  0.8470, -1.0239],
         [-0.7850,  0.4762,  0.6056,  ...,  0.3802, -0.1474, -0.1400],
         [-1.0205,  0.7370,  0.5013,  ..., -0.4368,  0.2129,  0.5696]],

        [[ 0.8300, -0.0724,  0.3148,

In [None]:
llm

BabyLlamaForCausalLM(
  (model): BabyLlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (blocks): ModuleList(
      (0-5): 6 x Block(
        (attn): BabyLlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): RotaryPositionalEmbeddings()
        )
        (mlp): LLaMAMLP(
          (linear_1): Linear(in_features=1024, out_features=2048, bias=True)
          (linear_2): Linear(in_features=1024, out_features=2048, bias=True)
          (activation_fn): SiLU()
          (out_proj): Linear(in_features=2048, out_features=1024, bias=True)
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear

In [None]:
count_parameters(llm)

'BabyLlama size: 117.48M parameters'

## Dataset

In [None]:
from datasets import load_dataset

# Fix the PyTorch RNG seed before any further work.
torch.manual_seed(64)

# Only the first 1% of the training split is requested.
train_dataset = load_dataset("huggingface-course/codeparrot-ds-train", split="train[:1%]")
# NOTE(review): no `split` is given here, so this is presumably a dict of
# splits rather than a single dataset — confirm before indexing into it.
val_dataset = load_dataset("huggingface-course/codeparrot-ds-valid")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/8.25G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/46.1M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
train_dataset

Dataset({
    features: ['repo_name', 'path', 'copies', 'size', 'content', 'license'],
    num_rows: 121344
})

### Tokenizer

In [None]:
from transformers import AutoTokenizer
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", add_eos_token = True)

In [None]:
tokenizer

LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-7b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
tokenizer.pad_token = tokenizer.unk_token

In [None]:
text = "Hi I'm Prince"
tokenizer(text).tokens()

['<s>', '▁Hi', '▁I', "'", 'm', '▁Prince', '</s>']

In [None]:
# Demo: split one source file into chunks of at most `context_length` tokens.
context_length = 10
tokens = tokenizer(
    # `train_dataset` was loaded with `split=...`, so it is already a single
    # dataset: index rows directly instead of going through a "train" key.
    train_dataset[1]["content"],
    padding=True,
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(tokens['input_ids'])}")
print(f"Input chunk lengths: {(tokens['length'])}")
print(f"Chunk mapping: {tokens['overflow_to_sample_mapping']}")

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="pt")

In [None]:
sample = data_collator.torch_call([tokens])

In [None]:
sample

{'input_ids': tensor([[[    1,  6324,   306, 29915, 29885, 10787,     2]]]), 'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1]]]), 'length': tensor([[7]]), 'overflow_to_sample_mapping': tensor([[0]]), 'labels': tensor([[[    1,  6324,   306, 29915, 29885, 10787,     2]]])}

In [None]:
sample['input_ids'][0].shape

torch.Size([1, 7])

In [None]:
outputs = llm(input_ids=sample['input_ids'][0], labels=sample['labels'][0])

In [None]:
outputs

CausalLMOutputWithPast(loss=tensor(10.7713, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.1392,  0.5465, -0.6703,  ...,  0.3825,  0.2823,  0.4007],
         [ 0.2046,  0.7748, -0.8714,  ...,  1.0657,  0.8742,  1.2220],
         [-0.4293,  1.1300, -0.9396,  ...,  0.9672,  0.6798,  1.4405],
         ...,
         [ 0.0519,  0.5752, -1.1562,  ...,  1.1749,  1.2020,  0.8308],
         [-0.1337,  0.5425, -1.2072,  ...,  1.0181,  1.3545,  0.7549],
         [ 0.0383,  0.6263, -1.1611,  ...,  0.9036,  1.4354,  0.4984]],

        [[ 0.1392,  0.5465, -0.6703,  ...,  0.3825,  0.2823,  0.4007],
         [-0.2063,  0.9668, -0.6615,  ...,  0.5127,  0.8345,  0.8666],
         [-0.2866,  1.1074, -0.4776,  ...,  0.0209,  1.4189,  1.0887],
         ...,
         [-0.7038,  0.1038, -0.6015,  ..., -0.2658,  0.9642,  0.8789],
         [-0.4464, -0.0600, -0.6642,  ..., -0.2598,  0.9721,  0.7510],
         [-0.8228,  0.0829, -1.0053,  ..., -0.2114,  0.6899,  0.7875]],

        [[ 0.1392,  0.5465, -0.6703,

In [None]:
outputs.logits.shape

torch.Size([261, 10, 32000])

In [None]:
torch.argmax(F.softmax(outputs.logits[0], dim=-1), dim=-1)

tensor([17442, 21242, 21242,  5358, 24799, 24799,  5711,  5711,  5711, 30203])

In [None]:
tokenizer.batch_decode(torch.argmax(F.softmax(outputs.logits[:, -1, :], dim=-1), dim=-1))

['m']

In [None]:
def tokenize(item):
    """Tokenize a batch of source files into chunks of exactly the context length.

    Reads the global `tokenizer` and `config`. The texts in `item['content']`
    are tokenized with overflow enabled; only chunks whose reported length
    equals `config.max_position_embeddings` are kept.

    Returns:
        A dict with a single key "input_ids" holding the kept chunks.
    """
    context_length = config.max_position_embeddings
    encoded = tokenizer(
        item['content'],
        padding=True,
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_chunks = [
        chunk_ids
        for chunk_len, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if chunk_len == context_length
    ]
    return {"input_ids": full_chunks}

In [None]:
# Tokenize the whole training set in batches. All original columns are
# removed, so only the "input_ids" returned by `tokenize` remain.
tokenized_train_dataset = train_dataset.map(
    tokenize, batched=True, remove_columns=train_dataset.column_names
)

In [None]:
flush()

## Training

In [None]:
from transformers import TrainingArguments, Trainer

# Training hyper-parameters. The run is bounded by `max_steps`, not by epochs.
args = TrainingArguments(
    "./babyLlama",  # output directory
    per_device_train_batch_size=32,
    max_steps=2000,
    # num_train_epochs=2,
    logging_steps=10,
    gradient_accumulation_steps=2,  # 32 * 2 = 64 sequences per device per optimizer step
    weight_decay=0.1,
    warmup_steps= 1_000,  # half of `max_steps` is spent warming up
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    # save_steps=500,
    fp16=True,
    push_to_hub=False,
)

In [None]:
# Wire model, tokenizer, collator and tokenized data into the HF Trainer.
# No evaluation dataset is passed, so this setup only trains.
trainer = Trainer(
    model=llm,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset
)

In [None]:
trainer.train()















Step,Training Loss
10,9.8073
20,7.8955
30,6.714
40,6.3069
50,6.0644
60,5.5305
70,5.721
80,5.4009
90,5.2793
100,5.2819


[1;30;43mStreaming output truncated to the last 5000 lines.[0m









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































KeyboardInterrupt: 

In [None]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Put the prompt on whatever device the model currently lives on instead of
# assuming CUDA.
model_device = next(llm.parameters()).device
tokens = tokenizer(
    "import numpy",
    return_tensors='pt'
).to(model_device)
input_ids = tokens['input_ids']

temperature = 1
top_k = None
top_p = None

# Generate the tokens one by one. No gradients are needed for sampling, so
# skip building the autograd graph.
with torch.no_grad():
    for _ in range(10):
        # Get the logits of the last position from the model
        outputs = llm(input_ids)
        logits = outputs.logits[:, -1, :]

        # Apply temperature scaling
        logits = logits / temperature

        # Apply top-k or top-p sampling if specified
        if top_k is not None:
            # Keep the logits vocabulary-sized and mask everything below the
            # k-th largest value. Replacing `logits` by the top-k values would
            # make the sampled index a rank in [0, k) instead of a token id.
            k = min(top_k, logits.size(-1))
            kth_value = logits.topk(k, dim=-1)[0][..., -1, None]
            logits = logits.masked_fill(logits < kth_value, -float('inf'))
        elif top_p is not None:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            # Shift right so the first token that crosses the threshold is kept.
            sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
            sorted_indices_to_remove[:, 0] = 0
            # Map the removal flags from sorted order back to vocabulary order.
            indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
            logits = logits.masked_fill(indices_to_remove, -float('inf'))

        # Sample the next token from the logits
        next_token_id = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)

        # Update the input with the new token
        input_ids = torch.cat([input_ids, next_token_id], dim=-1)

# Decode the generated text
generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
































































In [None]:
count_parameters(llm)

'BabyLlama size: 117.48M parameters'

In [None]:
print(generated_text)

import numpy as np
import matplotlib.pyplot as plt


In [None]:
trainer.push_to_hub()

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

events.out.tfevents.1710096969.2320e368804f.711.0:   0%|          | 0.00/6.94k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

events.out.tfevents.1710100129.2320e368804f.17295.0:   0%|          | 0.00/35.5k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/prince-canuma/babyLlama/commit/bc022b8a87000e45238bb57dd9118fe6acf6fbbe', commit_message='End of training', commit_description='', oid='bc022b8a87000e45238bb57dd9118fe6acf6fbbe', pr_url=None, pr_revision=None, pr_num=None)

## Register Config and Model to HF Auto Class

If your model is very similar to a model inside the library, you can re-use the same configuration as this model.