In [None]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.5.5-py3-none-any.whl.metadata (39 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting torchmetrics<3.0,>0.7.0 (from lightning)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.5.5-py3-none-any.whl.metadata (20 kB)
Downloading lightning-2.5.5-py3-none-any.whl (828 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m828.5/828.5 kB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytorch_lightning-2.5.5-py3-none-any.whl (832 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
# import torch.functional as F # Removing incorrect import
from torch.optim import Adam
import lightning
from torch.utils.data import TensorDataset, DataLoader
from datasets import load_dataset

In [None]:
class PositionEncoding(nn.Module):
    """Fixed sinusoidal position encoding that is added to token embeddings.

    A table ``pe`` of shape ``(max_len, d_model)`` is precomputed once and
    registered as a buffer (so it follows the module across devices but is
    not trained). Even columns hold ``sin`` values and odd columns hold
    ``cos`` values of ``position / 10000 ** (column_pair_index / d_model)``.

    Args:
        d_model: Size of each embedding vector. May be odd.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len):

        super().__init__()

        pe = torch.zeros(max_len, d_model)

        position = torch.arange(start=0, end=max_len, step=1).float().unsqueeze(1)
        embedding_index = torch.arange(start=0, end=d_model, step=2).float()

        div_term = 1/torch.tensor(10000.0)**(embedding_index / d_model)
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)  ## even columns (0, 2, 4, ...) hold sin() values
        # When d_model is odd there is one fewer odd column than even column,
        # so drop the last frequency to keep the shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  ## odd columns hold cos() values

        self.register_buffer('pe', pe)

    def forward(self, word_embeddings):
        """Return ``word_embeddings`` with position encodings added.

        Args:
            word_embeddings: Tensor of shape ``(seq_len, d_model)`` or
                ``(..., seq_len, d_model)``; leading dimensions are broadcast.

        Raises:
            ValueError: If ``seq_len`` exceeds the precomputed ``max_len``.
        """
        # The sequence axis is second-to-last, which is axis 0 for the
        # unbatched (seq_len, d_model) case the module originally supported.
        seq_len = word_embeddings.size(-2)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )

        return word_embeddings + self.pe[:seq_len, :]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F # Importing correct functional module

class Attention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    Args:
        d_model: Size of the input vectors and of each projection.
    """

    def __init__(self, d_model):
        super().__init__()

        # Creation order (q, v, k) is kept so that seeded weight
        # initialisation produces the same parameters as before.
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        """Compute attention.

        Args:
            q, k, v: Tensors of shape ``(..., seq_len, d_model)``.
            mask: Optional tensor broadcastable to the ``(..., seq_q, seq_k)``
                score matrix. Positions where ``mask == 0`` are not attended to.

        Returns:
            A tuple ``(output, weights)``: the attention-weighted values and
            the softmax-normalised attention weights.
        """
        # Intermediates are kept in locals rather than on ``self`` so the
        # module does not hold on to the previous batch's activations (and
        # their autograd graph) between calls.
        queries = self.q_linear(q)
        keys = self.k_linear(k)
        values = self.v_linear(v)

        scores = torch.matmul(queries, keys.transpose(-2, -1))
        scores = scores / (keys.size(-1) ** 0.5)

        if mask is not None:
            # Use the dtype's own minimum instead of a hard-coded -1e9, which
            # is not representable in float16. A fully masked row still
            # softmaxes to a uniform distribution rather than NaN.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        weights = self.softmax(scores)
        output = torch.matmul(weights, values)

        return output, weights

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F # Importing correct functional module
from torch.optim import Adam
import lightning
from torch.utils.data import TensorDataset, DataLoader
from datasets import load_dataset

class DecoderOnlyTransformer(lightning.LightningModule):
    """Single-block decoder-only language model (embedding -> position
    encoding -> causal self-attention with residual -> linear head).

    Args:
        max_len: Used as the number of embedding rows, the number of output
            logits, and the number of precomputed positions. In practice
            this is the vocabulary size.
        d_model: Embedding / hidden size.
        lr: Learning rate handed to Adam. Defaults to the previously
            hard-coded 0.1.
            NOTE(review): 0.1 is far above Adam's library default of 1e-3;
            confirm it is intended for this dataset.
    """

    def __init__(self, max_len, d_model, lr=0.1):
        super().__init__()

        self.lr = lr

        self.embedding = nn.Embedding(max_len, d_model)
        self.postional_encoded_embeddings = PositionEncoding(d_model, max_len)
        self.attention_part = Attention(d_model)

        self.fc_layer = nn.Linear(in_features=d_model, out_features=max_len)

        self.loss = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return next-token logits.

        Args:
            x: Token ids of shape ``(seq_len,)`` or ``(batch, seq_len)``.

        Returns:
            Logits of shape ``x.shape + (max_len,)``.
        """
        seq_len = x.size(-1)
        word_embeddings = self.embedding(x)

        # Ask the position module for the bare (seq_len, d_model) table by
        # feeding it zeros, then add it here. The addition broadcasts over an
        # optional leading batch dimension, so this works for both input ranks.
        positions = self.postional_encoded_embeddings(
            word_embeddings.new_zeros(seq_len, word_embeddings.size(-1))
        )
        hidden = word_embeddings + positions

        # Lower-triangular mask: position i may only attend to positions <= i.
        mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device))

        attended, _ = self.attention_part(hidden, hidden, hidden, mask)

        # Residual connection around attention, then project to logits.
        return self.fc_layer(attended + hidden)

    def configure_optimizers(self):
        return Adam(self.parameters(), lr=self.lr)

    def _compute_loss(self, batch):
        """Cross-entropy over every position of every sequence in ``batch``."""
        input_tokens, labels = batch
        logits = self(input_tokens)
        # CrossEntropyLoss wants (N, classes) and (N,), so merge all leading
        # dimensions into one.
        return self.loss(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))

    def training_step(self, batch, batch_idx):
        ## training_step() is called by the Lightning trainer for every
        ## training batch; the returned loss is what gets back-propagated.
        loss = self._compute_loss(batch)
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        ## Called by the Lightning trainer for every validation batch.
        loss = self._compute_loss(batch)
        self.log("val_loss", loss, prog_bar=True)
        return loss

In [None]:
# Download (or reuse the local cache of) the TinyStories corpus from the Hub.
ds = load_dataset(path="roneneldan/TinyStories")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00004-2d5a1467fff108(…):   0%|          | 0.00/249M [00:00<?, ?B/s]

data/train-00001-of-00004-5852b56a2bd28f(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00002-of-00004-a26307300439e9(…):   0%|          | 0.00/246M [00:00<?, ?B/s]

data/train-00003-of-00004-d243063613e5a0(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/validation-00000-of-00001-869c898b5(…):   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Reuse the pretrained GPT-2 tokenizer (training a custom one is an alternative).
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# No pad token is configured for this tokenizer, so reuse the end-of-sequence
# token for padding. Pad and EOS therefore share the same id downstream.
tokenizer.pad_token = tokenizer.eos_token
# Later passed to the model constructor to size the embedding and output layers.
vocab_size = len(tokenizer)


In [None]:
max_len = 128  # choose based on GPU memory

# Target value that nn.CrossEntropyLoss skips by default (its ignore_index).
IGNORE_INDEX = -100


def _next_token_labels(tokens, attention_mask):
    """Build next-token targets for one padded sequence.

    Position ``i`` is supervised with ``tokens[i + 1]`` for every real input
    token; the last real token is supervised with the first pad token (which
    is the EOS id) when one exists. Everything else, i.e. padded inputs and
    the final slot of a sequence that fills the whole window, is set to
    ``IGNORE_INDEX`` so it does not contribute to the loss.

    Assumes right-side padding, so the real tokens are the first
    ``sum(attention_mask)`` entries. TODO confirm if padding_side is changed.
    """
    n_real = sum(attention_mask)
    # Always keep at least one supervised slot so an empty text cannot yield
    # an all-ignored sequence (whose mean loss would be NaN).
    n_supervised = min(max(n_real, 1), len(tokens) - 1)

    labels = [IGNORE_INDEX] * len(tokens)
    labels[:n_supervised] = tokens[1:n_supervised + 1]
    return labels


def preprocess(example):
    """Tokenize text and derive shifted next-token labels.

    Accepts either a single example (``example["text"]`` is a ``str``) or a
    batch as produced by ``Dataset.map(..., batched=True)``
    (``example["text"]`` is a list of strings).

    Returns:
        A dict with ``"input_tokens"`` and ``"labels"``, each a list of
        ``max_len`` ids (or a list of such lists for a batch).
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    if isinstance(example["text"], str):
        labels = _next_token_labels(input_ids, attention_mask)
    else:
        labels = [
            _next_token_labels(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]

    return {
        "input_tokens": input_ids,
        "labels": labels
    }

# batched=True hands the tokenizer many texts per call instead of one.
tokenized_ds = ds.map(
    preprocess,
    batched=True,
    remove_columns=ds["train"].column_names,
)


Map:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Map:   0%|          | 0/21990 [00:00<?, ? examples/s]

In [None]:
import torch
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Stack a list of example dicts into ``(input_tokens, labels)`` tensors.

    Each item must provide equal-length ``"input_tokens"`` and ``"labels"``
    sequences; the result is a pair of tensors with one row per item.
    """
    token_rows = []
    label_rows = []
    for item in batch:
        token_rows.append(item["input_tokens"])
        label_rows.append(item["labels"])

    return torch.tensor(token_rows), torch.tensor(label_rows)

# Only the training split is shuffled; both splits share the batch size
# and the collate function.
train_loader = DataLoader(
    tokenized_ds["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    tokenized_ds["validation"],
    batch_size=32,
    collate_fn=collate_fn,
)



In [None]:
# The model's ``max_len`` argument sizes the embedding table and the output
# layer, so it receives the tokenizer's vocabulary size here.
model = DecoderOnlyTransformer(d_model=256, max_len=vocab_size)
trainer = lightning.Trainer(max_epochs=3)
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/lightning/pytorch/trainer/configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. S

Training: |          | 0/? [00:00<?, ?it/s]