In [2]:
!pip install lightning

Collecting lightning
  Downloading lightning-2.1.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.1.3-py3-none-any.whl (777 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m777.7/777.7 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightning-utilities, torchmetrics, pytorch-lightning, lightning
Successfully installed lightning-2.1.3 lightning-utilities-0.10.0 pytorch-lightning-2.1.3 torchmetrics-1.2.1


In [3]:
import random
import math
import numpy as np
import torch
import lightning


In [35]:

#Implements basic class for manipulation of data
class BaseDataModule(lightning.LightningModule):
  def __init__(self, batch_size=32, split=0.8, *args, **kwargs):
    super().__init__()
    self.ds_X, self.ds_Y = self.get_dataset(*args, **kwargs)
    shuffler = np.random.permutation(self.ds_X.shape[0])
    self.ds_X = self.ds_X[shuffler]
    self.ds_Y = self.ds_Y[shuffler]
    self.split = int(self.ds_X.shape[0]*split)
    self.batch_size = batch_size

  def train_dataloader(self):
    ds_X_train, ds_Y_train = self.ds_X[0:self.split], self.ds_Y[0:self.split]
    return torch.utils.data.DataLoader(list(zip(ds_X_train, ds_Y_train)), batch_size=self.batch_size)

  def val_dataloader(self):
    ds_X_test, ds_Y_test = self.ds_X[self.split:], self.ds_Y[self.split:]
    return torch.utils.data.DataLoader(list(zip(ds_X_test, ds_Y_test)), batch_size=self.batch_size)



class AdditionDataModule(BaseDataModule):
   def get_dataset(self):
    ret = []
    for i in range(100):
      for j in range(100):
        s = i+j
        ret.append([i//10, i%10, j//10, j%10, s//100, (s//10)%10, s%10])
    ds = np.array(ret)
    return ds[:, 0:6], np.copy(ds[:, 1:])

In [36]:
def attention(queries, keys, values):
  d = queries.shape[-1]
  scores = torch.matmul(queries, keys.transpose(-2,-1))/math.sqrt(d)
  attention_weights = torch.functional.softmax(scores, dim=-1)
  return torch.matmul(attention_weights, values)

class MultiHeadAttention(torch.nn.Module):
    def __init__(self, embed_dim, num_heads):
      super(MultiHeadAttention, self).__init__()
      self.embed_dim, self.num_heads = embed_dim, num_heads
      assert embed_dim % num_heads == 0
      self.projection_dim = embed_dim // num_heads

      self.W_q = torch.nn.Linear(embed_dim, embed_dim)
      self.W_k = torch.nn.Linear(embed_dim, embed_dim)
      self.W_v = torch.nn.Linear(embed_dim, embed_dim)
      self.W_o = torch.nn.Linear(embed_dim, embed_dim)

    def transpose(self, x):
      x = x.reshape(x.shape[0], x.shape[1], self.num_heads, self.projection_dim)
      return x.permute(0, 2, 1, 3)

    def transpose_output(self, x):
      x = x.permute(0, 2, 1, 3)
      return x.reshape(x.shape[0], x.shape[1], self.embed_dim)

    def forward(self, q, k, v):
      q = self.transpose(self.W_q(q))
      k = self.transpose(self.W_k(k))
      v = self.transpose(self.W_v(v))
      output = attention(q, k, v)
      return self.W_o(self.transpose_output(output))

class TransformerBlock(torch.nn.Module):
  def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super(TransformerBlock, self).__init__()
    self.att = MultiHeadAttention(embed_dim, num_heads)
    self.ffn = torch.nn.Sequential(
      torch.nn.Linear(embed_dim, ff_dim), torch.nn.ReLU(), torch.nn.Linear(ff_dim, embed_dim)
    )
    self.layernorm1 = torch.nn.LayerNorm(embed_dim)
    self.layernorm2 = torch.nn.LayerNorm(embed_dim)
    self.dropout = torch.nn.Dropout(rate)

  def forward(self, x):
    x = self.layernorm1(x + self.dropout(self.att(x, x, x)))
    x = self.layernorm2(x + self.dropout(self.ffn(x)))
    return x

class TokenAndPositionEmbedding(torch.nn.Module):
  def __init__(self, maxlen, vocab_size, embed_dim):
    super(TokenAndPositionEmbedding, self).__init__()
    self.token_emb = torch.nn.Embedding(vocab_size, embed_dim)
    self.pos_emb = torch.nn.Embedding(maxlen, embed_dim)
  def forward(self, x):
    pos = torch.arange(0, x.size(1), dtype=torch.int32, device=x.device)
    return self.token_emb(x) + self.pos_emb(pos).view(1, x.size(1), -1)


In [37]:


class LittleTransformer(lightning.LightningModule):
  def __init__(self, seq_len=6, max_value=10, layer_count=2, embed_dim=128, num_heads=4, ff_dim=32):
    super().__init__()
    self.max_value = max_value
    self.model = torch.nn.Sequential(
      TokenAndPositionEmbedding(seq_len, max_value, embed_dim),
      *[TransformerBlock(embed_dim, num_heads, ff_dim) for x in range(layer_count)],
      torch.nn.Linear(embed_dim, max_value),
      torch.nn.LogSoftmax(dim=-1))

  def forward(self, x):
    return self.model(x)

  def training_step(self, batch, batch_idx):
    x, y = batch
    output = self.model(x)
    loss = torch.nn.functional.nll_loss(output.view(-1, self.max_value), y.view(-1))
    self.log("train_loss", loss)
    return loss

  def validation_step(self, val_batch, batch_idx):
    x, y = val_batch
    pred = self.model(x).argmax(dim=2)
    val_accuracy = (pred == y).type(torch.float).mean()
    self.log("val_accuracy", val_accuracy, prog_bar=True)

  def configure_optimizers(self):
      return torch.optim.Adam(self.parameters(), lr=3e-4)


In [38]:
models = LittleTransformer(seq_len=6)
trainer = lightning.Trainer(enable_progress_bar=True, max_epochs=5, accelerator="cuda")
data = AdditionDataModule(batch_size=64)
#print(data)
trainer.fit(models , data)
#trainer.fit(model=autoencoder, trai_dataloaders=train_loader)

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name  | Type       | Params
-------------------------------------
0 | model | Sequential | 153 K 
-------------------------------------
153 K     Trainable params
0         Non-trainable params
153 K     Total params
0.613     Total estimated model params size (MB)
INFO:lightning.pytorch.callbacks.model_summary:
  | Name  | Type       | Params
------------

RuntimeError: ignored