In [None]:
#

In [3]:
!pip install lightning


Collecting lightning
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.11.8-py3-none-any.whl.metadata (5.2 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning)
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Downloading lightning-2.4.0-py3-none-any.whl (810 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m811.0/811.0 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.8-py3-none-any.whl (26 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader

import lightning as L

In [5]:
# Vocabulary: token -> id, numbered in the order the tokens are listed.
ttid = {
    token: ix
    for ix, token in enumerate(['what', 'is', 'statquest', 'awesome', '<E>'])
}

# Inverse lookup: id -> token, used to decode model predictions.
idtt = {ix: token for token, ix in ttid.items()}
idtt

{0: 'what', 1: 'is', 2: 'statquest', 3: 'awesome', 4: '<E>'}

In [6]:
# Training inputs: one row of token ids per sentence (prompt, <E>, then the answer).
prompt = torch.tensor([
    [ttid[word] for word in sentence]
    for sentence in (
        ('what', 'is', 'statquest', '<E>', 'awesome'),
        ('statquest', 'is', 'what', '<E>', 'awesome'),
    )
])

prompt.shape

torch.Size([2, 5])

In [7]:
# Training targets: each row is the matching prompt row shifted left by one token,
# so position i holds the token the model should generate after prompt token i
# (e.g. after 'what' it must produce 'is'); the final target is <E>.
lables = torch.tensor([
    [ttid[word] for word in sentence]
    for sentence in (
        ('is', 'statquest', '<E>', 'awesome', '<E>'),
        ('is', 'what', '<E>', 'awesome', '<E>'),
    )
])

lables.shape

torch.Size([2, 5])

In [8]:
# Pair each prompt row with its label row, so indexing the dataset
# returns an (input token ids, target token ids) tuple.
dataset = TensorDataset(prompt, lables)
dataset[0]

(tensor([0, 1, 2, 4, 3]), tensor([1, 2, 4, 3, 4]))

In [9]:
# One sequence per batch, in dataset order (these are DataLoader's defaults, spelled out
# because the training step reads the sequence at index 0 of every batch).
dataloadr = DataLoader(dataset, batch_size=1, shuffle=False)

# torch.unsqueeze():

   + Turns a **1-D sequence of numbers** into a **column matrix** by inserting a new axis of size 1 (here `unsqueeze(1)` gives shape `(maxLen, 1)`)

* ex:
   + if maxLen=3, then we'll get this column matrix
  

    tensor([[0.],
            [1.],
            [2.]])

In [10]:
class PositionEnc(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence of token embeddings.

    The encoding table is computed once in the constructor and stored as a buffer,
    so it is not trained but moves with the module across devices.

    Args:
        dimModel: number of values in each token embedding (the model dimension).
        maxLen: number of positions the table is built for, i.e. the longest
            sequence `forward` can encode.
    """

    def __init__(self, dimModel=2, maxLen=6):
        super().__init__()

        # One row per position, one column per embedding dimension.
        pe = torch.zeros(maxLen, dimModel)
        print('pe shape ',pe.shape)

        # Positions 0..maxLen-1 as a column so they broadcast against divTerm.
        position = torch.arange(start=0, end=maxLen, step=1).float().unsqueeze(1)
        print('pos shape ',position.shape)

        # One frequency per sine column: 1 / 10000^(2i / dimModel).
        embIx = torch.arange(start=0, end=dimModel, step=2).float()
        divTerm = 1/torch.tensor(10000.0)**(embIx/dimModel)

        # Even columns hold sines, odd columns hold cosines.
        pe[:, 0::2] = torch.sin(position * divTerm)
        # With an odd dimModel there is one fewer cosine column than sine columns,
        # so only the first dimModel // 2 frequencies are used for the cosines.
        pe[:, 1::2] = torch.cos(position * divTerm[: dimModel // 2])

        # Registered as a buffer so pe follows the module to the GPU if one is used.
        self.register_buffer('pe', pe)

    def forward(self, wordEmb):
        """Return `wordEmb` plus the encodings of its first `wordEmb.size(0)` positions.

        Args:
            wordEmb: embeddings whose first axis is the token position and whose
                last axis has `dimModel` values.
        """
        return wordEmb + self.pe[:wordEmb.size(0), :]

# torch.matmul:
  * multiplies Q by the transpose of K, giving one similarity score for every (query, key) pair

In [11]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with learned Q/K/V projections.

    Works on inputs whose last two axes are (tokens, dimModel); any leading
    batch axes are carried through unchanged.

    Args:
        dimModel: number of values in each token encoding; also the size of the
            query, key and value projections.
    """

    def __init__(self, dimModel=2):
        super().__init__()
        self.weightQury = nn.Linear(dimModel, dimModel, bias=False)
        self.weightKey = nn.Linear(dimModel, dimModel, bias=False)
        self.weightValue = nn.Linear(dimModel, dimModel, bias=False)

        # Address the token (row) and feature (column) axes from the end, so the
        # same code handles both a single (tokens, dim) sequence and batched input.
        self.rowDim = -2
        self.columnDim = -1

    def forward(self, encForQ, encForK, encForV, mask=None):
        """Compute attention values for the given encodings.

        Args:
            encForQ: encodings used to build the queries.
            encForK: encodings used to build the keys.
            encForV: encodings used to build the values.
            mask: optional boolean tensor broadcastable to the similarity matrix;
                positions that are True are excluded from attention.

        Returns:
            The attention-weighted sum of the values, one row per query.
        """
        Q = self.weightQury(encForQ)
        K = self.weightKey(encForK)
        V = self.weightValue(encForV)

        # Similarity of every query with every key.
        sims = torch.matmul(Q, K.transpose(dim0=self.rowDim, dim1=self.columnDim))

        # Scale by sqrt(key dimension) to keep the softmax inputs in a stable range.
        scaledSims = sims / torch.tensor(K.size(self.columnDim)**0.5)

        if mask is not None:
            # A large negative score becomes ~0 after the softmax.
            scaledSims = scaledSims.masked_fill(mask=mask, value=-1e9)

        attenPrecents = F.softmax(scaledSims, dim=self.columnDim)
        return torch.matmul(attenPrecents, V)


In [17]:
# Kept for any cell that still refers to it; the model itself no longer depends on
# this global and instead follows the device of the tensors it is given.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
class DecoderOnlyTransformer(L.LightningModule):
    """Minimal decoder-only transformer: embedding, position encoding, one masked
    self-attention layer with a residual connection, and a linear output layer.

    Args:
        numToken: vocabulary size (number of embedding rows and output logits).
        dimModel: number of values in each token embedding.
        maxLen: longest sequence the position encoding supports.
    """

    def __init__(self, numToken=4, dimModel=2, maxLen=6):
        super().__init__()
        # Fixed seed so the randomly initialised weights are reproducible.
        L.seed_everything(seed=42)

        self.wordEmb = nn.Embedding(numToken, dimModel)

        self.positionEnc = PositionEnc(dimModel, maxLen)

        self.atten = Attention(dimModel)

        self.fulyConctLayr = nn.Linear(dimModel, numToken)

        self.loss = nn.CrossEntropyLoss()

    def forward(self, tokenIds):
        """Return next-token logits for every position of one sequence.

        Args:
            tokenIds: 1-D tensor of token ids.

        Returns:
            Tensor with one row of `numToken` logits per input token.
        """
        wordEmb = self.wordEmb(tokenIds)
        positionEnc = self.positionEnc(wordEmb)

        # Causal mask: True above the diagonal, i.e. for tokens that come later.
        # Built on the input's device so it always matches the attention scores,
        # whether or not CUDA happens to be available on the machine.
        seqLen = tokenIds.size(dim=0)
        mask = torch.tril(torch.ones((seqLen, seqLen), device=tokenIds.device))
        mask = mask == 0

        selfAttenVal = self.atten(
            positionEnc, # for Query
            positionEnc, # for Key
            positionEnc, # for Value
            mask=mask    # earlier tokens must not look ahead at later tokens
        )

        residualConntection = positionEnc + selfAttenVal

        fullyConnctLayrOutput = self.fulyConctLayr(residualConntection)

        return fullyConnctLayrOutput

    def configure_optimizers(self):
        """Use Adam with a learning rate of 0.1 over all parameters."""
        return Adam(self.parameters(), lr=0.1)

    def training_step(self, batch, batchIx):
        """Return the cross-entropy loss averaged over the sequences in `batch`.

        Args:
            batch: `(inputToken, label)` pair whose first axis indexes sequences.
            batchIx: index of the batch (unused).
        """
        inputToken, label = batch
        # forward() handles one sequence at a time, so score each sequence in the
        # batch separately instead of silently using only the first one.
        losses = [
            self.loss(self.forward(tokens), target)
            for tokens, target in zip(inputToken, label)
        ]
        return torch.stack(losses).mean()




In [18]:
model = DecoderOnlyTransformer(numToken=len(ttid), dimModel=2, maxLen=6)

# Prompt: "what is statquest <E>"; the model should continue from here.
modelInput = torch.tensor([
    ttid['what'],
    ttid['is'],
    ttid['statquest'],
    ttid['<E>']
])

inputLen = modelInput.size(dim=0)
maxLen = 6  # must not exceed the maxLen the model was built with

# Generation needs no gradients, so skip autograd bookkeeping.
with torch.no_grad():
    predict = model(modelInput)
    # The last row holds the logits for the token generated after <E>.
    predictId = torch.argmax(predict[-1, :]).unsqueeze(0)
    predictIds = predictId

    # Greedy decoding: append the prediction and ask again until <E> or maxLen.
    for step in range(inputLen, maxLen):
        if (predictId == ttid['<E>']):
            break

        modelInput = torch.cat((modelInput, predictId))

        predicts = model(modelInput)
        predictId = torch.argmax(predicts[-1, :]).unsqueeze(0)
        predictIds = torch.cat((predictIds, predictId))

print('Predicted Tokens\n')
for tokenId in predictIds:
    print('\t, predict Token:', idtt[tokenId.item()])

INFO:lightning_fabric.utilities.seed:Seed set to 42


pe shape  torch.Size([6, 2])
pos shape  torch.Size([6, 1])
Predicted Tokens

	, predict Token: <E>


In [19]:
# Train for 30 passes over the two example sequences.
trainr = L.Trainer(max_epochs=30)
trainr.fit(model=model, train_dataloaders=dataloadr)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type             | Params | Mode 
-----------------------------------------------------------
0 | wordEmb       | Embedding        | 10     | train
1 | positionEnc   | PositionEnc      | 0      | train
2 | atten         | Attention        | 12     | train
3 | fulyConctLayr | Linear           | 15     | train
4 | loss          | CrossEntropyLoss | 0      | train
-----------------------------------------------------------
37        Trainable params
0         Non-trainable params
37        Total params
0.000     Total estimated model params size (MB)
8         Modules in train mode
0         Modules in eval mode
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=30` reached.


In [20]:

# Same prompt as before training: "what is statquest <E>".
modelInput = torch.tensor([
    ttid['what'],
    ttid['is'],
    ttid['statquest'],
    ttid['<E>']
])

inputLen = modelInput.size(dim=0)
maxLen = 6  # must not exceed the maxLen the model was built with

# Generation needs no gradients, so skip autograd bookkeeping.
with torch.no_grad():
    predict = model(modelInput)
    # The last row holds the logits for the token generated after <E>.
    predictId = torch.argmax(predict[-1, :]).unsqueeze(0)
    predictIds = predictId

    # Greedy decoding: append the prediction and ask again until <E> or maxLen.
    for step in range(inputLen, maxLen):
        if (predictId == ttid['<E>']):
            break

        modelInput = torch.cat((modelInput, predictId))

        predicts = model(modelInput)
        predictId = torch.argmax(predicts[-1, :]).unsqueeze(0)
        predictIds = torch.cat((predictIds, predictId))

print('Predicted Tokens\n')
for tokenId in predictIds:
    print('\t, predict Token:', idtt[tokenId.item()])

Predicted Tokens

	, predict Token: awesome
	, predict Token: <E>
