In [9]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import time

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

block_size = 8
batch_size = 4
max_i = 10000
# eval_interval = 2500
learning_rate = 3e-1
eval_iters = 250

cuda


In [10]:
with open('Beren_and_Luthien.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
vocabulary_size = len(chars)

['\n', ' ', '!', '(', ')', '*', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'á', 'ë', 'ó', 'ú', '—', '‘', '’', '“', '”']


In [11]:
string_to_int = { ch:i for i, ch in enumerate(chars) }
int_to_string = { i:ch for i, ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([31, 60, 52,  1, 40, 45, 46, 49, 41, 55, 42, 51,  1, 45, 38, 41,  1, 31,
        46, 51, 60, 42, 49, 46, 51, 57,  1, 57, 45, 42, 51,  6,  1, 15, 38, 46,
        55, 52, 51,  1, 38, 51, 41,  1, 31, 46, 51, 67, 59, 46, 42, 49,  6,  1,
        38, 51, 41,  1, 31, 46, 51, 67, 59, 46, 42, 49,  1, 60, 38, 56,  1, 38,
         0, 50, 38, 46, 41, 42, 51,  6,  1, 38, 51, 41,  1, 57, 45, 42,  1, 50,
        52, 56, 57,  1, 39, 42, 38, 58, 57, 46])


In [12]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[:n]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # print(ix)
    x = torch.stack([data [i:i+block_size] for i in ix])
    y = torch.stack([data [i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs: ')
print(x)
print('targets: ')
print(y)

inputs: 
tensor([[19, 58, 38, 51,  1, 56, 38, 46],
        [45, 42,  1, 45, 42, 60, 46, 51],
        [68, 69, 38, 51, 41, 70,  6,  1],
        [ 1, 38, 51, 41,  1, 38, 56,  1]], device='cuda:0')
targets: 
tensor([[58, 38, 51,  1, 56, 38, 46, 41],
        [42,  1, 45, 42, 60, 46, 51, 44],
        [69, 38, 51, 41, 70,  6,  1, 56],
        [38, 51, 41,  1, 38, 56,  1, 57]], device='cuda:0')


In [13]:
x = train_data[:block_size]
y = train_data[1:block_size+1]


for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('When input is', context,'target is', target)

When input is tensor([31]) target is tensor(60)
When input is tensor([31, 60]) target is tensor(52)
When input is tensor([31, 60, 52]) target is tensor(1)
When input is tensor([31, 60, 52,  1]) target is tensor(40)
When input is tensor([31, 60, 52,  1, 40]) target is tensor(45)
When input is tensor([31, 60, 52,  1, 40, 45]) target is tensor(46)
When input is tensor([31, 60, 52,  1, 40, 45, 46]) target is tensor(49)
When input is tensor([31, 60, 52,  1, 40, 45, 46, 49]) target is tensor(41)


In [14]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [15]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocabulary_size):
        super().__init__()
         # Initialize an embedding table with self-loop for each token
        self.token_embedding_table = nn.Embedding(vocabulary_size, vocabulary_size)
    
    def forward(self, index, targets=None):
        # Compute logits using the token embedding table
        logits = self.token_embedding_table(index)  # Why here? Token embedding is computed here
        
        if targets is None:
            # If no targets provided, return None for loss
            loss = None
        else:
            # If targets are provided, calculate cross-entropy loss
            N, T, C = logits.shape
            logits = logits.view(N*T, C)
            targets = targets.view(N*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # Generate new tokens using the trained model
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=1)
            # Sample the next token based on the computed probabilities
            index_next = torch.multinomial(probs, num_samples=1)
            # Concatenate the new token to the input sequence
            index = torch.cat((index, index_next), dim=1)
        return index
# Create an instance of the BigramLanguageModel and move it to the specified device
model = BigramLanguageModel(vocabulary_size)
m = model.to(device)

# Initialize a context tensor with zeros for generating new tokens
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Generate a sequence of new tokens and decode them
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist()) 
print(generated_chars)



BsGNSi‘AK,EYVAMn
[
)n”vYpóLNg!hjkCiuvibs[pJV’tá[*MrKwKvkCóE[ QsnMqGdCsáGbY SEqW
KHEmKw-;‘áFhNLF(]Q’)qYVIa‘“DáSADR[avEVtótóGHUYoeHpAb-axs—rDkCEqmwxFjN-duújóyRáFuyBJKgd?geGhju[].—N:U‘xHSWri-H*”vk?:DCqfgëyhEqoO;owachE[[UIzwK
xSd]fQI:hbneëcA*VK,á*nkQ;
R(SëboTr”wI-d—nlqek(xo”K“ r”wK,fLqfhút;PV,zówndLTuTtnRótGPK?aU?LVVH‘d[YgFúNmvkiJëtS’]wtGPRáDTfW
y!MrKhlK-YQe*ol—WWj)OtmHáySDyxB—[z— jKIG;-EV]GRU?o,ár]pvM*-E‘*ëDE””,akLHUCPEn”Nh
Im[uQ.adH—cOSY-O.aEpesgNymá[QjMm rlQ)trKn*zwDT.;-W:)GYyAU*dJV U*áTAMnëq“ pd


**Where:**
- _C_ = Number of Classes
- _N_ = Batch Size
- _T_ = Time

In [16]:
optimiser = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for i in range (max_i):
    if i % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {i}, train loss {losses['train']:.4f} val loss {losses['val']:.4f}")
        
    xb, yb = get_batch('train')
    
    logits, loss = model.forward(xb, yb)
    optimiser.zero_grad(set_to_none=True)
    loss.backward()
    optimiser.step()
    
print(loss.item())

step: 0, train loss 4.7543 val loss 4.7290
step: 250, train loss 2.5491 val loss 2.5910
step: 500, train loss 2.5582 val loss 2.5428
step: 750, train loss 2.5186 val loss 2.4795
step: 1000, train loss 2.5460 val loss 2.5477
step: 1250, train loss 2.4865 val loss 2.5407
step: 1500, train loss 2.5188 val loss 2.5043
step: 1750, train loss 2.4986 val loss 2.5092
step: 2000, train loss 2.5507 val loss 2.5121
step: 2250, train loss 2.6014 val loss 2.5735
step: 2500, train loss 2.6062 val loss 2.6054
step: 2750, train loss 2.5235 val loss 2.5383
step: 3000, train loss 2.5153 val loss 2.5138
step: 3250, train loss 2.5774 val loss 2.5655
step: 3500, train loss 2.5466 val loss 2.5588
step: 3750, train loss 2.5838 val loss 2.5482
step: 4000, train loss 2.5416 val loss 2.5318
step: 4250, train loss 2.4818 val loss 2.5237
step: 4500, train loss 2.5466 val loss 2.5579
step: 4750, train loss 2.5527 val loss 2.4924
step: 5000, train loss 2.5605 val loss 2.5529
step: 5250, train loss 2.5495 val loss 2

In [17]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


Han f O Ellmo is thicasheve yow have in jopew thad an hof Nimoshe mane he oof Dan, thofofoshind f thand’ hirid Handofas trnd golr nangaty t And hemoned tee aratts os hournd grnd jow ad and Me, lk g r Me gine tr st relknúverindewzither ye ssho hayof Tin
Pracara
bedilknd ins oterw raft howoyof hin wheranghosof g lyow Meleathe d of ‘Had ad an newndinor anof t jodid ow ser were dosin hetyeesit hin ninss t hillof ‘Haberr wigroste t
an jond; ow o hins hin Kasin avindand haf
he Irathise, hopur f feelkn


In [18]:
x = torch.tensor([-1], dtype=torch.float32)
print(F.tanh(x))

tensor([-0.7616])
