In [8]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [9]:
# opening the downloaded shakespeare file
with open('input.txt', 'r') as f:
  text = f.read()
print(len(text))

1115393


In [10]:
print(text[:1000]) # These are some of the characters

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [11]:
chars = sorted(list(set(text)))
print(''.join(chars), len(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz 65


In [12]:
stoi = {ch:i for i, ch in enumerate(chars)}
itos  = {i:ch for ch,i in stoi.items()}
encode = lambda s : [stoi[ch] for ch in s]
decode = lambda l : [itos[i] for i in l]

In [13]:
# Now encoding the entire dataset in the given format
import torch
data = torch.tensor(encode(text), dtype = torch.long )
print(data[:100])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [14]:
# Splitting into training and validation sets
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[:n]

In [15]:
block_size = 8
train_data[:block_size+1] # We are doing this since we are preducting the next character for all the given indivivdual characters

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [16]:
x_train = train_data[:block_size]
y_train = train_data[1:block_size+1]
for t in range(block_size):
  context = x_train[:t+1]
  target = y_train[t]
  print(f'when input is {context} target is {target}')

when input is tensor([18]) target is 47
when input is tensor([18, 47]) target is 56
when input is tensor([18, 47, 56]) target is 57
when input is tensor([18, 47, 56, 57]) target is 58
when input is tensor([18, 47, 56, 57, 58]) target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) target is 58


In [638]:
batch_size = 4
block_size = 8

def get_batch(split):
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y



xb, yb = get_batch('train')
print(xb)

tensor([[59, 58, 47, 53, 52,  1, 61, 39],
        [52,  1, 51, 63,  1, 59, 52, 54],
        [ 0, 13, 26, 19, 17, 24, 27, 10],
        [43, 56, 57,  6,  1, 43, 56, 43]])


In [736]:
# making a single headed transformer
# Transforming into something understandable in the system and trying to make  a
block_size = 8
head = 8 
n_embd = 16
block_size = 8
class Head(nn.Module):
    def __init__(self,n_embd, head):
        super().__init__()
        self.keys = nn.Linear(n_embd, head, bias = False)
        self.queries = nn.Linear(n_embd, head, bias = False)
        self.values = nn.Linear(n_embd, head, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))


    def forward(self, x):
        B, T, C = x.shape
        K = self.keys(x)
        Q =  self.queries(x)
        V = self.values(x)
        wei = (K @ Q.transpose(-2, -1))*(C**(-0.5)) # B, T, C @ B, C, T  -->  B, T, T # Here C = Head 
        # print(wei)
        wei = wei.masked_fill_(self.tril[:T, :T] == 0, float('-inf')) # masking would mean the nodes are only communicating with nodes in the past and not the future ones 
        wei =  F.softmax(wei, -1)
        # print(wei)

        out = wei @ V # B,T,T
        # out = self.ln(out)
        # print(out.shape)
        
        return out 
    
# also joining a feedforward network in between


In [737]:
class MultiHead(nn.Module):
    def __init__(self, n_heads, head):
        super().__init__()
        self.heads = nn.ModuleList([Head(n_embd, head) for _ in range(n_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        print(out.shape)
        out = self.proj(out)
        return out 

In [738]:
# position embedding and token embedding
# token embedding  --> vocab_size to n_embd
# position_embedding --> 
head = 8
n_embd = 16
block_size = 8
# input  B,T format meh 
# what we want to find out 

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()

        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.sa = nn.Sequential(MultiHead(4, head//4),
                      MultiHead(4, head//4),
                      MultiHead(4, head//4),
                     )

        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.linear = nn.Linear(n_embd, vocab_size)
        self.ffwd1 = nn.Linear(n_embd, n_embd*4) # we use all of this to make sure the model learns from the output of the 
        self.ffwd2 = nn.Linear(n_embd*4, n_embd)
        self.act1 = nn.ReLU()
        self.lm_head = nn.Linear(n_embd, vocab_size)


    def forward(self, idx, targets = None):
        B, T = idx.shape
        token_embedding = self.token_embedding(idx)
        position_embedding  = self.position_embedding(torch.arange(T))
        x =  token_embedding  + position_embedding
        x = x +self.sa(self.ln1(x))  
        x = self.ffwd1(x)
        x = self.act1(x)
        x =x + self.ffwd2(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits 
        else:
            B,T,C  = x.shape
            x = x.view(B*T, -1)
            targets = targets.view(-1)
            loss = F.cross_entropy(x, targets)
            return logits, loss
        
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] # becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

        
       
    
    

In [739]:
m = BigramLanguageModel()

In [None]:
import torch.optim as optim 

optimizer = optim.Adam(m.parameters(), lr = 1e-3)

In [741]:


for i in range(10):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(f'loss for  step {i+1}: {loss}')

torch.Size([4, 8, 8])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x8 and 16x16)