In [2]:
import pandas as pd
from pathlib import Path

In [3]:
# The corpus is expected next to the notebook (current working directory).
file = Path.cwd() / 'human_chat.txt'
with open(file, mode='r', encoding='utf-8') as handle:
    text = handle.read()
# Preview the first 1000 characters to sanity-check the load.
print(text[:1000])

Human 1: Hi!
Human 2: What is your favorite holiday?
Human 1: one where I get to meet lots of different people.
Human 2: What was the most number of people you have ever met during a holiday?
Human 1: Hard to keep a count. Maybe 25.
Human 2: Which holiday was that?
Human 1: I think it was Australia
Human 2: Do you still talk to the people you met?
Human 1: Not really. The interactions are usually short-lived but it's fascinating to learn where people are coming from and what matters to them
Human 2: Yea, me too. I feel like God often puts strangers in front of you, and gives you an opportunity to connect with them in that moment in deeply meaningful ways. Do you ever feel like you know things about strangers without them telling you?
Human 1: what do you mean?
Human 2: I think it's like a 6th sense, often seen as "cold readings" to people, but can be remarkably accurate. I once sat next to a man in a coffee and I felt a pain in my back. I asked the stranger if he had a pain. It turns o

In [4]:
# Character-level vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))


 !"%&'()*+,-./012345679:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ\_abcdefghijklmnopqrstuvwxyz~é’“”…湘留）：😀😂😆😉😐😛😞🙂


In [5]:
# string to integer: each character maps to its position in the sorted vocabulary
stoi = {ch: i for i, ch in enumerate(chars)}
# integer to string: the inverse lookup
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: turn a string into a list of integer token ids.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: turn an iterable of integer token ids back into a string.

    Raises KeyError if an id is not a valid index into the vocabulary.
    """
    return ''.join(itos[i] for i in l)


# round-trip check: decoding the encoding must give back the original string
print(encode("hii there!"))
print(decode(encode("hii there!")))

[64, 65, 65, 1, 76, 64, 61, 74, 61, 2]
hii there!


In [6]:
import torch # PyTorch
# Encode the whole corpus once and keep it as a 1-D tensor of 64-bit token ids.
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.int64)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([115782]) torch.int64
tensor([36, 77, 69, 57, 70,  1, 16, 24,  1, 36, 65,  2,  0, 36, 77, 69, 57, 70,
         1, 17, 24,  1, 51, 64, 57, 76,  1, 65, 75,  1, 81, 71, 77, 74,  1, 62,
        57, 78, 71, 74, 65, 76, 61,  1, 64, 71, 68, 65, 60, 57, 81, 28,  0, 36,
        77, 69, 57, 70,  1, 16, 24,  1, 71, 70, 61,  1, 79, 64, 61, 74, 61,  1,
        37,  1, 63, 61, 76,  1, 76, 71,  1, 69, 61, 61, 76,  1, 68, 71, 76, 75,
         1, 71, 62,  1, 60, 65, 62, 62, 61, 74, 61, 70, 76,  1, 72, 61, 71, 72,
        68, 61, 13,  0, 36, 77, 69, 57, 70,  1, 17, 24,  1, 51, 64, 57, 76,  1,
        79, 57, 75,  1, 76, 64, 61,  1, 69, 71, 75, 76,  1, 70, 77, 69, 58, 61,
        74,  1, 71, 62,  1, 72, 61, 71, 72, 68, 61,  1, 81, 71, 77,  1, 64, 57,
        78, 61,  1, 61, 78, 61, 74,  1, 69, 61, 76,  1, 60, 77, 74, 65, 70, 63,
         1, 57,  1, 64, 71, 68, 65, 60, 57, 81, 28,  0, 36, 77, 69, 57, 70,  1,
        16, 24,  1, 36, 57, 74, 60,  1, 76, 71,  1, 67, 61, 61, 72,  1, 57,  1,
       

In [7]:
# First 90% of the token stream is for training; the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [8]:
block_size = 8  # number of tokens in one context window
# A slice of block_size + 1 tokens: enough for block_size inputs plus the target that follows each.
train_data[:block_size+1]

tensor([36, 77, 69, 57, 70,  1, 16, 24,  1])

In [9]:
# One window holds block_size training examples: every prefix of x is a context,
# and the token right after that prefix (taken from y) is its target.
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t, target in enumerate(y):
    context = x[:t+1]
    print(f"when input is {context} the target: {target}")

when input is tensor([36]) the target: 77
when input is tensor([36, 77]) the target: 69
when input is tensor([36, 77, 69]) the target: 57
when input is tensor([36, 77, 69, 57]) the target: 70
when input is tensor([36, 77, 69, 57, 70]) the target: 1
when input is tensor([36, 77, 69, 57, 70,  1]) the target: 16
when input is tensor([36, 77, 69, 57, 70,  1, 16]) the target: 24
when input is tensor([36, 77, 69, 57, 70,  1, 16, 24]) the target: 1


In [10]:
torch.manual_seed(1337) # seed PyTorch's global RNG so the randomly sampled batch below is reproducible
batch_size = 4 # number of independent sequences processed per batch (read by get_batch)
block_size = 8 # maximum context length, in tokens, for predictions (read by get_batch)

# generates a batch of data
def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. Uses the module-level `batch_size` and `block_size`.

    Returns a pair (x, y), each of shape (batch_size, block_size), where
    y holds the same windows as x shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # (batch_size, block_size) grid of absolute positions into `source`
    positions = starts.unsqueeze(1) + torch.arange(block_size)
    return source[positions], source[positions + 1]

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('-------')

# Spell out every (context, target) example contained in the batch:
# each row contributes one example per time step.
for b, (row_inputs, row_targets) in enumerate(zip(xb, yb)):
    for t, target in enumerate(row_targets):
        context = row_inputs[:t+1]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[ 2,  0, 36, 77, 69, 57, 70,  1],
        [75, 61, 76, 75,  1, 76, 64, 61],
        [57, 74, 61, 57,  0, 36, 77, 69],
        [37,  1, 75, 61, 61, 13,  1, 48]])
targets:
torch.Size([4, 8])
tensor([[ 0, 36, 77, 69, 57, 70,  1, 16],
        [61, 76, 75,  1, 76, 64, 61,  1],
        [74, 61, 57,  0, 36, 77, 69, 57],
        [ 1, 75, 61, 61, 13,  1, 48, 64]])
-------
when input is [2] the target: 0
when input is [2, 0] the target: 36
when input is [2, 0, 36] the target: 77
when input is [2, 0, 36, 77] the target: 69
when input is [2, 0, 36, 77, 69] the target: 57
when input is [2, 0, 36, 77, 69, 57] the target: 70
when input is [2, 0, 36, 77, 69, 57, 70] the target: 1
when input is [2, 0, 36, 77, 69, 57, 70, 1] the target: 16
when input is [75] the target: 61
when input is [75, 61] the target: 76
when input is [75, 61, 76] the target: 75
when input is [75, 61, 76, 75] the target: 1
when input is [75, 61, 76, 75, 1] the target: 76
when input is [75, 61, 76

In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337) # set seed for reproducibility

class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The whole model is a (vocab_size x vocab_size) embedding table; row i holds
    the logits over the next token given that the current token is i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token reads off the logits for the next token
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is flattened to (B*T, C) and
            loss is the cross-entropy between logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape # batch, time, channels
        # F.cross_entropy wants (N, C) logits and (N,) targets, so fold B and T together
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    # extends each batch (B) in the time (T) dimension for max_new_tokens
    @torch.no_grad()  # sampling never needs gradients; avoids building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the input followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so the returned loss is None)
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
# generating symbols: start from a single token with id 0 and sample 100 more
start_context = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = m.generate(idx=start_context, max_new_tokens=100)
print(decode(sampled_ids[0].tolist()))

torch.Size([32, 101])
tensor(5.2723, grad_fn=<NllLossBackward0>)

VY😉(g/g6M<gUHS(c&D15😂🙂uu/r😐W-q😆TkK?o🙂el湘qXog<😛YH(m"f?>😂%E😐ez:4XOuBE7e~_AeeéqF9aRmQiB(留E5%t”
pk'!”<kf


In [12]:
# create a PyTorch optimizer
# AdamW updates every parameter of the bigram model `m`.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3) # learning rate (lr) is 1e-3 due to small network

In [13]:
# training with bigram model

# NOTE: rebinding batch_size here changes what get_batch samples from now on.
batch_size = 32
for step in range(10000):
    # clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # draw a fresh training batch and score it
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # backpropagate, then let the optimizer update the parameters
    loss.backward()
    optimizer.step()

# loss of the final batch only
print(loss.item())

2.2629356384277344


In [14]:
# generation of data after training: same single-token (id 0) start, 100 sampled tokens
start_context = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = m.generate(idx=start_context, max_new_tokens=100)
print(decode(sampled_ids[0].tolist()))


Humooo Hut bantrigou rtitreherad mast awe Ste ftooume ou?f lfoug. be. i5Aheanbuse O cek ind, y an t 


In [15]:
# self-attention
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# a single Head performing self-attention: three bias-free projections of width head_size
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# project every token once
k, q, v = key(x), query(x), value(x)  # each (B, T, head_size)

# data-dependent affinities between every query and every key:
# (B, T, head_size) @ (B, head_size, T) ---> (B, T, T)
scores = q @ k.transpose(-2, -1)

# lower triangular matrix: position t may only look at positions 0..t
tril = torch.tril(torch.ones(T, T))
is_future = tril == 0
# (starting from all-zero scores instead would give constant, data-independent weights)
# masked entries become -inf, which softmax turns into exactly 0
wei = F.softmax(scores.masked_fill(is_future, float('-inf')), dim=-1)

# weighted sum of the value vectors; output has head_size channels
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [16]:
# Show the lower-triangular mask: row t has ones at columns 0..t and zeros after.
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [17]:
# Show the attention weights: after masking and softmax, each row sums to 1 over columns 0..t.
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

In [18]:
# scaled attention
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
# A dot product of two unit-variance vectors of length head_size has variance ~head_size.
# Multiplying by head_size**-0.5 brings it back to ~1, so softmax does not saturate
# toward a one-hot distribution.
scale = head_size ** -0.5
wei = (q @ k.transpose(-2, -1)) * scale