## Read training dataset

In [1]:
with open('data/Synthetic-Persona-Chat_train.csv', encoding='utf-8') as f:
    training_set = f.read()

with open('data/Synthetic-Persona-Chat_valid.csv', encoding='utf-8') as f:
    validation_set = f.read()
    
print("training dataset length:", len(training_set))

training dataset length: 15880770


In [2]:
print(training_set[:1000])

user 1 personas,user 2 personas,Best Generated Conversation
"I am 32.
I do not want a job.
I play video games all day.
I still live at home with my parents.","My favorite drink is iced coffee.
I have a black belt in karate.
I m in a jazz band and play the saxophone.
I vacation along lake michigan every summer.","User 1: Hi! I'm [user 1's name].
User 2: Hi [user 1's name], I'm [user 2's name].
User 1: What do you do for fun?
User 2: I like to play video games, go to the beach, and read.
User 1: I like to play video games too! I'm not much of a reader, though.
User 2: What video games do you like to play?
User 1: I like to play a lot of different games, but I'm really into competitive online games right now.
User 2: I'm not really into competitive games, I like to play more relaxing games.
User 1: That's cool. What kind of relaxing games do you like to play?
User 2: I like to play puzzle games, simulation games, and story-based games.
User 1: I've never been much of a puzzle game person,

## Tokenize the dataset

Simple tokenizer

In [25]:
chars = sorted(list(set(training_set + validation_set)))
vocab_size = len(chars)

stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print("vocab size: ", vocab_size)

import torch

training_data = torch.tensor(encode(training_set), dtype=torch.long)
val_data = torch.tensor(encode(validation_set), dtype=torch.long)

print(training_data.shape, training_data.dtype)

vocab size:  113
torch.Size([15880770]) torch.int64


Normal tokenizer for the future

In [5]:
# import tiktoken
# import torch

# enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
# data = torch.tensor(enc.encode(trainingSet), dtype=torch.long)

# print(data.shape, data.dtype)


torch.Size([86540]) torch.int64


## Self-attention

In [5]:
# just averaging past example

torch.manual_seed(1337)
B,T,C = 4,8,2
x = torch.randn(B,T,C)

In [5]:
# logic example

xbow = torch.zeros((B,T,C))

for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C)
        xbow[b,t] = torch.mean(xprev, 0)

In [6]:
# vectorizing

weights = torch.tril(torch.ones(T, T))
weights = weights / weights.sum(1, keepdim=True)
xbow2 = weights @ x # (B, T, T) @ (B, T, C) -> (B, T, C)

torch.allclose(xbow, xbow2)

False

In [6]:
# softmax

import torch.nn as nn
from torch.nn import functional as F

tril = torch.tril(torch.ones(T, T))
weights = torch.zeros((T, T))
weights = weights.masked_fill(tril == 0, float('-inf'))
weights = F.softmax(weights, dim=-1)
xbow3 = weights @ x

torch.allclose(xbow2, xbow3)

NameError: name 'xbow2' is not defined

In [18]:
# self-attenton

import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

# single Head
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)

weights = q @ k.transpose(-2, -1) * head_size**-0.5 # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# weights = torch.zeros((T, T))
weights = weights.masked_fill(tril == 0, float('-inf'))
weights = F.softmax(weights, dim=-1)

v = value(x)
out = weights @ v

out.shape

torch.Size([4, 8, 16])

## Sample the dataset

In [19]:
# hyperparameters

context_size = 8
batch_size = 32
max_iters = 5000
eval_interval = 300
learning_rate = 1e-3
device = 'cuda'
eval_iters = 200
n_embd = 32

def get_batch(data: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    ix = torch.randint(len(data) - context_size, (batch_size,)) # get batch_size random points of data
    x = torch.stack([data[i:i+context_size] for i in ix])
    y = torch.stack([data[i+1:i+context_size+1] for i in ix])

    x, y = x.to(device), y.to(device)
    return x, y

## Feed output to the transformer

In [20]:
# Attention head

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(C, head_size, bias=False)
        self.query = nn.Linear(C, head_size, bias=False)
        self.value = nn.Linear(C, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) # (B, T, 16)
        q = self.query(x) # (B, T, 16)

        # compute attention scores (affinities)
        weights = q @ k.transpose(-2,-1) * C**-0.5 # (B,T,C) @ (B,C,T) --> (B,T,T)
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weights = F.softmax(weights, dim=-1)
        # weighted aggregation
        v = self.value(x) # (B,T,C)
        out = weights @ v # (B,T,T) @ (B,T,C) -> (B,T,C)

        return out

In [21]:
class MultiHeadAttention(nn.Module):
    def __init__(self, head_count, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(head_count)])

    def forward(self, x):
        return torch.cat([h(x) for h in self.heads], dim=-1)


Linear layer

In [27]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [26]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(context_size, n_embd)
        self.sa_heads = MultiHeadAttention(4, n_embd//4)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.ffwd = FeedFoward(n_embd)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (Batch, Time) tensor of integers
        tok_embd = self.token_embedding_table(idx) # (Batch, Time, Channels)

        # save token position information
        pos_embd = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)

        # add token position information to the input
        x = tok_embd + pos_embd # (B,T,C)
        x = self.sa_heads(x) # apply one head of self-attention
        x = self.ffwd(x)
        logits = self.lm_head(x) # (Batch, Time, vocab_size)

        if targets is None:
            loss = None
        else:
            # reshape logits and targets to the format torch.nn.functional.cross_entropy expects
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx: torch.Tensor, max_new_tokens) -> torch.Tensor:
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to context_size tokens
            idx_cond = idx[:, -context_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :] # (B, C)

            # Softmax converts a vector of K real values 
            # into a vector of K real values that sum to 1 while preserving relative value of the elements
            probs = F.softmax(logits, dim=1) # (B, C)

            # Returns a tensor where each row contains num_samples indices sampled from the multinomial prob distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx


xb, yb = get_batch(training_data)
    
model = BigramLanguageModel()
m = model.to(device)
logits, loss = m(xb, yb)

print(logits.shape)
print(loss)

TypeError: linear(): argument 'input' (position 1) must be Tensor, not NoneType

## Training

In [10]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in [ 'train', 'val' ]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(training_data if split == 'train' else val_data)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    
    return out

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch(training_data)

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.7372, val loss 4.7387
step 300: train loss 2.8573, val loss 2.8624
step 600: train loss 2.5186, val loss 2.5034
step 900: train loss 2.2859, val loss 2.2958
step 1200: train loss 2.2236, val loss 2.1927
step 1500: train loss 2.1538, val loss 2.1477
step 1800: train loss 2.1294, val loss 2.1460
step 2100: train loss 2.1028, val loss 2.1244
step 2400: train loss 2.1124, val loss 2.1115
step 2700: train loss 2.0831, val loss 2.1113
step 3000: train loss 2.0442, val loss 2.0804
step 3300: train loss 2.0650, val loss 2.0641
step 3600: train loss 2.0613, val loss 2.0558
step 3900: train loss 2.0232, val loss 2.0406
step 4200: train loss 2.0473, val loss 2.0185
step 4500: train loss 2.0074, val loss 2.0352
step 4800: train loss 2.0289, val loss 2.0092

Userd sentery het Ii ng and.
User 2: Thane alike bethand angre tamelik senverachet dierin. Myo!
Iloukeche liks thesut adf ondo ply."
Use orle 1!
I ut'mt chado.
I whor 1's Lks sounepit'snb er ikerd T 2: Yeonmeer 2: NUse Whis