## Read training dataset

In [7]:
# Keep only the first MAX_CHARS characters of each split so the experiment stays small.
# In text mode read(n) returns at most n characters, so the rest of the file is never
# loaded into memory just to be sliced away.
MAX_CHARS = 1000000

with open('data/Synthetic-Persona-Chat_train.csv', encoding='utf-8') as f:
    training_set = f.read(MAX_CHARS)

with open('data/Synthetic-Persona-Chat_valid.csv', encoding='utf-8') as f:
    validation_set = f.read(MAX_CHARS)

print("training dataset length:", len(training_set))

training dataset length: 1000000


In [2]:
# Peek at the first 1000 characters of the raw training text.
print(training_set[0:1000])

user 1 personas,user 2 personas,Best Generated Conversation
"I am 32.
I do not want a job.
I play video games all day.
I still live at home with my parents.","My favorite drink is iced coffee.
I have a black belt in karate.
I m in a jazz band and play the saxophone.
I vacation along lake michigan every summer.","User 1: Hi! I'm [user 1's name].
User 2: Hi [user 1's name], I'm [user 2's name].
User 1: What do you do for fun?
User 2: I like to play video games, go to the beach, and read.
User 1: I like to play video games too! I'm not much of a reader, though.
User 2: What video games do you like to play?
User 1: I like to play a lot of different games, but I'm really into competitive online games right now.
User 2: I'm not really into competitive games, I like to play more relaxing games.
User 1: That's cool. What kind of relaxing games do you like to play?
User 2: I like to play puzzle games, simulation games, and story-based games.
User 1: I've never been much of a puzzle game person,

## Tokenize the dataset

Simple tokenizer

In [9]:
# Character-level vocabulary: every distinct character seen in either split, in sorted order.
chars = sorted(set(training_set + validation_set))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))  # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError if `s` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer ids `l`.

    Raises KeyError if an id is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)


print("vocab size: ", vocab_size)

import torch

# Encode each text split into a tensor of int64 token ids (training split first).
training_data, val_data = (
    torch.tensor(encode(split_text), dtype=torch.long)
    for split_text in (training_set, validation_set)
)

print(training_data.shape, training_data.dtype)

vocab size:  90
torch.Size([1000000]) torch.int64


Subword tokenizer (tiktoken), kept for future use

In [5]:
# import tiktoken
# import torch

# enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
# data = torch.tensor(enc.encode(training_set), dtype=torch.long)

# print(data.shape, data.dtype)


torch.Size([86540]) torch.int64


## Sample the dataset

In [10]:
# Hyperparameters read as globals by the cells below.
context_size = 8      # tokens per sampled sequence
batch_size = 4        # sequences sampled per batch
max_iters = 3000      # optimisation steps in the training loop
eval_interval = 300   # steps between loss reports
learning_rate = 1e-2
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200      # batches averaged per split when estimating the loss

def get_batch(data: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Sample a random batch of input/target windows from `data`.

    Draws `batch_size` random start offsets and, for each one, cuts a window of
    `context_size` tokens (the inputs) plus the same window shifted one position
    to the right (the targets). For a 1-D `data` both results have shape
    (batch_size, context_size). Both tensors are moved to `device`.
    """
    # Highest valid start is len(data) - context_size - 1, so the shifted target
    # window never runs past the end of the data.
    starts = torch.randint(len(data) - context_size, (batch_size,)).tolist()

    inputs = torch.stack([data[s:s + context_size] for s in starts])
    targets = torch.stack([data[s + 1:s + context_size + 1] for s in starts])

    return inputs.to(device), targets.to(device)

## Feed a batch to the bigram language model

In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model.

    The embedding row looked up for each token is used directly as the logits
    over the next token, so a prediction depends only on the current token.
    """

    def __init__(self, vocab_size) -> None:
        super().__init__()
        # (vocab_size, vocab_size) table: row i holds the next-token logits for token i.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (Batch, Time) tensor of integer token ids.
            targets: optional (Batch, Time) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (Batch, Time, Classes)
            and loss is None. With targets, logits is flattened to
            (Batch*Time, Classes) and loss is the cross-entropy against the
            flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (Batch, Time, Classes)

        if targets is None:
            return logits, None

        # cross_entropy is given one row of class scores per prediction, so
        # the batch and time dimensions are merged before calling it.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(B * T))

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph per step
    def generate(self, idx: torch.Tensor, max_new_tokens) -> torch.Tensor:
        """Extend `idx` by sampling `max_new_tokens` tokens one at a time.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            last_logits = logits[:, -1, :]  # (B, C): scores at the final position only
            probs = F.softmax(last_logits, dim=-1)  # (B, C) probabilities over the vocabulary
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1) sampled token ids
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx


# Draw the sample batch first, then build the model, so the seeded random
# number generator is consumed in the same order.
xb, yb = get_batch(training_data)

model = BigramLanguageModel(vocab_size).to(device)
m = model  # same object: Module.to() returns the module itself
logits, loss = model(xb, yb)

print(logits.shape)
print(loss)

torch.Size([32, 90])
tensor(4.6958, device='cuda:0', grad_fn=<NllLossBackward0>)


## Training

In [14]:
@torch.no_grad()
def estimate_loss():
    """Average the model loss over `eval_iters` random batches per split.

    Switches the model to eval mode while measuring and back to train mode
    afterwards. Returns a dict with keys 'train' and 'val', each holding the
    mean loss as a 0-d tensor.
    """
    splits = {'train': training_data, 'val': val_data}
    averages = {}

    model.eval()
    for name, data in splits.items():
        # One loss value per freshly sampled batch.
        per_batch = [model(*get_batch(data))[1].item() for _ in range(eval_iters)]
        averages[name] = torch.tensor(per_batch).mean()
    model.train()

    return averages

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# get_batch reads this global, so every batch sampled from here on holds 32 sequences.
batch_size = 32

# The counter is named `step` rather than `iter` so the builtin iter() is not
# shadowed for the rest of the notebook.
for step in range(max_iters):

    # Periodically report the averaged train/val loss.
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch(training_data)

    # evaluate the loss and take one optimiser step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Sample 500 new tokens from the trained model, starting from a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

step 0: train loss 4.9344, val loss 4.9390
step 300: train loss 2.6250, val loss 2.6356
step 600: train loss 2.2963, val loss 2.3137
step 900: train loss 2.2410, val loss 2.2435
step 1200: train loss 2.2139, val loss 2.2431
step 1500: train loss 2.1943, val loss 2.2190
step 1800: train loss 2.1942, val loss 2.2297
step 2100: train loss 2.1965, val loss 2.2066
step 2400: train loss 2.1939, val loss 2.2018
step 2700: train loss 2.1867, val loss 2.2068

Usushanirn.
Use.
Mam Br ar 1: balalos ourinen, atofBer Whar.
Us aci, I we zzee ang ho d.
Us, l!
My Ro gay?
Use or2: lour ante t I vet?
USound at! I ly thino! 1: me see.
Useo! thar64n atrf d 2: avove ounther likerec Yobe I way mpofery I'r fuin.
Ust ricanks, in oreayoo I bovead sef Thar 2: agha nkes.

I ililse ay I thamingrild loreralyolino s "I y 1: I'se havound selyofubereav lore I 2: gl, fo 1: m t?
Usit'msp Cab.
I'so. wour ilali! boomerio, I pan Jal! me y doferk ple avis.
Us fuildoondou I s! vi
