## Load vocab

In [6]:
import json

# Read the token -> id mapping from disk. The encoding is passed explicitly so
# the file is decoded as UTF-8 regardless of the platform's locale default.
with open("../check_points/vocab.json", encoding="utf-8") as f:
    word2idx = json.load(f)

# Sanity check: container type and the first ten (token, id) pairs.
print(type(word2idx))
print(list(word2idx.items())[:10])

# Inverse mapping (id -> token); ids are coerced to int so lookups by integer id work.
idx2word = {int(v): k for k, v in word2idx.items()}
# pad_id = word2idx["<pad>"]

<class 'dict'>
[('<s>', 0), ('<pad>', 1), ('</s>', 2), ('<unk>', 3), ('<mask>', 4), ('!', 5), ('"', 6), ('#', 7), ('$', 8), ('%', 9)]


In [5]:
# Spot check: show the id the loaded vocabulary assigns to the token "dog".
print(word2idx["dog"])

11902


## Testing Tokenization

In [7]:
import sys

# Stop Python from writing __pycache__ directories while importing project code.
sys.dont_write_bytecode = True
# Put the parent directory first on the import path so `utils` can be imported.
sys.path.insert(0, "../")

from utils import Tokenizer
# from utils import clean_text

# Build a tokenizer backed by the vocabulary loaded earlier, then encode a
# single word; the encoded result is this cell's displayed value.
tokenizer = Tokenizer()
tokenizer.upload_vocab(word2idx)
tokenizer.encode("dog")

[11902]

In [None]:
# TODO: generate input/target pairs here (not implemented yet)

## Testing the Model

In [15]:
# Hyper-parameters for the model smoke test.
# Constraints: embedding_dim doubles as the hidden size (D) and must be
# divisible by num_heads.
num_layers = 4
num_heads = 4
embedding_dim = 128
ff_embedding_dim = 4 * embedding_dim  # feed-forward width, 4 × embedding_dim = 512
max_seq_len = 200
dropout = 0.1

# The vocabulary size comes from the tokenizer rather than being hard-coded.
vocab_size = tokenizer.get_vocab_size()

In [16]:
# Show the vocabulary size reported by the tokenizer.
print(vocab_size)

19716


In [None]:
sys.path.insert(0, "../model/")
from gpt2 import GPT2Model
import torch
import torch.nn as nn

# Single-word smoke test: the nested list adds the leading batch dimension.
encoded = tokenizer.encode("dog")
input_tensor = torch.tensor([encoded])

model = GPT2Model(
    vocab_size,
    embedding_dim,
    ff_embedding_dim,
    max_seq_len,
    num_heads,
    num_layers,
    dropout=0.1,
)

# One forward pass: every position in the sequence gets a score for each
# vocabulary token.
logits = model(input_tensor)  # (B, T, V)

# Sketch of the next-token loss, left disabled.
# Shift so that position t is scored against token t + 1:
# shift_logits = logits[:, :-1, :].contiguous()
# shift_labels = input_tensor[:, 1:].contiguous()

# Flatten for CrossEntropyLoss:
# loss_fn = nn.CrossEntropyLoss()
# loss = loss_fn(
#     shift_logits.view(-1, vocab_size),
#     shift_labels.view(-1)
# )

## Loading Data

In [None]:
from torch.utils.data import DataLoader
from data.dataset import TextDataset

# `tokenized_texts` is not defined in this notebook section; it is assumed to
# be a list of word-index lists prepared beforehand.
dataset = TextDataset(tokenized_texts, seq_len=32)

# Shuffled mini-batches; the training loop unpacks each batch as an (x, y) pair.
dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
)

## Training the Model

In [None]:
# ---- Model hyper-parameters ----
# Constraints: embedding_dim doubles as the hidden size (D) and must be
# divisible by num_heads.
num_layers = 4
num_heads = 4
embedding_dim = 128
ff_embedding_dim = 4 * embedding_dim  # feed-forward width, 4 × embedding_dim = 512
max_seq_len = 200
dropout = 0.1
vocab_size = tokenizer.get_vocab_size()

# ---- Training settings ----
epochs = 3
batch_size = 16
# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Build the model from the hyper-parameters configured above. `dropout` is
# passed from the config variable instead of a repeated literal so the two
# cannot drift apart.
model = GPT2Model(
    vocab_size,
    embedding_dim,
    ff_embedding_dim,
    max_seq_len,
    num_heads,
    num_layers,
    dropout=dropout,
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)

model.train()
for epoch in range(epochs):
    total_loss = 0.0
    num_batches = 0  # counted explicitly so the average does not rely on len(dataloader)
    for batch_idx, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)  # x: [B, T], y: [B, T]

        logits = model(x)  # [B, T, vocab_size]
        # reshape (rather than view) also accepts non-contiguous tensors.
        logits = logits.reshape(-1, vocab_size)  # [B*T, V]
        y = y.reshape(-1)                        # [B*T]

        loss = criterion(logits, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # scheduler.step()  # if used

        total_loss += loss.item()
        num_batches += 1

        if batch_idx % 100 == 0:
            print(f"Epoch {epoch+1}, Step {batch_idx}, Loss: {loss.item():.4f}")

    # An empty loader would otherwise surface as an opaque ZeroDivisionError.
    if num_batches == 0:
        raise RuntimeError("dataloader produced no batches; nothing to train on")

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch+1} completed. Avg Loss: {avg_loss:.4f}")

# Save model
torch.save(model.state_dict(), "gpt2_tiny.pth")


## Generate Text

In [None]:
# Load model (optional): restore the weights saved as "gpt2_tiny.pth" and switch to eval mode
# GPT2Config.vocab_size = len(word2idx)
# model = GPT2Model(GPT2Config())
# model.load_state_dict(torch.load("gpt2_tiny.pth", map_location="cpu"))
# model.eval()

In [None]:
# Relies on `tokenizer` (with the vocab uploaded) and `model` created in the cells above

def generate_text(prompt, max_new_tokens=50):
    """Greedily extend `prompt` by `max_new_tokens` tokens and decode the result.

    Relies on the notebook globals `tokenizer`, `model` and `max_seq_len`.

    Args:
        prompt: Text passed to `tokenizer.encode` to form the initial context.
        max_new_tokens: Number of greedy (argmax) decoding steps to run.

    Returns:
        The value of `tokenizer.decode` applied to the prompt ids followed by
        the generated ids.

    Raises:
        ValueError: If tokens are requested but the prompt encodes to nothing,
            since there is then no last position to predict from.
    """
    input_ids = tokenizer.encode(prompt)
    if max_new_tokens > 0 and len(input_ids) == 0:
        raise ValueError("prompt must encode to at least one token")

    # Create the ids on the same device as the model's weights; the training
    # cell may have moved the model to CUDA.
    device = next(model.parameters()).device
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=device)  # [1, T]

    # Generation must run in eval mode (dropout off); the previous mode is
    # restored afterwards so a later training cell is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Feed at most the last `max_seq_len` tokens.
                # NOTE(review): presumably GPT2Model cannot handle longer
                # inputs than the max_seq_len it was built with - confirm.
                context = input_tensor[:, -max_seq_len:]
                logits = model(context)  # [1, T, vocab]
                next_token_logits = logits[:, -1, :]  # last position
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)  # [1, 1]
                input_tensor = torch.cat([input_tensor, next_token], dim=1)  # grow the sequence
    finally:
        model.train(was_training)

    return tokenizer.decode(input_tensor[0].tolist())


In [None]:
# Generate a 30-token continuation of a short prompt and print the decoded text.
prompt = "Once upon a time"
generated = generate_text(prompt, max_new_tokens=30)
print(generated)