In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import tiktoken
from tqdm import tqdm

In [19]:
import json

def _format_example(ex):
    """Render one instruction record as a three-line text block plus a blank line.

    Missing "instruction", "input" or "output" keys are rendered as empty strings.
    """
    instr = ex.get("instruction", "")
    inp = ex.get("input", "")
    out = ex.get("output", "")
    return f"Instruction: {instr}\nInput: {inp}\nOutput: {out}\n\n"


# Read the JSON records once ...
with open("instruction-data.json", "r", encoding="utf-8") as src:
    data = json.load(src)

# ... and flatten them into a single plain-text corpus, one block per record.
with open("instruction-data.txt", "w", encoding="utf-8") as dst:
    dst.writelines(_format_example(ex) for ex in data)

In [20]:
class TiktokenDataset(Dataset):
    """Sliding-window next-token dataset over one tokenized text file.

    The whole file is read and encoded once with ``tokenizer.encode``. Item
    ``i`` is the pair ``(x, y)`` where ``x`` is the ``block_size`` tokens
    starting at position ``i`` and ``y`` is the same window shifted one token
    to the right (the next-token targets).

    Args:
        filepath: Path of a UTF-8 text file.
        tokenizer: Object exposing ``encode(str) -> list[int]``.
        block_size: Number of tokens in each input window.
    """

    def __init__(self, filepath, tokenizer, block_size):
        with open(filepath, 'r', encoding='utf-8') as f:
            data = f.read()
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.encoded = tokenizer.encode(data)

    def __len__(self):
        # Each window needs block_size inputs plus one extra token for the
        # final target, so the last valid start is len(encoded) - block_size - 1.
        return max(0, len(self.encoded) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` long tensors for window ``idx``.

        Negative indices count from the end. Raises ``IndexError`` when the
        index is out of range; without it, plain iteration over the dataset
        (which stops on ``IndexError``) would never terminate and out-of-range
        lookups would silently return truncated windows.
        """
        n = len(self)
        if idx < 0:
            idx += n
        if idx < 0 or idx >= n:
            raise IndexError(f"index out of range for dataset of length {n}")
        chunk = self.encoded[idx:idx + self.block_size + 1]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

In [21]:
class GPTConfig:
    """Plain attribute container for the GPT model's hyperparameters.

    Every constructor argument is stored unchanged on the instance under the
    same name; no validation is performed here.
    """

    def __init__(
        self,
        vocab_size,
        block_size,
        n_layers=4,
        n_heads=4,
        n_embd=128,
        dropout=0.1,
        bias=True,
        tie_weights=True,
        use_learnable_pos_emb=True,
    ):
        # Vocabulary size and maximum context length.
        self.vocab_size, self.block_size = vocab_size, block_size
        # Network shape: depth, attention heads, embedding width.
        self.n_layers, self.n_heads, self.n_embd = n_layers, n_heads, n_embd
        # Dropout probability and whether bias-configurable linears get a bias.
        self.dropout, self.bias = dropout, bias
        # Share the output head with the token embedding; learned vs. sinusoidal positions.
        self.tie_weights, self.use_learnable_pos_emb = tie_weights, use_learnable_pos_emb

class CausalSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a causal mask.

    A single linear layer produces queries, keys and values; each position
    may only attend to itself and earlier positions. Dropout is applied to
    the attention weights and to the projected output.
    """

    def __init__(self, config):
        super().__init__()
        embd, heads = config.n_embd, config.n_heads
        assert embd % heads == 0
        self.n_heads = heads
        self.head_dim = embd // heads
        self.scale = self.head_dim ** -0.5

        # Fused q/k/v projection followed by the output projection.
        self.qkv = nn.Linear(embd, 3 * embd, bias=config.bias)
        self.proj = nn.Linear(embd, embd, bias=config.bias)
        self.attn_drop = nn.Dropout(config.dropout)
        self.resid_drop = nn.Dropout(config.dropout)

        # Lower-triangular ones, shaped (1, 1, block_size, block_size) so it
        # broadcasts over batch and heads. Registered as a buffer, so it is
        # part of the module's state_dict under the key "mask".
        lower = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer("mask", lower[None, None, :, :])

    def _to_heads(self, t, batch, seq):
        """Reshape (batch, seq, n_embd) into (batch, n_heads, seq, head_dim)."""
        return t.view(batch, seq, self.n_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (batch, seq, n_embd)."""
        batch, seq, width = x.size()
        q, k, v = (
            self._to_heads(part, batch, seq)
            for part in self.qkv(x).split(width, dim=2)
        )

        scores = (q @ k.transpose(-2, -1)) * self.scale
        # Positions where the mask is zero lie in the future: exclude them.
        hidden = self.mask[:, :, :seq, :seq] == 0
        weights = F.softmax(scores.masked_fill(hidden, float('-inf')), dim=-1)
        weights = self.attn_drop(weights)

        # Merge the heads back into a single (batch, seq, n_embd) tensor.
        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, seq, width)
        return self.resid_drop(self.proj(merged))

class Block(nn.Module):
    """One pre-norm transformer block: attention then MLP, each with a residual.

    LayerNorm is applied to the input of each sub-layer and the sub-layer's
    output is added back onto its (un-normalised) input. The MLP expands to
    four times the embedding width, applies GELU, projects back and applies
    dropout. The two MLP linears are built with nn.Linear's default bias
    setting; ``config.bias`` is not passed to them.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.ln1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln2 = nn.LayerNorm(width)
        self.mlp = nn.Sequential(
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attended = x + self.attn(self.ln1(x))
        return attended + self.mlp(self.ln2(attended))

class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token embeddings plus positional embeddings (a learned parameter or a
    fixed sinusoidal buffer, chosen by ``config.use_learnable_pos_emb``) are
    passed through ``config.n_layers`` blocks, a final LayerNorm and a linear
    head producing one logit per vocabulary entry. When
    ``config.tie_weights`` is set, the head shares its weight tensor with the
    token embedding.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
        if config.use_learnable_pos_emb:
            # Learned positions start at zero; _init_weights does not touch
            # them because they are a bare Parameter, not a Linear/Embedding.
            self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        else:
            self.register_buffer('pos_emb', self._get_sinusoidal_pos_emb(config.block_size, config.n_embd))
        self.drop = nn.Dropout(config.dropout)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layers)])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=config.bias)
        if config.tie_weights:
            self.head.weight = self.tok_emb.weight
        self.block_size = config.block_size
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def _get_sinusoidal_pos_emb(self, seq_len, dim):
        """Build a fixed sinusoidal position table of shape (1, seq_len, dim).

        Even columns hold sines and odd columns hold cosines of
        ``position * div_term``. Works for odd ``dim`` as well: there are then
        one fewer cosine columns than sine columns, so the angle table is
        trimmed before filling the cosine slots.
        """
        position = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / dim))
        angles = position * div_term  # (seq_len, ceil(dim / 2))
        pe = torch.zeros(seq_len, dim)
        pe[:, 0::2] = torch.sin(angles)
        # Only floor(dim / 2) odd columns exist; for even dim this slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : dim // 2])
        return pe.unsqueeze(0)

    def forward(self, idx, targets=None):
        """Run the model on token ids ``idx`` of shape (batch, seq).

        Args:
            idx: Long tensor of token ids; ``seq`` must not exceed ``block_size``.
            targets: Optional long tensor of the same shape as ``idx`` holding
                the expected next tokens.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape
            (batch, seq, vocab_size) and ``loss`` is the mean cross-entropy
            over all positions, or ``None`` when ``targets`` is not given.
        """
        B, T = idx.size()
        assert T <= self.block_size, "Sequence too long"
        tok_emb = self.tok_emb(idx)
        # pos_emb is either a Parameter or a registered buffer, so it already
        # follows the module across devices; just crop it to the sequence length.
        pos_emb = self.pos_emb[:, :T, :]
        x = self.drop(tok_emb + pos_emb)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            # Flatten batch and sequence so every position is one classification example.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

In [None]:
# Tokenizer and model hyperparameters. load_state_dict below is strict by
# default, so these presumably have to match the settings used to produce
# gpt_tiktoken.pth -- confirm against the pretraining notebook.
tokenizer = tiktoken.get_encoding("gpt2")
block_size = 32
config = GPTConfig(
    vocab_size=tokenizer.n_vocab,
    block_size=block_size,
    n_layers=1,
    n_heads=1,
    n_embd=32,
    dropout=0.1,
    tie_weights=True,
    use_learnable_pos_emb=True,
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Loading pretrained model
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (or pass weights_only=True on PyTorch versions that support it).
model = GPT(config).to(device)
model.load_state_dict(torch.load('gpt_tiktoken.pth', map_location=device))

# Preparing finetuning dataset
finetune_dataset = TiktokenDataset("instruction-data.txt", tokenizer, block_size)
finetune_loader = DataLoader(finetune_dataset, batch_size=4, shuffle=True, num_workers=0)

# Finetuning the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
epochs = 2
finetuned_path = "gpt_tiktoken_finetuned.pth"

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for x, y in tqdm(finetune_loader, desc=f"Finetune Epoch {epoch+1}"):
        x, y = x.to(device), y.to(device)
        # Clear gradients left over from the previous step before the new forward/backward.
        optimizer.zero_grad()
        logits, loss = model(x, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(finetune_loader)
    print(f"Finetune Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # Saving the finetuned model after every epoch: an interrupted run then
    # still leaves the weights of the last completed epoch on disk instead of
    # losing everything. The file after the final epoch is the same as before.
    torch.save(model.state_dict(), finetuned_path)

Finetune Epoch 1: 100%|██████████| 9906/9906 [17:23<00:00,  9.49it/s]


Finetune Epoch 1, Loss: 4.0744


Finetune Epoch 2:  95%|█████████▍| 9394/9906 [16:20<00:49, 10.43it/s]

In [24]:
def sample(model, tokenizer, start_text, length=50, temperature=1.0):
    """Autoregressively generate ``length`` tokens continuing ``start_text``.

    Args:
        model: Module whose call returns ``(logits, loss)`` and which exposes
            ``config.block_size``; it is switched to eval mode and left there.
        tokenizer: Object with ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        start_text: Prompt to continue. Must encode to at least one token
            when ``length`` is positive.
        length: Number of tokens to generate.
        temperature: Divisor applied to the logits before sampling. ``0``
            selects the most likely token at every step (greedy decoding)
            instead of dividing by zero.

    Returns:
        The decoded prompt followed by the generated continuation.

    Raises:
        ValueError: If the prompt encodes to no tokens and ``length`` > 0.
    """
    token_ids = tokenizer.encode(start_text)
    if not token_ids and length > 0:
        raise ValueError("start_text must encode to at least one token")

    model.eval()
    device = next(model.parameters()).device
    idx = torch.tensor([token_ids], dtype=torch.long).to(device)

    # Generation needs no gradients; skipping autograd avoids building a graph per step.
    with torch.no_grad():
        for _ in range(length):
            # Feed at most the last block_size tokens, the model's maximum context.
            idx_cond = idx[:, -model.config.block_size:]
            logits, _ = model(idx_cond)
            last_logits = logits[:, -1, :]
            if temperature == 0:
                next_id = torch.argmax(last_logits, dim=-1, keepdim=True)
            else:
                probs = torch.softmax(last_logits / temperature, dim=-1)
                next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_id], dim=1)
    return tokenizer.decode(idx[0].tolist())

# Prompt in the same "Instruction / Input / Output" layout as the fine-tuning
# text, ending at "Output:" so the model continues with the answer.
prompt = (
    "Instruction: Write a short poem about the sea.\n"
    "Input: \n"
    "Output:"
)
generated = sample(model, tokenizer, prompt, length=50)
print(generated)


Instruction: Write a short poem about the sea.
Input: 
Output: A more seen the statement using a sentence in the sentence using the sentence.
Input: 
Output: The quick brown innovative" is veryNO3.

Output: 
Instruction: The formula for the plural form of one of
