In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import sentencepiece as spm
import os
import pandas as pd
import math

# Hyperparameters
batch_size = 512  # sequences sampled per call to get_batch
block_size = 32  # context length in tokens; also the size of the position-embedding table
max_iters = 5000  # optimisation steps run per epoch of the training loop
eval_interval = 500  # steps between loss/perplexity evaluations
learning_rate = 1e-4
weight_decay = 1e-5  # Weight decay for regularization
patience = 3  # Early stopping patience (consecutive evaluations without a better val loss)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200  # batches averaged per split when estimating the loss
n_embd = 512  # embedding width
n_head = 4  # attention heads per block; each head gets n_embd // n_head channels
n_layer = 4  # number of transformer blocks
dropout = 0.2

# Load the Story Cloze dataset
_SENTENCE_COLUMNS = ['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']


def _read_stories(csv_path):
    """Read one Story Cloze CSV and add a 'story' column: title followed by the five sentences."""
    frame = pd.read_csv(csv_path)
    joined_sentences = frame[_SENTENCE_COLUMNS].agg(' '.join, axis=1)
    frame['story'] = frame['storytitle'] + ' ' + joined_sentences
    return frame


story_2016 = _read_stories('/kaggle/input/story-cloze/cloze_2016.csv')
# NOTE(review): the variable says 2018 but the file read is cloze_2017.csv -- confirm which is intended.
story_2018 = _read_stories('/kaggle/input/story-cloze/cloze_2017.csv')
story_cloze = pd.concat([story_2016, story_2018], axis=0)

# with open('/kaggle/working/text_data.txt', 'w') as f:
#     f.write('\n'.join(story_cloze['story'].tolist()))
# # Train SentencePiece model
# spm.SentencePieceTrainer.train(input='/kaggle/working/text_data.txt', model_prefix='spm_model', vocab_size=10000, model_type='bpe')

# Initialize SentencePiece tokenizer
# Loads an already-trained model file; the commented-out trainer call above appears to be
# how that file was produced (BPE, vocab_size=10000) -- confirm before retraining.
sp = spm.SentencePieceProcessor(model_file='/kaggle/working/spm_model.model')

# Encode and decode functions using SentencePiece
def encode(text):
    """Tokenise `text` with the module-level SentencePiece processor into integer ids."""
    token_ids = sp.encode(text, out_type=int)
    return token_ids

def decode(ids):
    """Convert SentencePiece ids back to text with the module-level processor."""
    decoded_text = sp.decode(ids)
    return decoded_text

# Prepare data
# Every story is joined into one long string and tokenised in a single pass.
text = ' '.join(story_cloze['story'])
data = torch.tensor(encode(text), dtype=torch.long).to(device)  # Move tensor to device
# Contiguous split of the token stream: first 80% for training, remainder for validation.
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

# Data loading
def get_batch(split):
    """Sample a random batch of input/target token windows from one split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors shaped ``(batch_size, block_size)`` where
        ``y`` is ``x`` shifted one token to the right (next-token targets).

    Raises:
        ValueError: if the chosen split holds no more than ``block_size`` tokens.
    """
    source = train_data if split == 'train' else val_data
    max_index = len(source) - block_size
    if max_index <= 0:
        raise ValueError("The block_size is too large for the given dataset. Consider reducing it.")
    # Start offsets lie in [0, max_index), so each window AND its one-token-shifted
    # target fit inside the split; no padding is ever required.
    ix = torch.randint(max_index, (batch_size,), device=device)
    # Gather every window with a single advanced-indexing op instead of slicing
    # sample-by-sample in a Python loop (which forces one device sync per sample).
    offsets = torch.arange(block_size, device=ix.device)
    positions = (ix.unsqueeze(1) + offsets).to(source.device)
    x = source[positions].to(device)
    y = source[positions + 1].to(device)
    return x, y

@torch.no_grad()
def estimate_loss_and_perplexity():
    """Estimate mean loss and perplexity on the 'train' and 'val' splits.

    Averages the model loss over ``eval_iters`` random batches per split with
    the model in eval mode and gradients disabled.

    Returns:
        dict mapping 'train' and 'val' to ``(avg_loss, perplexity)``; both
        entries are plain Python floats, so they can be compared, formatted and
        pickled without carrying a device tensor around.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            # Each loss is already pulled to the host via .item(), so accumulate on CPU.
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            avg_loss = losses.mean().item()
            out[split] = (avg_loss, math.exp(avg_loss))
    finally:
        # Restore training mode even if evaluation is interrupted (e.g. Ctrl-C in the notebook).
        model.train()
    return out

class Head(nn.Module):
    """A single causally-masked self-attention head."""

    def __init__(self, head_size):
        super().__init__()

        def make_projection():
            # Bias-free linear map from the embedding width to this head's width.
            return nn.Linear(n_embd, head_size, bias=False).to(device)

        self.key = make_projection()
        self.query = make_projection()
        self.value = make_projection()
        lower_triangle = torch.tril(torch.ones(block_size, block_size, device=device))
        self.register_buffer('tril', lower_triangle)
        self.dropout = nn.Dropout(dropout).to(device)

    def forward(self, x):
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, hs)
        queries = self.query(x)   # (B, T, hs)
        values = self.value(x)    # (B, T, hs)
        head_dim = keys.shape[-1]
        # Scaled dot-product scores between every pair of positions: (B, T, T).
        scores = queries @ keys.transpose(-2, -1) * head_dim**-0.5
        # Positions may not attend to later positions.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        attention = self.dropout(F.softmax(scores, dim=-1))
        return attention @ values  # (B, T, hs)

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, concatenated and projected back to n_embd."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_list = []
        for _ in range(num_heads):
            head_list.append(Head(head_size))
        self.heads = nn.ModuleList(head_list)
        concat_width = head_size * num_heads
        self.proj = nn.Linear(concat_width, n_embd).to(device)
        self.dropout = nn.Dropout(dropout).to(device)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)  # concatenate along the channel axis
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x width, ReLU, project back, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width).to(device),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd).to(device),
            nn.Dropout(dropout).to(device),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        transformed = self.net(x)
        return transformed

class Block(nn.Module):
    """Pre-norm transformer block: self-attention then feed-forward, each added residually."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        per_head_width = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width).to(device)
        self.ffwd = FeedForward(n_embd).to(device)
        self.ln1 = nn.LayerNorm(n_embd).to(device)
        self.ln2 = nn.LayerNorm(n_embd).to(device)

    def forward(self, x):
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over the SentencePiece vocabulary.

    Sizes come from the module-level hyperparameters (``n_embd``, ``n_head``,
    ``n_layer``, ``block_size``) and the vocabulary size reported by ``sp``.
    """

    def __init__(self):
        super().__init__()
        vocab_size = sp.get_piece_size()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd).to(device)
        self.position_embedding_table = nn.Embedding(block_size, n_embd).to(device)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head).to(device) for _ in range(n_layer)]).to(device)
        self.ln_f = nn.LayerNorm(n_embd).to(device)
        self.lm_head = nn.Linear(n_embd, vocab_size).to(device)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids, with T at most ``block_size``.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab) and
            loss is None. With targets, logits are flattened to (B*T, vocab)
            and loss is the mean cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Build the positions on the input's device so the model keeps working
        # if it is moved after construction.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=100):
        """Autoregressively sample ``max_new_tokens`` tokens after the prompt ``idx``.

        Sampling runs without autograd and in eval mode (dropout disabled); the
        module's previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) tensor of prompt token ids.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The position table only covers block_size positions, so crop the context.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # distribution for the next token only
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

# Build the model on the selected device and report its size in millions of parameters.
model = GPTLanguageModel().to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# AdamW over every model parameter, using the configured learning rate and weight decay.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

#Checkpointing function
def save_checkpoint(model, optimizer, epoch, path, loss):
    """Write a training checkpoint with ``torch.save``.

    Args:
        model: object exposing ``state_dict()``.
        optimizer: object exposing ``state_dict()``.
        epoch: progress marker stored under 'epoch' (the training loop passes its step index).
        path: destination accepted by ``torch.save`` (file path or file-like object).
        loss: best validation loss so far; a number or a 0-dim tensor.
    """
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Stored as a plain float: a 0-dim (possibly CUDA) tensor would tie the
        # checkpoint to the device it was saved on.
        'best_val_loss': float(loss),
    }, path)

# Training loop with checkpointing
best_val_loss = float('inf')
checkpoint_path = 'model_checkpoint.pth'
no_improvement_count = 0  # consecutive evaluations without a better validation loss

# Hyperparameters
num_epochs = 1  # Number of epochs

# Training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # The counter is called `step` rather than `iter` so the builtin iter() is not
    # shadowed for every cell that runs later in the same kernel.
    for step in range(max_iters):
        # Evaluate at fixed intervals and on the very last step.
        if step % eval_interval == 0 or step == max_iters - 1:
            losses_and_perplexities = estimate_loss_and_perplexity()
            train_loss, train_perplexity = losses_and_perplexities['train']
            val_loss, val_perplexity = losses_and_perplexities['val']

            print(f"Step {step}: train loss {train_loss:.4f}, train perplexity {train_perplexity:.4f}, val loss {val_loss:.4f}, val perplexity {val_perplexity:.4f}")

            # Save the best model based on validation loss
            # (the step index is what gets stored under the checkpoint's 'epoch' key).
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                save_checkpoint(model, optimizer, step, checkpoint_path, best_val_loss)
                no_improvement_count = 0
            else:
                no_improvement_count += 1

            # Early stopping
            if no_improvement_count >= patience:
                print("Early stopping triggered.")
                break

        # One optimisation step on a fresh random training batch.
        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    # Print the epoch summary
    print(f"Epoch {epoch + 1} completed. Best validation loss: {best_val_loss:.4f}")

    # Early stopping after each epoch: propagate the inner break to the epoch loop.
    if no_improvement_count >= patience:
        print("Early stopping triggered after epoch.")
        break

22.8708 M parameters
Epoch 1/2
Step 0: train loss 9.3822, train perplexity 11874.8537, val loss 9.3819, val perplexity 11871.8756
Step 500: train loss 4.7635, train perplexity 117.1586, val loss 4.7887, val perplexity 120.1410
Step 1000: train loss 4.3933, train perplexity 80.9081, val loss 4.4474, val perplexity 85.4054
Step 1500: train loss 4.1505, train perplexity 63.4657, val loss 4.2368, val perplexity 69.1849
Step 2000: train loss 3.9843, train perplexity 53.7467, val loss 4.0935, val perplexity 59.9486
Step 2500: train loss 3.8605, train perplexity 47.4880, val loss 3.9956, val perplexity 54.3606
Step 3000: train loss 3.7643, train perplexity 43.1328, val loss 3.9225, val perplexity 50.5275
Step 3500: train loss 3.6876, train perplexity 39.9501, val loss 3.8651, val perplexity 47.7062
Step 4000: train loss 3.6102, train perplexity 36.9722, val loss 3.8211, val perplexity 45.6522
Step 4500: train loss 3.5541, train perplexity 34.9554, val loss 3.7846, val perplexity 44.0200
Step 

KeyboardInterrupt: 

In [20]:
def generate_story(prompt, num_sentences=5):
    """Sample a continuation of `prompt` and return its leading sentences, one per line.

    Generates 500 new tokens, decodes them, splits the text on '. ' and keeps
    at most `num_sentences` pieces, joined by '.\\n' with a final '.'.
    """
    prompt_ids = encode(prompt)
    context = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    sampled = model.generate(context, max_new_tokens=500)
    story_text = decode(sampled[0].tolist())
    kept = story_text.split('. ')[:num_sentences]
    return '.\n'.join(kept) + '.'

# Demo: print the prompt label, then a sampled story of up to five sentences.
print("'Good doctor'")
print(generate_story('Good doctor'))

'Good doctor'
Good doctor Sally was driving one night down the street.
Suddenly she saw a pup.
She gave some needles but it was my reason.
So when she saw the strained out, I was smashing it out.
She ran away and eventually asked to be out in talk.


In [24]:
# Demo with a second prompt. Note the printed label uses lower-case 'girl' while the
# prompt actually passed to the model is 'Little Girl'.
print("'Little girl'")
print(generate_story('Little Girl'))

'Little girl'
Little Girl Recini Joce Jill entered a baking competition.
First, she was waiting for her start.
It was Halloween and Handleg careless.
She saw a scary bed of clowns.
When the sun was there when her mom found out her.


In [25]:
# Re-run of the 'Good doctor' prompt; tokens are sampled randomly and no seed is set,
# so each run yields a different story.
print(generate_story('Good doctor'))

Good doctor for the countless appointment The doctor told Tina his workout patches were remodeling there.
Kate decided not to wear sunscreen.
She pulled out her trip to Mexico.
Everyone had a great day at work.
She was so happy to be on the roller coaster for a long time.


In [3]:
# Load the model and optimizer state
def load_checkpoint(model, optimizer, path, map_location=None):
    """Restore model and optimizer state from a checkpoint written by save_checkpoint.

    Args:
        model: module whose ``load_state_dict`` receives 'model_state_dict'.
        optimizer: optimizer whose ``load_state_dict`` receives 'optimizer_state_dict'.
        path: file path or file-like object accepted by ``torch.load``.
        map_location: device onto which stored tensors are mapped. Defaults to the
            device of the model's first parameter (or 'cpu' if it has none), so a
            checkpoint saved on a GPU can be restored on a CPU-only machine.

    Returns:
        ``(model, optimizer, epoch, best_val_loss)``; model and optimizer are
        the same objects that were passed in, updated in place.

    Raises:
        KeyError: if a required checkpoint entry is missing.
    """
    if map_location is None:
        # parameters() already returns an iterator; next() is called on it directly so
        # this still works in a kernel where the name `iter` has been rebound.
        first_param = next(model.parameters(), None)
        map_location = first_param.device if first_param is not None else 'cpu'
    checkpoint = torch.load(path, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    # Checkpoints written by the second save_checkpoint variant in this notebook
    # store the value under 'loss' instead of 'best_val_loss'.
    if 'best_val_loss' in checkpoint:
        best_val_loss = checkpoint['best_val_loss']
    else:
        best_val_loss = checkpoint['loss']
    return model, optimizer, epoch, best_val_loss

# Example usage before resuming training or inference
# A fresh model/optimizer pair is built first; load_checkpoint then overwrites their state.
model = GPTLanguageModel().to(device)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
final_model_path = '/kaggle/working/model_checkpoint.pth'

# Load the saved model
# NOTE(review): if this file was written by the training loop in this notebook, the value
# stored under 'epoch' is the inner-loop step index, so "epoch" in the message is a step count.
model, optimizer, start_epoch, best_val_loss = load_checkpoint(model, optimizer, final_model_path)
print(f"Model loaded from {final_model_path}, starting from epoch {start_epoch} with best validation loss {best_val_loss:.4f}")

Model loaded from /kaggle/working/model_checkpoint.pth, starting from epoch 6500 with best validation loss 3.9046


NANOGPT with pretrained inputs

|

|

v

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import sentencepiece as spm
import os
import pandas as pd
import math
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Hyperparameters
# NOTE(review): the original remarks here said "increased"/"reduced", but relative to the
# first experiment in this notebook batch_size is smaller (512 -> 32) and max_iters is
# unchanged (5000); the comments below just describe each value.
batch_size = 32  # sequences sampled per call to get_batch
block_size = 8  # context length in tokens
max_iters = 5000  # optimisation steps
eval_interval = 1000  # steps between evaluations
learning_rate = 1e-4
weight_decay = 1e-5
patience = 3  # evaluations without improvement before early stopping
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 50  # batches averaged per split when estimating the loss
n_embd = 16  # embedding width
n_head = 2  # attention heads per block
n_layer = 2  # number of transformer blocks
dropout = 0.0

# Load the Story Cloze dataset
_STORY_SENTENCE_COLS = ['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5']


def _load_story_frame(csv_path):
    """Read a Story Cloze CSV and attach a 'story' column (title plus the five sentences)."""
    stories = pd.read_csv(csv_path)
    sentence_text = stories[_STORY_SENTENCE_COLS].agg(' '.join, axis=1)
    stories['story'] = stories['storytitle'] + ' ' + sentence_text
    return stories


story_2016 = _load_story_frame('/kaggle/input/story-cloze/cloze_2016.csv')
# NOTE(review): named 2018 but reads cloze_2017.csv -- confirm which is intended.
story_2018 = _load_story_frame('/kaggle/input/story-cloze/cloze_2017.csv')
story_cloze = pd.concat([story_2016, story_2018], axis=0)

# Initialize SentencePiece tokenizer
# Expects an already-trained model file at this path; nothing in this cell trains it.
sp = spm.SentencePieceProcessor(model_file='/kaggle/working/spm_model.model')

# Encode and decode functions using SentencePiece
def encode(text):
    """Return the integer SentencePiece ids for `text`."""
    piece_ids = sp.encode(text, out_type=int)
    return piece_ids

def decode(ids):
    """Return the text for a sequence of SentencePiece ids."""
    restored = sp.decode(ids)
    return restored

# Prepare data
# Join every story into one string, tokenise it once, and keep the ids on `device`.
text = ' '.join(story_cloze['story'])
data = torch.tensor(encode(text), dtype=torch.long).to(device)
# Contiguous 80/20 split of the token stream into training and validation parts.
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

# Data loading
def get_batch(split):
    """Sample a random batch of input/target token windows from one split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors shaped ``(batch_size, block_size)`` where
        ``y`` is ``x`` shifted one token to the right (next-token targets).

    Raises:
        ValueError: if the chosen split holds no more than ``block_size`` tokens.
    """
    source = train_data if split == 'train' else val_data
    max_index = len(source) - block_size
    if max_index <= 0:
        raise ValueError("The block_size is too large for the given dataset. Consider reducing it.")
    # Start offsets are drawn from [0, max_index), so the window and its shifted
    # target always fit inside the split and no padding branch is needed.
    ix = torch.randint(max_index, (batch_size,), device=device)
    # One advanced-indexing gather replaces the per-sample Python slicing loop.
    offsets = torch.arange(block_size, device=ix.device)
    positions = (ix.unsqueeze(1) + offsets).to(source.device)
    x = source[positions].to(device)
    y = source[positions + 1].to(device)
    return x, y

@torch.no_grad()
def estimate_loss_and_perplexity():
    """Estimate mean loss and perplexity on the 'train' and 'val' splits.

    Averages the model loss over ``eval_iters`` random batches per split with
    the model in eval mode and gradients disabled.

    Returns:
        dict mapping 'train' and 'val' to ``(avg_loss, perplexity)``, both
        plain Python floats (safe to compare, format and store in a checkpoint).
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            # .item() already moves each loss to the host, so the buffer lives on CPU.
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            avg_loss = losses.mean().item()
            out[split] = (avg_loss, math.exp(avg_loss))
    finally:
        # Always return the model to training mode, even if evaluation is interrupted.
        model.train()
    return out

class Head(nn.Module):
    """One causally-masked self-attention head."""

    def __init__(self, head_size):
        super().__init__()

        def projection():
            # Linear map without bias from n_embd channels to head_size channels.
            return nn.Linear(n_embd, head_size, bias=False).to(device)

        self.key = projection()
        self.query = projection()
        self.value = projection()
        mask = torch.tril(torch.ones(block_size, block_size, device=device))
        self.register_buffer('tril', mask)
        self.dropout = nn.Dropout(dropout).to(device)

    def forward(self, x):
        _, steps, _ = x.shape
        k_proj = self.key(x)
        q_proj = self.query(x)
        v_proj = self.value(x)
        scale = k_proj.shape[-1]**-0.5
        # Pairwise attention scores, scaled by 1/sqrt(head_size).
        scores = q_proj @ k_proj.transpose(-2, -1) * scale
        # Block attention to positions that come later in the sequence.
        blocked = self.tril[:steps, :steps] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ v_proj

class MultiHeadAttention(nn.Module):
    """Parallel attention heads whose outputs are concatenated and projected to n_embd."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        built_heads = []
        for _ in range(num_heads):
            built_heads.append(Head(head_size))
        self.heads = nn.ModuleList(built_heads)
        total_width = head_size * num_heads
        self.proj = nn.Linear(total_width, n_embd).to(device)
        self.dropout = nn.Dropout(dropout).to(device)

    def forward(self, x):
        head_outputs = [single(x) for single in self.heads]
        stacked = torch.cat(head_outputs, dim=-1)  # join along the channel axis
        mixed = self.proj(stacked)
        return self.dropout(mixed)

class FeedForward(nn.Module):
    """Two-layer MLP applied per position: widen 4x, ReLU, narrow back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        inner = 4 * n_embd
        stages = [
            nn.Linear(n_embd, inner).to(device),
            nn.ReLU(),
            nn.Linear(inner, n_embd).to(device),
            nn.Dropout(dropout).to(device),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        result = self.net(x)
        return result

class Block(nn.Module):
    """Transformer block: layer-normed self-attention and feed-forward, each with a residual add."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        width_per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, width_per_head).to(device)
        self.ffwd = FeedForward(n_embd).to(device)
        self.ln1 = nn.LayerNorm(n_embd).to(device)
        self.ln2 = nn.LayerNorm(n_embd).to(device)

    def forward(self, x):
        attn_out = self.sa(self.ln1(x))
        x = x + attn_out
        mlp_out = self.ffwd(self.ln2(x))
        return x + mlp_out

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over the SentencePiece vocabulary.

    Sizes come from the module-level hyperparameters (``n_embd``, ``n_head``,
    ``n_layer``, ``block_size``) and the vocabulary size reported by ``sp``.
    """

    def __init__(self):
        super().__init__()
        vocab_size = sp.get_piece_size()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd).to(device)
        self.position_embedding_table = nn.Embedding(block_size, n_embd).to(device)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head).to(device) for _ in range(n_layer)]).to(device)
        self.ln_f = nn.LayerNorm(n_embd).to(device)
        self.lm_head = nn.Linear(n_embd, vocab_size).to(device)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids, with T at most ``block_size``.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab) and
            loss is None. With targets, logits are flattened to (B*T, vocab)
            and loss is the mean cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Positions follow the input's device rather than the global `device`,
        # so a model moved after construction still runs.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=100):
        """Autoregressively sample ``max_new_tokens`` tokens after the prompt ``idx``.

        Runs without autograd and in eval mode; the previous train/eval mode is
        restored before returning.

        Args:
            idx: (B, T) tensor of prompt token ids.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Only the last block_size tokens fit the position-embedding table.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # keep the prediction for the next token
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

# Build the model on the selected device and print its parameter count in millions.
model = GPTLanguageModel().to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# AdamW over all model parameters with the configured learning rate and weight decay.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Checkpointing function
def save_checkpoint(model, optimizer, epoch, path, loss):
    """Write a training checkpoint with ``torch.save``.

    Args:
        model: object exposing ``state_dict()``.
        optimizer: object exposing ``state_dict()``.
        epoch: progress marker stored under 'epoch' (the training loop passes its step index).
        path: destination accepted by ``torch.save`` (file path or file-like object).
        loss: best validation loss so far; a number or a 0-dim tensor.
    """
    # Plain float so the checkpoint does not embed a (possibly CUDA) tensor.
    loss_value = float(loss)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss_value,
        # Also written under the key that load_checkpoint in this notebook reads.
        'best_val_loss': loss_value,
    }, path)

# Training with mixed precision
scaler = torch.cuda.amp.GradScaler()
best_val_loss = float('inf')
early_stop_counter = 0  # consecutive evaluations without a better validation loss

# The counter is called `step` rather than `iter` so the builtin iter() is not
# shadowed for every cell that runs later in the same kernel.
for step in range(max_iters):
    if step % eval_interval == 0:
        losses = estimate_loss_and_perplexity()
        print(f"Step {step}: train loss {losses['train'][0]:.4f}, val loss {losses['val'][0]:.4f}, val perplexity {losses['val'][1]:.4f}")

        # Checkpoint whenever the validation loss improves; otherwise count towards early stopping.
        if losses['val'][0] < best_val_loss:
            best_val_loss = losses['val'][0]
            save_checkpoint(model, optimizer, step, "best_model.pt", best_val_loss)
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping.")
                break

    model.train()
    xb, yb = get_batch('train')

    optimizer.zero_grad()
    # Forward pass under autocast; the scaler handles the scaled backward pass and the step.
    with torch.cuda.amp.autocast():
        logits, loss = model(xb, yb)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

# Demo: sample 100 new tokens after the prompt and print the decoded text.
context = "Good doctor"
input_ids = torch.tensor(encode(context), dtype=torch.long).unsqueeze(0).to(device)  # shape (1, T)
generated_ids = model.generate(input_ids, max_new_tokens=100)[0].tolist()
generated_text = decode(generated_ids)
print(generated_text)


0.336624 M parameters
Step 0: train loss 9.2110, val loss 9.2115, val perplexity 10011.1559
Step 1000: train loss 7.6166, val loss 7.6235, val perplexity 2045.6615
Step 2000: train loss 6.6963, val loss 6.6888, val perplexity 803.3978


KeyboardInterrupt: 

In [None]:
def generate_story(prompt, num_sentences=5):
    """Sample a continuation of `prompt` and return its leading sentences on one line.

    Generates 500 new tokens, decodes them, splits the text on '. ' and keeps
    at most `num_sentences` pieces, re-joined with '. ' plus a final '.'.
    """
    prompt_ids = encode(prompt)
    context = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    sampled = model.generate(context, max_new_tokens=500)
    story_text = decode(sampled[0].tolist())
    kept = story_text.split('. ')[:num_sentences]
    return '. '.join(kept) + '.'

# Demo: print a sampled story of up to five sentences for the prompt 'Good doctor'.
print(generate_story('Good doctor'))