In [1]:
!pip install -U datasets



In [2]:
from datasets import load_dataset

#  Correct approach
dataset = load_dataset(
    "roneneldan/TinyStories",
    split="train"  # Use "default" config implicitly
)

# Verify the dataset
print(dataset[0]["text"])  # Should output a story

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.


In [3]:
from google.colab import drive  # Required in every new session
from datasets import load_from_disk

drive.mount('/content/drive')  # Re-authenticate (tiny bandwidth)
from datasets import load_dataset

dataset = load_dataset("roneneldan/TinyStories", split="train")  # Smaller subset for testing


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:

#  Display the first story sample (NO INTERNET REQUIRED)
print("After restarting session:")
print(dataset[0]["text"][:100] + "...")


After restarting session:
One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with...


In [5]:
# Required Libraries
import torch
import torch.nn as nn
from torch.nn import functional as F
from datasets import load_from_disk
from google.colab import drive
import numpy as np
import os
from glob import glob

In [6]:
# Hyperparameters
batch_size = 16
block_size = 32
max_iters = 10000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0

cuda


In [7]:
# Set random seed
torch.manual_seed(1337)
# 2. Load only 75% of the dataset
n = int(0.75 * len(dataset))
dataset = dataset.select(range(n))

In [8]:
# 3. Concatenate and Tokenize
text = "".join(dataset[i]["text"] for i in range(len(dataset)))
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [9]:
# 4. Encode and save in chunks
output_dir = "/content/encoded_chunks"
os.makedirs(output_dir, exist_ok=True)

chunk_size = 1000
counter = 0
for i in range(0, len(dataset), chunk_size):
    chunk = [dataset[j]["text"] for j in range(i, min(i + chunk_size, len(dataset)))]
    encoded = [encode(text) for text in chunk]
    flat = [item for sublist in encoded for item in sublist]
    chunk_tensor = torch.tensor(flat, dtype=torch.int32)
    torch.save(chunk_tensor, os.path.join(output_dir, f"chunk_{counter}.pt"))
    counter += 1

In [10]:
# 5. Combine into single binary file
final_output_path = "/content/full_encoded_data.bin"
with open(final_output_path, 'wb') as f:
    chunk_paths = sorted(glob(os.path.join(output_dir, "chunk_*.pt")))
    total_tokens = 0
    for path in chunk_paths:
        chunk = torch.load(path)
        f.write(chunk.numpy().astype('int32').tobytes())
        total_tokens += chunk.numel()

In [11]:
# 6. Split into train/val
split_point = int(0.9 * total_tokens)
train_path = "/content/train_data.bin"
val_path = "/content/val_data.bin"
with open(final_output_path, 'rb') as f:
    train_bytes = f.read(split_point * 4)
    with open(train_path, 'wb') as train_file:
        train_file.write(train_bytes)
    val_bytes = f.read()
    with open(val_path, 'wb') as val_file:
        val_file.write(val_bytes)

In [12]:
# 7. Memmap train/val
train_data = np.memmap(train_path, dtype=np.int32, mode='r')
val_data = np.memmap(val_path, dtype=np.int32, mode='r')

# 8. Data loader

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size - 1, (batch_size,))
    x = torch.stack([torch.tensor(data[i:i+block_size], dtype=torch.long) for i in ix])
    y = torch.stack([torch.tensor(data[i+1:i+block_size+1], dtype=torch.long) for i in ix])
    return x.to(device), y.to(device)


In [13]:
def estimate_loss():
    out = {}
    model.eval()
    with torch.no_grad():  # Add no_grad context here
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            perplexities = torch.zeros(eval_iters)  # Store perplexities
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = model(X, Y)
                losses[k] = loss.item()
                # Calculate perplexity (exponential of the loss)
                perplexities[k] = torch.exp(torch.tensor(loss.item()))
            out[split] = {
                'loss': losses.mean(),
                'perplexity': perplexities.mean()  # Add perplexity to output
            }
    model.train()
    return out

In [14]:
# 9. Model definition
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

In [15]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

In [16]:
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [17]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [18]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is None:
            loss = None
        else:
            logits = logits.view(B*T, -1)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [20]:
# 10. Instantiate model
model = BigramLanguageModel().to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

# 11. Training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']['loss']:.4f}, val loss {losses['val']['loss']:.4f}")
        print(f"step {iter}: train ppl {losses['train']['perplexity']:.2f}, val ppl {losses['val']['perplexity']:.2f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

0.223145 M parameters
step 0: train loss 5.2347, val loss 5.2343
step 0: train ppl 187.75, val ppl 187.67
step 100: train loss 2.5309, val loss 2.5169
step 100: train ppl 12.60, val ppl 12.41
step 200: train loss 2.3241, val loss 2.3244
step 200: train ppl 10.24, val ppl 10.24
step 300: train loss 2.2350, val loss 2.2309
step 300: train ppl 9.36, val ppl 9.32
step 400: train loss 2.1414, val loss 2.1367
step 400: train ppl 8.53, val ppl 8.49
step 500: train loss 2.0628, val loss 2.0582
step 500: train ppl 7.88, val ppl 7.85
step 600: train loss 1.9853, val loss 1.9873
step 600: train ppl 7.30, val ppl 7.31
step 700: train loss 1.9358, val loss 1.9362
step 700: train ppl 6.95, val ppl 6.95
step 800: train loss 1.8485, val loss 1.8469
step 800: train ppl 6.37, val ppl 6.36
step 900: train loss 1.8145, val loss 1.8208
step 900: train ppl 6.16, val ppl 6.20
step 1000: train loss 1.7654, val loss 1.7644
step 1000: train ppl 5.86, val ppl 5.85
step 1100: train loss 1.7295, val loss 1.7261
st

In [21]:
# 12. Generate text
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=5000)[0].tolist()))

	here. It's train again. So the er adventually. Everywhere you, Emmon.Once one day, Lily how Ben. Every and frastled and great in rack, she cand up. Mit was so harp her to the little mounce mean, she down. If things People langes acane to met started her doll. Then hugs the flower and flew away named Lily are very went's get about hand with her numbled and take her now and take it's place broker back and make a book nicel?" Daisy got to "My, make to the penny, he see they flew him away many helpider lemong and real your both brick, the king scared fun with her whenered the lessors. They played to called her graceful facen. It tide a drecided to help from decided the mair.Once upon a the sang. But It was oa too palace the land?" 

The parents too. They say for being, Mar. Win to real incret hewa. When the ways him and prayed with and aftered! The nice was too long to Mom a could. Ben said,

Mayally, Mom, it is not ing. The dog true after her her tade you can was so she could baby fight.

In [29]:
# Save model weights (PyTorch format)
torch.save(model.state_dict(), 'model_weights.pth')

# Save vocabulary and metadata (JSON format)
data_to_save = {
    'vocab': {
        'stoi': stoi,  # Your string-to-index mapping
        'itos': itos,  # Your index-to-string mapping
    },
    'metrics': {  # Optional training stats
        'train_loss': losses['train']['loss'].item(),
        'val_loss': losses['val']['loss'].item(),
        'train_ppl': losses['train']['perplexity'].item(),
        'val_ppl': losses['val']['perplexity'].item(),
    }
}

with open('model_vocab_metrics.json', 'w') as f:
    json.dump(data_to_save, f, indent=4)

print("Saved: model_weights.pth + model_vocab_metrics.json")

Saved: model_weights.pth + model_vocab_metrics.json


In [33]:
# ==================== LOAD MODEL + VOCAB + METRICS ====================
import torch
import json

# Load model architecture
model = BigramLanguageModel().to(device)

# Load trained weights
model.load_state_dict(torch.load('model_weights.pth'))

# Load vocabulary and training metrics
with open('model_vocab_metrics.json', 'r') as f:
    saved_data = json.load(f)

# Extract vocab mappings
stoi = saved_data['vocab']['stoi']
itos = saved_data['vocab']['itos']

# Define robust encode/decode functions
def encode(s: str) -> list[int]:
    """Encode string to token IDs, skipping unknown characters."""
    return [stoi[c] for c in s if c in stoi]

def decode(l: list[int]) -> str:
    """Decode token IDs to string, replacing unknown IDs with '<?>'."""
    return ''.join([itos.get(str(i), '<?>') for i in l])  # Ensure string key

# Access training metrics
metrics = saved_data.get('metrics', {})
print(f"Loaded model with vocab size {len(stoi)} | Val loss: {metrics.get('val_loss', 'N/A')}")
print(f"First 5 vocab items: {dict(list(stoi.items())[:5])}")

# Test encoding and decoding
test_text = ''.join([c for c in "Hello Word" if c in stoi])
if test_text:
    encoded = encode(test_text)
    decoded = decode(encoded)
    print(f"Test encode/decode: '{test_text}' -> {encoded} -> '{decoded}'")
else:
    print("Warning: Test text contains no known characters from vocabulary!")


Loaded model with vocab size 169 | Val loss: 1.245513677597046
First 5 vocab items: {'\t': 0, '\n': 1, ' ': 2, '!': 3, '"': 4}
Test encode/decode: 'Hello Word' -> [42, 70, 77, 77, 80, 2, 57, 80, 83, 69] -> 'Hello Word'
