In [None]:
import os
import requests
import torch
import numpy as np
import torch.optim.adamw
from GearedMLA import Model
import bitsandbytes
import json
from torch import GradScaler

# Let float32 matmuls use faster, reduced-precision kernels where the GPU supports them.
torch.set_float32_matmul_precision('high')

# Tiny Shakespeare corpus, cached next to the notebook after the first download.
input_file_path = 'input.txt'
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    # Download and validate BEFORE creating the file: otherwise a failed request leaves an
    # empty (or error-page) input.txt that every later run treats as a valid cache.
    response = requests.get(data_url, timeout=60)
    response.raise_for_status()
    with open(input_file_path, 'w', encoding='utf-8') as f:
        f.write(response.text)

with open(input_file_path, "r", encoding="utf-8") as f:
    data = f.read()

# Corpus size in characters.
print(len(data))

import os
import json

import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
from transformers import AutoTokenizer

# Custom tokenizer saved under ./tokenizers/ (trained in a later cell of this notebook).
# The stock alternative would be: AutoTokenizer.from_pretrained("gpt2")
save_dir = os.path.join(os.getcwd(), f"tokenizers/E11a5_Tokenizer{2*8192-4}")
tokenizer = AutoTokenizer.from_pretrained(save_dir)

# Vocabulary size handed to the model (16384).
vocab_size = 8192 * 2

# Special-token ids of the custom tokenizer.
pad_id = 16380
eos_id = 16382

# The input_ids-only dataset (random-access, on-disk) is only iterated when the token
# cache has to be rebuilt:
# ds = load_from_disk("input_ids_only")["train"]

# Pick the narrowest unsigned dtype able to hold every token id.
max_token_id = 16383
if max_token_id <= np.iinfo(np.uint16).max:
    dtype = np.uint16
else:
    dtype = np.uint32

# Flat binary token cache and its JSON sidecar.
bin_path = f"tokens_{np.dtype(dtype).name}.bin"
meta_path = "tokens_meta.json"

# Build the flat token cache once; later runs only re-open it.
# The metadata file is written last, so an interrupted build is redone on the next run.
if not (os.path.exists(bin_path) and os.path.exists(meta_path)):
    # `ds` was only ever assigned in a commented-out line, so a cache miss used to die with
    # NameError. Load the dataset here, where it is actually needed.
    ds = load_from_disk("input_ids_only")["train"]

    # Pass 1: count total length (strip pads, add EOS per example)
    total_len = 0
    for ex in tqdm(ds, desc="Counting tokens"):
        ids = ex["input_ids"]
        if pad_id is not None:
            total_len += sum(1 for t in ids if t != pad_id) + 1
        else:
            total_len += len(ids) + 1

    mm = np.memmap(bin_path, mode="w+", dtype=dtype, shape=(total_len,))
    pos = 0

    # Pass 2: write tokens
    for ex in tqdm(ds, desc="Writing memmap"):
        # Build a fresh list so the dataset's own row is never mutated.
        ids = [t for t in ex["input_ids"] if pad_id is None or t != pad_id]
        ids.append(eos_id)
        L = len(ids)
        mm[pos:pos+L] = np.asarray(ids, dtype=dtype)
        pos += L
    mm.flush()
    del mm  # release the writable mapping before re-opening read-only

    # Both passes must agree, otherwise the cache would contain unwritten (zero) tokens.
    if pos != total_len:
        raise RuntimeError(f"token cache size mismatch: wrote {pos}, expected {total_len}")

    with open(meta_path, "w") as f:
        json.dump({"length": int(total_len), "dtype": str(np.dtype(dtype)), "eos_id": int(eos_id), "pad_id": int(pad_id) if pad_id is not None else None}, f)

# Re-open memmap for reading
with open(meta_path, "r") as f:
    meta = json.load(f)
N = int(meta["length"])  # total number of tokens in the cache
tokens_1d = np.memmap(bin_path, mode="r", dtype=dtype, shape=(N,))

def get_batch():
    """Sample a random batch of (input, target) windows from the token memmap.

    Reads the module-level `tokens_1d`, `N`, `blocksize` and `batchSize`.

    Returns:
        (X, Y): two int64 tensors of shape (batchSize, blocksize), moved to "cuda".
        Each row of Y is the matching row of X shifted one token forward in the stream.
    """
    starts = torch.randint(0, N - blocksize - 1, (batchSize,)).tolist()
    X = torch.empty((batchSize, blocksize), dtype=torch.long)
    Y = torch.empty((batchSize, blocksize), dtype=torch.long)
    for row, start in enumerate(starts):
        # One read of blocksize + 1 tokens covers both the inputs and the shifted targets.
        window = np.asarray(tokens_1d[start:start + blocksize + 1], dtype=np.int64)
        X[row] = torch.from_numpy(window[:-1])
        Y[row] = torch.from_numpy(window[1:])
    return X.to("cuda", non_blocking=True), Y.to("cuda", non_blocking=True)



# ##data = "data"
# vocab = sorted(list(set(data)))
# vocab_size = len(vocab)

# print(vocab.index('t'))

# tokens = [vocab.index(v) for i, v in enumerate(data)]

# print(tokens[:40])
# #print(vocab)

# Token budget per optimizer step (262,144 tokens).
TOKSEEN = 524288 // 2

# Sequence length, number of optimizer steps, micro-batch size.
blocksize = 1024
maxSteps = 6000
batchSize = 16

# Micro-batches accumulated per optimizer step so that one step covers TOKSEEN tokens.
gradAccum = TOKSEEN // (blocksize * batchSize)

# Model shape.
n_head = 8
n_layers = 48  # previously 32
n_embd = 512

import math

def getLR(step, max_steps, base_lr=1e-3, min_lr=1e-5, warmup_steps=1000):
    """Learning rate for `step`: linear warmup followed by cosine decay.

    Args:
        step: Current optimizer step (0-based).
        max_steps: Step at which the cosine decay reaches `min_lr`.
        base_lr: Peak learning rate, reached at the end of warmup.
        min_lr: Floor the cosine decay ends at.
        warmup_steps: Number of steps over which the rate rises linearly from 0.

    Returns:
        The learning rate as a float.
    """
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    decay_steps = max_steps - warmup_steps
    if decay_steps == 0:
        # Warmup fills the whole schedule; treat every later step as "fully decayed"
        # instead of dividing by zero.
        progress = 1.0
    else:
        progress = (step - warmup_steps) / decay_steps
    # Clamp so steps beyond max_steps stay at min_lr rather than climbing back up the cosine.
    progress = min(max(progress, 0.0), 1.0)
    cosine_decay = 0.5 * (1 + math.cos(math.pi * progress))
    lr = min_lr + (base_lr - min_lr) * cosine_decay
    return lr


# Variant without the extra keyword options (labelled "MHA" by the author), kept for reference:
# model = Model(n_layers, n_embd, n_head, vocab_size, blocksize).to("cuda")
print("compiling model....")
model = Model(
    n_layers,
    n_embd,
    n_head,
    vocab_size,
    blocksize,
    LCompression=128,
    flashATTN=True,
    attn_dropout=0.3,
)
model = model.to("cuda")
model = torch.compile(model)
torch.cuda.empty_cache()

# Total number of scalar parameters across all model tensors.
tot = sum(param.numel() for param in model.parameters())

print(f"total params: {tot//1e6}M params")

# 8-bit AdamW (bitsandbytes) alternative, kept for reference:
# optimizer = bitsandbytes.optim.AdamW8bit(
#     model.parameters(),
#     lr=3e-4,
#     min_8bit_size=4096,       # not 512
#     percentile_clipping=0,    # avoid extra kernel
# )
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, fused=True)
import wandb
wandb.init()
torch.manual_seed(69696969)
torch.cuda.reset_peak_memory_stats()
import time
for i in range(maxSteps):
    xo = time.time()

    # Apply the schedule BEFORE the update. Setting it after optimizer.step() made step 0
    # run at the constructor's peak LR (no warmup) and every logged LR lag one step behind
    # the LR that was actually used.
    lr = getLR(i, maxSteps, 5e-4, 5e-5, 20)
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

    optimizer.zero_grad(set_to_none=True)
    for _ in range(gradAccum):
        x, y = get_batch()
        with torch.amp.autocast("cuda", dtype=torch.bfloat16):
            out, loss = model(x, y)
        # Scale each micro-batch so the accumulated gradient is the MEAN over the
        # gradAccum micro-batches rather than their sum.
        (loss / gradAccum).backward()
    optimizer.step()

    # Wait for the GPU before reading the clock; CUDA launches are asynchronous, so timing
    # before the sync under-reports the step time.
    torch.cuda.synchronize()

    if i % 10 == 0:
        Mmem = (torch.cuda.max_memory_allocated() / 1e9)
        t1 = time.time() - xo
        tleft = t1 * (maxSteps-i)
        # Loss of the last micro-batch of this step (unscaled); read once for both sinks.
        loss_value = loss.item()
        wandb.log({
            "Loss": loss_value,
            "lr": lr,
            "t/step:": t1,
            "peak mem": Mmem,
            "T-left": tleft
        }, step=i)
        print(f"step: {i}/{maxSteps}, loss: {loss_value:.6f}, {time.localtime()}, t/step: {t1:.4f}s, time left: {tleft:.2f}, LR: {lr:.2e}, peak mem: {Mmem:.2f} GB")
# Save model weights
# `model` is the torch.compile wrapper; its state_dict keys carry an "_orig_mod." prefix that
# a freshly constructed (uncompiled) Model cannot load. Save the wrapped module's weights
# instead; fall back to `model` itself if it was never compiled.
model_to_save = getattr(model, "_orig_mod", model)
torch.save(model_to_save.state_dict(), "E11a5.pt")

# Save config
# NOTE: LCompression / flashATTN / attn_dropout are literals and must be kept in sync by hand
# with the arguments passed to Model(...) when the model was built.
config = {
    "n_layers": n_layers,
    "n_embd": n_embd,
    "n_head": n_head,
    "vocab_size": vocab_size,
    "blocksize": blocksize,
    "LCompression": 128,
    "flashATTN": True,
    "attn_dropout": 0.3,
    "tokenizer": "Custom_gpt2"
}
with open("model_config.json", "w") as f:
    json.dump(config, f, indent=2)


from torch import nn

# Number of tokens to sample; also used for the throughput figure (which previously
# divided a hard-coded 200 by the time taken for 400 tokens).
num_new_tokens = 400

# Start from token id 0 as a 1x1 prompt.
generated = torch.tensor([0], device="cuda").unsqueeze(0)

# Inference mode: disable dropout and skip autograd bookkeeping.
model.eval()
t0 = time.time()
with torch.no_grad():
    for i in range(num_new_tokens):
        # Assumes the model returns only the logits when called without targets
        # (the original code indexes the result the same way) - TODO confirm.
        out = model(generated)
        out = out[:,-1,:]  # logits for the last position

        out = nn.functional.softmax(out, dim=-1)
        predchar = torch.multinomial(out, num_samples=1)

        generated = torch.concat([generated, predchar], dim=-1)

torch.cuda.synchronize()  # make sure queued GPU work is included in the timing
t1 = time.time() - t0
print(f"time to generate: {t1}, tokens per second: {num_new_tokens/t1}")

outText = tokenizer.decode(generated.squeeze(0))
print(outText)

1115394
compiling model....
total params: 142.0M params


wandb: Currently logged in as: elias-dovkrans (mamichul) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


step: 0/6000, loss: 9.822682, 1754920620.030256, t/step: 18.1936s, time left: 109161.83, LR: 0.000 1e-5, peak mem: 17.96 GB
step: 10/6000, loss: 8.579470, 1754920730.8932679, t/step: 11.4071s, time left: 68328.53, LR: 25.000 1e-5, peak mem: 19.10 GB
step: 20/6000, loss: 7.669939, 1754920842.7234156, t/step: 11.0061s, time left: 65816.63, LR: 50.000 1e-5, peak mem: 19.10 GB
step: 30/6000, loss: 7.449625, 1754920954.7166407, t/step: 10.9835s, time left: 65571.53, LR: 50.000 1e-5, peak mem: 19.10 GB
step: 40/6000, loss: 7.563025, 1754921065.6819263, t/step: 10.9845s, time left: 65467.71, LR: 49.999 1e-5, peak mem: 19.10 GB
step: 50/6000, loss: 7.490933, 1754921178.4531906, t/step: 11.3167s, time left: 67334.13, LR: 49.997 1e-5, peak mem: 19.10 GB
step: 60/6000, loss: 7.528048, 1754921292.960763, t/step: 11.3386s, time left: 67351.41, LR: 49.995 1e-5, peak mem: 19.10 GB
step: 70/6000, loss: 7.503152, 1754921405.515489, t/step: 10.8964s, time left: 64615.86, LR: 49.992 1e-5, peak mem: 19.10

In [None]:

from torch import nn

torch.cuda.empty_cache()

# Number of tokens to sample; also used for the throughput figure (which previously
# divided a hard-coded 200 by the time taken for 400 tokens).
num_new_tokens = 400

# Prompt: the tokenizer's encoding of the "<|bos|>" marker, as a batch of one.
generated = torch.tensor(tokenizer.encode("<|bos|>"), device="cuda").unsqueeze(0)

# Inference mode: disable dropout and skip autograd bookkeeping.
model.eval()
t0 = time.time()
with torch.no_grad():
    for i in range(num_new_tokens):
        # Assumes the model returns only the logits when called without targets
        # (the original code indexes the result the same way) - TODO confirm.
        out = model(generated)
        out = out[:,-1,:]  # logits for the last position

        out = nn.functional.softmax(out, dim=-1)
        predchar = torch.multinomial(out, num_samples=1)

        generated = torch.concat([generated, predchar], dim=-1)

torch.cuda.synchronize()  # make sure queued GPU work is included in the timing
t1 = time.time() - t0
print(f"time to generate: {t1}, tokens per second: {num_new_tokens/t1}")

outText = tokenizer.decode(generated.squeeze(0))
print(outText)

In [None]:
from datasets import load_dataset

# Load the "sample-10BT" configuration of FineWeb-Edu; later cells read its "train" split.
fw = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT")

In [None]:

from transformers import AutoTokenizer


# Stock GPT-2 tokenizer; a later cell retrains a tokenizer with a smaller vocabulary.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# retrain







In [None]:
import os

from transformers import AutoTokenizer, GPT2TokenizerFast

# Vocabulary size requested from the tokenizer trainer (16380); change as needed.
target_vocab_size = 2 * 8192 - 4

# Training split of the `fw` dataset loaded in an earlier cell.
stream = fw["train"]

def batch_iterator_hf_sample(dataset_stream, chars_per_batch=10000, text_column="text", max_examples=100000):
    """Yield space-joined chunks of text drawn from an iterable of records.

    Args:
        dataset_stream: Iterable of mappings that each contain `text_column`.
        chars_per_batch: A chunk is emitted once its accumulated text length reaches
            this many characters (joining spaces are not counted).
        text_column: Key holding the text of each record.
        max_examples: Stop after this many non-empty records have been consumed.

    Yields:
        str: The stripped texts of one chunk joined by single spaces. Records whose
        text is empty after stripping are skipped and do not count towards the limit.
    """
    pending = []
    pending_chars = 0
    taken = 0
    for record in dataset_stream:
        if taken >= max_examples:
            break
        text = record[text_column].strip()
        if text:
            pending.append(text)
            pending_chars += len(text)
            taken += 1
            if pending_chars >= chars_per_batch:
                yield " ".join(pending)
                pending = []
                pending_chars = 0
    # Flush whatever is left in a final, possibly short, chunk.
    if pending:
        yield " ".join(pending)

# Iterator over a local training-text file; pass its path explicitly (e.g. `input_file_path` from the first cell), since the default path="" cannot be opened.
def batch_iterator(path="", chars_per_batch=10000):
    """Yield space-joined chunks of the non-empty lines of a UTF-8 text file.

    Args:
        path: File to read.
        chars_per_batch: A chunk is emitted once its accumulated line length reaches
            this many characters (joining spaces are not counted).

    Yields:
        str: The lines of one chunk, without trailing newlines, joined by single spaces.
    """
    with open(path, "r", encoding="utf-8") as handle:
        chunk = []
        chunk_chars = 0
        for raw_line in handle:
            text = raw_line.rstrip("\n")
            if text:
                chunk.append(text)
                chunk_chars += len(text)
                if chunk_chars >= chars_per_batch:
                    yield " ".join(chunk)
                    chunk = []
                    chunk_chars = 0
        # Flush whatever is left in a final, possibly short, chunk.
        if chunk:
            yield " ".join(chunk)


# Train a new tokenizer with the GPT-2 pipeline on up to 1,000,000 documents of `stream`.
base_tok = GPT2TokenizerFast.from_pretrained("gpt2")
new_tok = base_tok.train_new_from_iterator(batch_iterator_hf_sample(stream, max_examples=1000000), vocab_size=target_vocab_size)
print("training complete")
# Alternative: train on a local text file instead of the dataset stream:
# new_tok = base_tok.train_new_from_iterator(batch_iterator(input_file_path), vocab_size=target_vocab_size)

# Ensure a pad token exists (GPT-2 doesn't have one by default), plus explicit bos/eos markers.
special_tokens_dict = {
    "pad_token": "<|pad|>",
    "bos_token": "<|bos|>",
    "eos_token": "<|eos|>"
}
new_tok.add_special_tokens(special_tokens_dict)

# Practical max length for this use case. The previous try/except NameError around this
# assignment was dead code: assigning a literal cannot raise NameError.
new_tok.model_max_length = 1024

# Save tokenizer
save_dir = os.path.join(os.getcwd(), f"tokenizers/E11a5_Tokenizer{target_vocab_size}")
os.makedirs(save_dir, exist_ok=True)
new_tok.save_pretrained(save_dir)
print(f"Tokenizer saved to: {save_dir}")

# Load it back and use it
tokenizer = AutoTokenizer.from_pretrained(save_dir)
# len(tokenizer) includes the special tokens added above; `tokenizer.vocab_size` does not,
# so it would under-report the number of ids the model has to cover.
print("New tokenizer vocab size:", len(tokenizer))

# Optional: re-encode your data with the new tokenizer
# tokens = tokenizer.encode(data)
# vocab_size = len(tokenizer)

In [None]:
import os

from transformers import AutoTokenizer

# Reload the custom tokenizer previously saved under ./tokenizers/.
save_dir = os.path.join(os.getcwd(), f"tokenizers/E11a5_Tokenizer{2*8192-4}")
tokenizer = AutoTokenizer.from_pretrained(save_dir)

In [None]:
# Tokenize the entire dataset
# Per-process cache: each worker loads the tokenizer from disk once instead of once per batch.
_TOKENIZER_CACHE = {}


def tokenize_function(example):
    """Tokenize a batch of examples with the custom tokenizer.

    Imports and the tokenizer load are kept inside the function so it stays self-contained
    when run in `map(..., num_proc=...)` worker processes.

    Args:
        example: Mapping with a "text" entry (a list of strings when used with batched=True).

    Returns:
        The tokenizer output, truncated and padded to exactly 1024 tokens per text.
    """
    import os
    target_vocab_size = ((8192 * 2) - 4)  # change as needed

    save_dir = os.path.join(os.getcwd(), f"tokenizers/E11a5_Tokenizer{target_vocab_size}")

    tokenizer = _TOKENIZER_CACHE.get(save_dir)
    if tokenizer is None:
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(save_dir)
        _TOKENIZER_CACHE[save_dir] = tokenizer
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=1024)

# Apply tokenization (batched for speed, 4 worker processes)
tokenized_dataset = fw.map(tokenize_function, batched=True, num_proc=4)

# Now tokenized_dataset contains tokenized inputs.
# len() of a DatasetDict is the number of splits, not examples; report rows per split instead.
print("done: ", tokenized_dataset.num_rows)

In [None]:
# Persist the tokenized dataset to a local directory so later cells can reuse it
# without re-tokenizing.
save_path = "tokenized_fineweb_edu"
tokenized_dataset.save_to_disk(save_path)
print(f"Tokenized dataset saved to: {save_path}")

In [None]:
# Token ids of the digits 0-9, concatenated in order (tokenizer sanity check).
dd = []
for i in range(10):
    dd.extend(tokenizer.encode(str(i)))
    print(i)

print(dd)

In [None]:
# Round-trip check: decode the token ids collected in `dd` back into text.
f = tokenizer.decode(dd)
print(f)

In [None]:
# 16384: the `vocab_size` value used in the training cell.
8192*2

In [None]:
# Show which special tokens the currently loaded tokenizer defines.
tokenizer.special_tokens_map

In [None]:
# `tokenizer.vocab_size` excludes added tokens, so it prints the same number before and
# after; len(tokenizer) counts the full vocabulary including added special tokens.
print("Before:", len(tokenizer))
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
print("After:", len(tokenizer))

In [None]:
# Dump every training example as one line of space-separated token ids.
with open("input_ids.txt", "w", encoding="utf-8") as f:
    f.writelines(
        " ".join(str(token_id) for token_id in example["input_ids"]) + "\n"
        for example in tokenized_dataset["train"]
    )

In [None]:
# Every example was padded to max_length=1024, so len(input_ids) is always 1024 and summing
# it counts padding too. The attention mask is 1 for real tokens and 0 for padding, so its
# sum is the number of real tokens.
total_tokens = sum(sum(example["attention_mask"]) for example in tokenized_dataset["train"])
print("Total tokens:", total_tokens)

In [None]:
# 9,672,101 (presumably the number of training examples - TODO confirm) times 1024
# positions each, in billions; this figure includes padding positions.
print(f"{(9672101 * 1024 / 1e9):.2f} B")

In [None]:
# A cell renders only its final expression, so the earlier bare lookups were silently
# discarded. Collect everything into one displayed value instead.
# Trailing comments are the ids the author recorded for each token.
{
    "special_tokens_map": tokenizer.special_tokens_map,
    "<|eos|>": tokenizer.encode("<|eos|>"),  # 16382
    "<|pad|>": tokenizer.encode("<|pad|>"),  # 16380
    "<|bos|>": tokenizer.encode("<|bos|>"),  # 16381
}



In [None]:
from datasets import load_dataset, load_from_disk

# "tokenized_fineweb_edu" was written with save_to_disk(), so it has to be read back with
# load_from_disk(); load_dataset() is meant for Hub repositories and raw data files.
fw = load_from_disk("tokenized_fineweb_edu")

In [None]:
# Keep only the 'input_ids' column: list every other column of the train split and drop it.
extra_columns = [name for name in fw["train"].column_names if name != "input_ids"]
input_ids_dataset = fw.remove_columns(extra_columns)

# Save to disk (Arrow format, fast and efficient)
input_ids_dataset.save_to_disk("input_ids_only")

In [None]:
from tqdm import tqdm

# One line per training example: its token ids followed by the tokenizer's EOS id.
eos_token_id = tokenizer.eos_token_id
with open("input_ids.txt", "w", encoding="utf-8") as f:
    for example in tqdm(fw["train"], desc="Writing input_ids"):
        ids = [*example["input_ids"], eos_token_id]
        f.write(" ".join(str(token_id) for token_id in ids) + "\n")

In [None]:
# Same dump as above, but from `tokenized_dataset` and with the EOS id hard-coded.
eos_token_id = 16382
with open("input_ids.txt", "w", encoding="utf-8") as f:
    for example in tokenized_dataset["train"]:
        ids = [*example["input_ids"], eos_token_id]
        line = " ".join(str(token_id) for token_id in ids)
        f.write(line + "\n")

In [None]:
import math

# Print the powers of two for exponents 0 through 23.
for i in range(24):
    print(i, 1 << i)