In [1]:
import json

def load_file(file_name):
    with open(file_name, 'r') as f:
        return json.load(f)
    

vocab_to_int = load_file("vocab_to_int.json")
int_to_vocab = load_file("int_to_vocab.json")

print(f"Loaded vocabulary contains {len(vocab_to_int)} items.")

Loaded vocabulary contains 262 items.


In [2]:
import re

def tokenize_delay(token_str):
    delay_value_str = token_str.split('#')[1].strip('|>')
    delay_value = float(delay_value_str)
    
    # Apply our binning rules to pick the final token
    if delay_value <= 1.0:
        final_token = '<|delay_short|>'
    elif delay_value <= 5.0:
        final_token = '<|delay_medium|>'
    else:
        final_token = '<|delay_long|>'
    
    return final_token

def custom_tokenizer(byte_stream: bytes, vocab: dict) -> list[int]:
    token_ids = []

    # This regex pattern is the core of the parser. It finds one of two things:
    # 1. (<\|[^|>]+?\|>): A full special token, like '<|client|>' or '<|delay#1.23|>'.
    # 2. (.): Any single character (byte) that is NOT part of a special token.
    # The rb'' prefix means the pattern is a raw *byte* pattern.
    pattern = re.compile(rb'(<\|[^|>]+?\|>)|(.)')

    for match in pattern.finditer(byte_stream):
        special_token_bytes, byte_char = match.groups()

        if special_token_bytes:
            token_str = special_token_bytes.decode('utf-8')

            if token_str.startswith('<|delay#'):
                token_id = vocab[tokenize_delay(token_str)]
            else:
                token_id = vocab.get(token_str)
            
            token_ids.append(token_id)

        elif byte_char:
            byte_value = int.from_bytes(byte_char, 'big')
            token_ids.append(byte_value)


    return token_ids

# test the tokenizer
sample_data = b'<|client|><|delay#0.85|>\x48\x65\x6c\x6c\x6f<|server|><|delay#9.9|>\x57\x6f\x72\x6c\x64'
tokenized_output = custom_tokenizer(sample_data, vocab_to_int)
actual = len(tokenized_output)
expected = 14
assert actual == expected, f'Expected {expected}, Actual {actual}'

In [3]:
import torch
from torch.utils.data import Dataset

class ByteStreamDataset(Dataset):
    def __init__(self, paths, vocab):
        self.file_paths = paths
        self.vocab = vocab

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        file_path = self.file_paths[idx]
        with open(file_path, 'rb') as f:
            raw_byte_stream = f.read()

        token_ids = custom_tokenizer(raw_byte_stream, self.vocab)

        return torch.tensor(token_ids, dtype=torch.long)

In [4]:
import torch

def collate_fn(batch, pad_value):
    max_len = max(len(seq) for seq in batch)

    input_ids = torch.full((len(batch), max_len), pad_value, dtype=torch.long)
    attention_mask = torch.zeros((len(batch), max_len), dtype=torch.long)
    
    for i, seq in enumerate(batch):
        input_ids[i, :len(seq)] = seq
        attention_mask[i, :len(seq)] = 1
        
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    }

In [5]:
def decode(tokens):
    return [int_to_vocab.get(str(t.item()), f"byte_{t.item()}") for t in tokens]

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import os
from pathlib import Path

data_dir = Path(r'C:\Users\exide\code\c-telnet-proxy')
file_pattern = r'*T*_*-*-*-*-*'
file_paths = sorted(data_dir.glob(file_pattern))
if not file_paths: raise FileNotFoundError

dataset = ByteStreamDataset(paths=file_paths, vocab=vocab_to_int)
pad_token_id = vocab_to_int['<|pad|>']

batch_size = 1
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=lambda batch: collate_fn(batch, pad_token_id))

print(f"Created a DataLoader with {len(dataset)} samples and a batch size of {batch_size}.")

try:
    # 'next(iter(data_loader))' gets the first batch
    batch = next(iter(data_loader))
    print("pulled a batch from the DataLoader")
except Exception as e:
    print(f"could not pull a batch: {e}")
    raise e

# 2. Check the batch structure (type and keys)
print("\n--- Checking Batch Structure ---")
assert isinstance(batch, dict), f"Batch should be a dict, but got {type(batch)}"
print("✅ [PASS] Batch is a dictionary.")

expected_keys = {'input_ids', 'attention_mask'}
assert set(batch.keys()) == expected_keys, f"Expected keys {expected_keys}, but got {set(batch.keys())}"
print(f"✅ [PASS] Batch has the correct keys: {list(batch.keys())}")

# 3. Check the tensor shapes
print("\n--- Checking Tensor Shapes ---")
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']

assert input_ids.shape == attention_mask.shape, "Shapes of input_ids and attention_mask must match!"
print(f"✅ [PASS] Tensors have matching shapes: {input_ids.shape}")
print(f"   (Batch Size, Max Sequence Length in this Batch)")

# 4. Check the input_ids value range (THE MOST IMPORTANT CHECK)
print("\n--- Checking Token ID Value Range ---")
vocab_size = model.config.vocab_size
min_id = input_ids.min()
max_id = input_ids.max()

print(f"   Model vocabulary size: {vocab_size}")
print(f"   Minimum ID in batch: {min_id}")
print(f"   Maximum ID in batch: {max_id}")

assert min_id >= 0, f"Found a negative token ID: {min_id}"
assert max_id < vocab_size, f"FATAL: Max ID ({max_id}) is out of bounds for vocab size ({vocab_size})!"
print("✅ [PASS] All token IDs are within the valid range [0, vocab_size - 1].")

# 5. Check the attention mask correspondence
print("\n--- Checking Attention Mask ---")
# We'll check the last sequence in the batch as an example
last_sequence_ids = input_ids[-1]
last_sequence_mask = attention_mask[-1]

# Find where the padding begins in this sample
# (This requires you to know your pad_token_id)
pad_indices = (last_sequence_ids == pad_token_id).nonzero(as_tuple=True)[0]

if len(pad_indices) > 0:
    first_pad_index = pad_indices[0].item()
    print(f"   Sample has padding, which starts at index: {first_pad_index}")
    # The token before padding should have a mask of 1
    if first_pad_index > 0:
        assert last_sequence_mask[first_pad_index - 1] == 1, "Mask should be 1 right before padding starts."
    # The first padding token should have a mask of 0
    assert last_sequence_mask[first_pad_index] == 0, "Mask should be 0 at the first padding token."
    print("✅ [PASS] Attention mask correctly marks padding with 0s (checked one sample).")
else:
    # If there's no padding, the mask should be all 1s
    assert last_sequence_mask.all(), "If there is no padding, mask should be all 1s."
    print("✅ [PASS] Sample has no padding and mask is all 1s, which is correct.")


print("\n\n🎉 --- Sanity Check Complete: DataLoader appears to be in a good state! --- 🎉")

# tests for the input tensor
input_ids = batch['input_ids']

print(f"\nShape of the batch tensor: {input_ids.shape}")
print(input_ids)

sequence_id = 0
first_sequence = input_ids[sequence_id]

first_50_tokens = first_sequence[:50]
print(f"\nSample {sequence_id} start: {decode(first_50_tokens)}...")

last_50_tokens = first_sequence[-50:]
print(f"\nSample {sequence_id} end: ...{decode(last_50_tokens)}")

# tests for the attention mask
attention_mask = batch['attention_mask']

print(f"\nShape of the attention_mask tensor: {attention_mask.shape}")
print(attention_mask[0, -50:])

Created a DataLoader with 72 samples and a batch size of 1.
pulled a batch from the DataLoader

--- Checking Batch Structure ---
✅ [PASS] Batch is a dictionary.
✅ [PASS] Batch has the correct keys: ['input_ids', 'attention_mask']

--- Checking Tensor Shapes ---
✅ [PASS] Tensors have matching shapes: torch.Size([1, 75657])
   (Batch Size, Max Sequence Length in this Batch)

--- Checking Token ID Value Range ---


NameError: name 'model' is not defined

In [28]:
from transformers import GPT2Config, AutoModelForCausalLM

vocab_size = len(int_to_vocab)
print(f"vocab size: {vocab_size}")

config = GPT2Config(
    vocab_size=vocab_size,
    n_positions=1024,   # The maximum sequence length the model can handle
    n_embd=256,         # The embedding dimension (vector size for each token)
    n_layer=6,          # The number of Transformer layers
    n_head=8            # The number of attention heads
)

def Transformer():
    return AutoModelForCausalLM.from_config(config)

vocab size: 262


In [29]:
model = Transformer() 

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 3
print(f"running {num_epochs} iterations")

model.train()
print("model in training mode")

for epoch in range(num_epochs):
    print(f"training epoch {epoch}")

    for batch in data_loader:
        # Move data to the GPU/CPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        # ---> The Key Idea for Training <---
        # The model's input is the sequence.
        # The 'labels' are the same sequence, shifted one to the left.
        # The model tries to predict token[i+1] using all tokens up to i.
        # The model's loss function handles this shifting internally.
        labels = input_ids.clone()

        # --- DEBUGGING STEP ---
        # Check the min and max values in the batch *before* calling the model.
        # The max value should be less than your vocab_size.
        print(f"vocab size: {model.config.vocab_size}")
        print(f"batch min ID: {input_ids.min()}, max ID: {input_ids.max()}")
        # --- END DEBUGGING STEP ---

        print("performing forward pass...")
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        
        print("calculating loss...")
        loss = outputs.loss

        print("performing backward pass...")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"epoch {epoch} complete, Loss: {loss.item()}")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
