# Setup

In [None]:
!git clone https://github.com/Anson2Leung/ece405-assignment1-basics.git

# Install uv
!curl -LsSf https://astral.sh/uv/install.sh | sh

# Add uv to the system path so you can use it directly
import os
os.environ['PATH'] = f"{os.path.expanduser('~/.cargo/bin')}:{os.environ['PATH']}"

In [None]:
# Switch into the cloned repository so the relative requirements.txt path resolves.
%cd ece405-assignment1-basics

# Install the repository's requirements with uv; --system targets the system
# Python environment rather than a virtual environment.
!uv pip install -r requirements.txt --system
# Additional packages installed with plain pip, keeping numpy below 2.0.
# NOTE(review): presumably these are needed by the tests but are not covered by
# requirements.txt — confirm, and consider pinning versions.
!pip install "numpy<2.0" jaxtyping typeguard langsmith

## Run tests and download data

In [None]:
# Run the tests in tests/test_train_bpe.py with pytest (path is relative to the repository root).
!pytest tests/test_train_bpe.py

# Data files
%cd ..
%mkdir -p data
%cd data

!wget https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-train.txt
!wget https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStoriesV2-GPT4-valid.txt

!wget https://huggingface.co/datasets/stanford-cs336/owt-sample/resolve/main/owt_train.txt.gz
!gunzip owt_train.txt.gz
!wget https://huggingface.co/datasets/stanford-cs336/owt-sample/resolve/main/owt_valid.txt.gz
!gunzip owt_valid.txt.gz

%cd ..
%cd ece405-assignment1-basics

# Problem (train_bpe_tinystories): BPE Training on TinyStories (2 points)

In [None]:
import sys
import os
import json

# 1. Ensure the script is in the path
# The repository location is defined once and every other path is derived from it.
REPO_DIR = "/content/ece405-assignment1-basics"
MODULE_DIR = os.path.join(REPO_DIR, "assignment_files")

# Work from the repository root so the relative output files below land there.
os.chdir(REPO_DIR)

# Guarded so that re-running this cell does not append duplicate sys.path entries.
if MODULE_DIR not in sys.path:
    sys.path.append(MODULE_DIR)
from bpe_tokenizer import train_bpe, save_vocab, save_merges

# 2. Configuration
INPUT_PATH = "/content/data/TinyStoriesV2-GPT4-train.txt"
VOCAB_SIZE = 10000
SPECIAL_TOKENS = ["<|endoftext|>"]

# 3. Execution
# train_bpe is called directly with the configuration above.
# NOTE(review): train_bpe is presumed to print its own timing and memory
# statistics — confirm in bpe_tokenizer.
vocab, merges = train_bpe(INPUT_PATH, VOCAB_SIZE, SPECIAL_TOKENS)

# 4. Serialization
# Both files are written relative to the current working directory (REPO_DIR).
save_vocab(vocab, "vocab.json")
save_merges(merges, "merges.txt")

In [None]:
# Analysis
def get_decoded_string(token_bytes):
    """Decode a token's raw bytes as UTF-8 text.

    Bytes that do not form valid UTF-8 are silently dropped
    (``errors='ignore'``), so the result may contain fewer characters
    than the input has bytes.
    """
    decoded_text = token_bytes.decode(encoding='utf-8', errors='ignore')
    return decoded_text

# Pick the vocabulary entry whose decoded text has the most characters.
# Length is measured on the decoded string, not on the raw bytes; on a tie,
# max() keeps the first entry in the vocabulary's iteration order.
longest_token_id, longest_token_bytes = max(
    vocab.items(),
    key=lambda entry: len(get_decoded_string(entry[1])),
)
longest_text = get_decoded_string(longest_token_bytes)

# Emit the four-line report in a single call, one item per line.
print(
    "--- Analysis Results ---",
    f"Longest Token ID: {longest_token_id}",
    f"Longest Token Text: '{longest_text}'",
    f"Character Length: {len(longest_text)}",
    sep="\n",
)