In [None]:
import numpy as np
import os
import pickle
from pathlib import Path


# constants
# Length (in characters) of every window handed to create_sequences by main().
SEQ_LENGTH = 256
# Number of characters shared by two consecutive windows.
OVERLAP = 50
# Distance between the start positions of two consecutive windows.
STRIDE = SEQ_LENGTH - OVERLAP  # 206

# NOTE: despite the name, this is the current working directory of the
# process, not the directory that contains this script.
SCRIPT_DIR = os.getcwd()
# Location of the raw corpus, resolved relative to SCRIPT_DIR.
DATA_PATH = os.path.join(SCRIPT_DIR, "tiny-shakespeare", "input.txt")


# text load
def load_text(filepath):
    """Return the entire contents of the UTF-8 text file at *filepath*."""
    with open(filepath, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

# vocab creation
def create_vocab(text):
    """Build the character-level vocabulary of *text*.

    Every distinct character gets an integer id equal to its position in the
    sorted list of distinct characters. A short summary (total length, number
    of distinct characters, first 50 vocabulary entries) is printed.

    Returns:
        A ``(char_to_idx, idx_to_char, chars)`` tuple: the character -> id
        mapping, the inverse id -> character mapping, and the sorted list of
        distinct characters.
    """
    chars = sorted(set(text))

    # Derive both directions from one enumeration so they can never disagree.
    idx_to_char = dict(enumerate(chars))
    char_to_idx = {symbol: index for index, symbol in idx_to_char.items()}

    n_unique = len(chars)
    print(f'Total characters: {len(text)}')
    print(f'Unique characters: {n_unique}')
    print(f'Characters: {chars[:50]}...')

    return char_to_idx, idx_to_char, chars

def create_sequences(text, char_to_idx, seq_length=256, stride=206):
    """Encode *text* and cut it into fixed-length, possibly overlapping windows.

    Window ``k`` covers encoded positions ``[k * stride, k * stride +
    seq_length)``. Only complete windows are produced, so a tail shorter than
    ``seq_length`` is dropped.

    Args:
        text: The string to encode.
        char_to_idx: Mapping from each character of ``text`` to its integer id.
        seq_length: Number of characters per window; must be positive.
        stride: Distance between consecutive window starts; must be positive.

    Returns:
        An integer array of shape ``(num_windows, seq_length)``. When ``text``
        is shorter than ``seq_length`` the result has shape
        ``(0, seq_length)``, so downstream code can rely on it being 2-D.

    Raises:
        ValueError: If ``seq_length`` or ``stride`` is not positive.
        KeyError: If ``text`` contains a character missing from
            ``char_to_idx``.
    """
    if seq_length <= 0:
        raise ValueError(f"seq_length must be positive, got {seq_length}")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    # dtype=int keeps NumPy's default integer type even for empty input,
    # which would otherwise fall back to float64.
    encoded = np.array([char_to_idx[ch] for ch in text], dtype=int)

    # Start offset of every complete window; empty when the text is too short.
    starts = np.arange(0, len(encoded) - seq_length + 1, stride)

    # Broadcasting (num_windows, 1) + (seq_length,) yields the index of every
    # element of every window, so a single gather builds the whole result.
    offsets = np.arange(seq_length)
    return encoded[starts[:, None] + offsets]

# train/val/test split (no shuffle)
def split_sequences(sequences, train_ratio=0.8, val_ratio=0.1):
    """Cut *sequences* into contiguous train, validation and test parts.

    Order is preserved: the first ``train_ratio`` share becomes the training
    part, the next ``val_ratio`` share the validation part, and whatever is
    left the test part. Cut positions are truncated to whole items.

    Returns:
        A ``(train, val, test)`` tuple of slices of ``sequences``.
    """
    total = len(sequences)
    first_cut = int(total * train_ratio)
    second_cut = int(total * (train_ratio + val_ratio))

    # Consecutive (start, stop) pairs; None leaves that side of the slice open.
    bounds = ((None, first_cut), (first_cut, second_cut), (second_cut, None))
    train, val, test = (sequences[lo:hi] for lo, hi in bounds)
    return train, val, test

def main():
    """Run the whole preprocessing pipeline and write its outputs to disk.

    Reads the corpus at ``DATA_PATH``, builds the character vocabulary, cuts
    the encoded text into windows of ``SEQ_LENGTH`` characters taken every
    ``STRIDE`` characters, and splits them in order (no shuffling) into
    train/val/test. The results are written to the current working directory
    as ``train.npy``, ``val.npy``, ``test.npy``, ``char_to_idx.pkl``,
    ``idx_to_char.pkl`` and ``chars.pkl``. Progress is reported on stdout.
    """
    print("loading text...")
    text = load_text(DATA_PATH)

    print("creating vocabulary...")
    char_to_idx, idx_to_char, chars = create_vocab(text)

    print("creating sequences...")
    # Use the module-level STRIDE rather than recomputing it here, so the
    # constant and the value actually used cannot drift apart.
    sequences = create_sequences(text, char_to_idx, SEQ_LENGTH, STRIDE)
    print(f"Overall sequences: {len(sequences)}")

    print("splitting into train/val/test...")
    train, val, test = split_sequences(sequences)

    print(f"Train: {len(train)}")
    print(f"Val: {len(val)}")
    print(f"Test: {len(test)}")

    print("saving data...")
    for filename, split in (
        ('train.npy', train),
        ('val.npy', val),
        ('test.npy', test),
    ):
        np.save(filename, split)

    for filename, obj in (
        ('char_to_idx.pkl', char_to_idx),
        ('idx_to_char.pkl', idx_to_char),
        ('chars.pkl', chars),
    ):
        with open(filename, 'wb') as f:
            pickle.dump(obj, f)

    # Signal completion explicitly; without it the last message is
    # "saving data..." and a finished run looks the same as a hung one.
    print("Done!")
    

# Run the pipeline only when this module is executed directly, not on import.
if __name__ == "__main__":
    main()

loading text...
creating vocabulary...
Total characters: 1115394
Unique characters: 65
Characters: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']...
creating sequences...
Overall sequences: 5414
splitting into train/val/test...
Train: 4331
Val: 541
Test: 542
saving data...
Done!
