In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional

In [2]:
class RmsNorm(nn.Module):
    """Root-mean-square normalisation with a learned per-feature gain.

    Every vector along the last dimension is multiplied by
    ``1 / sqrt(mean(x**2) + epsilon)`` and then element-wise by ``gamma``.
    No mean is subtracted and no bias is added.

    Args:
        dim: Length of the last (feature) dimension; ``gamma`` has this size
            and is initialised to ones.
        epsilon: Constant added to the mean square before the reciprocal
            square root, so an all-zero vector does not divide by zero.
    """

    def __init__(self, dim: int, epsilon=1e-6):
        super().__init__()
        self.epsilon = epsilon
        self.gamma = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor):
        """Normalise ``x`` over its last dimension; the output has the shape of ``x``."""
        # Callers in this notebook pass x as (B, T, D).
        mean_square = torch.mean(x.pow(2), dim=-1, keepdim=True)
        # Reciprocal RMS, so the normalisation below is a multiply, not a divide.
        inv_rms = torch.rsqrt(mean_square + self.epsilon)
        normalised = x * inv_rms
        return normalised * self.gamma

In [3]:
class SwishFFN(nn.Module):
    """Position-wise feed-forward network: Linear -> SiLU -> Linear -> Dropout.

    This is a plain (un-gated) two-layer MLP with a SiLU/Swish activation.
    Both linear layers are bias-free.

    Args:
        d_model: Input and output feature size.
        hidden_times: Multiplier applied to ``d_model`` to get the hidden
            width; the product is truncated with ``int()``.
        dropout: Dropout probability applied to the output.
    """

    def __init__(self, d_model: int, hidden_times: int = 3, dropout: float = 0.0):
        super().__init__()
        width = int(d_model * hidden_times)
        expand = nn.Linear(d_model, width, bias=False)
        contract = nn.Linear(width, d_model, bias=False)
        # Positions 0 and 2 hold the weights, so the state-dict keys are
        # "net.0.weight" and "net.2.weight".
        self.net = nn.Sequential(expand, nn.SiLU(), contract, nn.Dropout(dropout))

    def forward(self, x):
        """Apply the feed-forward stack to the last dimension of ``x``."""
        out = self.net(x)
        return out

In [None]:
from tokenizer import byteTokenizer
from utils import create_Dataloader

In [6]:
import torch
from torch import nn

class KVCache(nn.Module):
    """Pre-allocated key/value buffer for incremental attention decoding.

    Two zero-filled buffers of shape ``(batch_size, num_heads, max_seq_len,
    head_dim)`` are registered together with a scalar write position. Every
    ``update`` call appends at the current position and advances it, so a
    single instance can only serve ONE attention layer: sharing it between
    layers would interleave their keys/values and advance the position once
    per layer.

    Args:
        batch_size: Size of the leading (batch) dimension of the buffers.
        max_seq_len: Maximum number of positions the cache can hold.
        num_heads: Number of attention heads.
        head_dim: Per-head feature size.
        dtype: Dtype of the key/value buffers.
    """

    def __init__(self, batch_size, max_seq_len, num_heads, head_dim, dtype=torch.float32):
        super().__init__()

        # Annotations only: tell linters/IDEs that the buffers registered
        # below are tensors.
        self.k_cache: torch.Tensor
        self.v_cache: torch.Tensor
        self.cache_pos: torch.Tensor

        # Pre-allocate the empty cache.
        self.register_buffer("k_cache", torch.zeros(batch_size, num_heads, max_seq_len, head_dim, dtype=dtype))
        self.register_buffer("v_cache", torch.zeros(batch_size, num_heads, max_seq_len, head_dim, dtype=dtype))

        # One scalar position for the whole batch: every sequence in the
        # batch is written at the same offset.
        self.register_buffer("cache_pos", torch.tensor(0, dtype=torch.long))

    def update(self, k_new: torch.Tensor, v_new: torch.Tensor):
        """Append new keys/values and return everything cached so far.

        Args:
            k_new: New keys, shape ``(B, H, T_new, D)``.
            v_new: New values, same shape as ``k_new``.

        Returns:
            ``(k, v)`` views of the buffers covering positions
            ``[0, old_position + T_new)``; the unused tail is never returned.

        Raises:
            ValueError: If the write would run past ``max_seq_len``. Without
                this check a single-token write lands on an empty slice and
                is silently dropped while the position keeps growing.
        """
        seq_len_new = k_new.size(2)

        # Slice bounds must be Python ints, hence .item().
        start = int(self.cache_pos.item())
        end = start + seq_len_new

        capacity = self.k_cache.size(2)
        if end > capacity:
            raise ValueError(
                f"KVCache overflow: writing {seq_len_new} position(s) at offset {start} "
                f"exceeds the cache capacity of {capacity}"
            )

        self.k_cache[:, :, start:end, :] = k_new
        self.v_cache[:, :, start:end, :] = v_new
        self.cache_pos += seq_len_new

        # Return only the filled prefix so attention never sees the zero padding.
        return (
            self.k_cache[:, :, :end, :],
            self.v_cache[:, :, :end, :]
        )

    def reset(self):
        """Rewind the write position to 0; buffer contents are left in place and get overwritten."""
        self.cache_pos.zero_()

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional

# Depends on RmsNorm, SwishFFN and KVCache defined in the cells above.

class Decoder_Multi_Head_Attention(nn.Module):
    """Causal multi-head self-attention with an optional key/value cache.

    Queries, keys and values come from one fused bias-free projection; the
    attention itself is ``F.scaled_dot_product_attention``.

    Args:
        n_head: Number of attention heads; must divide ``d_model``.
        d_model: Model (embedding) width.
        dropout: Attention dropout probability, applied in training mode only.
    """

    def __init__(self, n_head: int, d_model: int, dropout: float = 0.0):
        super().__init__()
        assert d_model % n_head == 0, "d_model must be divisible by n_head"
        self.n_head = n_head
        self.d_head = d_model // n_head
        self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
        self.proj = nn.Linear(d_model, d_model, bias=False)
        # Only the probability ``p`` of this module is read (in forward).
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, is_inference: bool = False, kv_cache: Optional['KVCache'] = None) -> torch.Tensor:
        """Attend over ``x`` (and over cached history when a cache is supplied).

        Args:
            x: Input of shape ``(B, T, C)`` with ``C == d_model``.
            is_inference: When true and ``kv_cache`` is given, the new
                keys/values are appended to the cache and attention runs over
                the full cached history.
            kv_cache: Object exposing ``update(k, v) -> (k_all, v_all)``. It
                must belong to this layer alone.

        Returns:
            Tensor of shape ``(B, T, C)``.
        """
        B, T, C = x.shape
        qkv = self.qkv(x).view(B, T, 3, self.n_head, self.d_head)

        # Split into q, k, v and move heads in front of time: (B, n_head, T, d_head).
        q, k, v = (t.transpose(1, 2) for t in qkv.unbind(dim=2))

        if is_inference and kv_cache is not None:
            # Append the new keys/values and get the whole history back.
            k, v = kv_cache.update(k, v)

        # Number of key positions; larger than T when the cache already held history.
        S = k.size(2)

        # Masking:
        #   T == 1         -> the single query may see every key: no mask.
        #   T > 1, S == T  -> ordinary square causal mask (training / first prefill).
        #   T > 1, S > T   -> a chunk appended to existing history. SDPA's
        #                     ``is_causal`` mask is aligned to the top-left
        #                     corner, which would hide the history from the new
        #                     queries, so build a bottom-right aligned mask:
        #                     query i may see keys j <= i + (S - T).
        attn_mask = None
        use_causal_mask = False
        if T > 1:
            if S == T:
                use_causal_mask = True
            else:
                attn_mask = torch.ones(T, S, dtype=torch.bool, device=q.device).tril(diagonal=S - T)

        # Attention dropout is active in training mode only.
        dropout_p = self.dropout.p if self.training else 0.0

        out = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attn_mask,
            dropout_p=dropout_p,
            is_causal=use_causal_mask
        )

        # Merge the heads back: (B, n_head, T, d_head) -> (B, T, C).
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        out = self.proj(out)
        return out

class Block(nn.Module):
    """Pre-norm residual transformer block: attention sub-layer, then feed-forward.

    Each sub-layer receives an RMS-normalised copy of the stream and its
    output is added back onto the un-normalised stream.

    Args:
        d_model: Model width.
        n_head: Number of attention heads.
        dropout: Dropout probability handed to both the attention and the
            feed-forward sub-layers.
    """

    def __init__(self, d_model: int, n_head: int, dropout: float = 0.2) -> None:
        super().__init__()
        # Attribute names and creation order are kept: they determine the
        # state-dict keys and the parameter initialisation order.
        self.lr1Norm = RmsNorm(d_model)
        self.multiHead_attention = Decoder_Multi_Head_Attention(n_head, d_model, dropout)
        self.lr2Norm = RmsNorm(d_model)
        self.ffn = SwishFFN(d_model, 4, dropout)

    def forward(self, x: torch.Tensor, is_inference: bool = False, kv_cache: Optional['KVCache'] = None):
        """Run both residual sub-layers; the inference flag and cache go to attention only."""
        attn_out = self.multiHead_attention(
            self.lr1Norm(x),
            is_inference=is_inference,
            kv_cache=kv_cache,
        )
        after_attn = x + attn_out
        ffn_out = self.ffn(self.lr2Norm(after_attn))
        return after_attn + ffn_out

In [9]:
class TinyGPT(nn.Module):
    """Small decoder-only transformer language model with learned absolute positions.

    Token and position embeddings are summed, passed through ``n_block``
    ``Block`` layers, layer-normalised and projected to vocabulary logits.

    Args:
        vocab_size: Number of token ids.
        context_length: Maximum sequence length (size of the position table).
        n_block: Number of transformer blocks.
        n_head: Attention heads per block.
        d_model: Model width.
        dropout: Dropout probability passed to every block.
    """

    def __init__(
        self,
        vocab_size: int,
        context_length: int,
        n_block: int = 4,
        n_head: int = 4,
        d_model: int = 256,
        dropout: float = 0.0
    ):
        super().__init__()
        # Attribute names (including the "embedings" spelling) and creation
        # order are kept: they define the checkpoint's state-dict keys.
        self.embedings = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model
        self.context_length = context_length
        self.n_head = n_head

        self.pos_embedings = nn.Embedding(context_length, d_model)
        # ModuleList (not Sequential) so forward can pass extra arguments to each block.
        self.blocks = nn.ModuleList([Block(d_model, n_head, dropout) for _ in range(n_block)])
        self.out_head = nn.Linear(d_model, vocab_size)
        self.lrNorm = nn.LayerNorm(d_model)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear/Embedding weights from N(0, 0.02) and Linear biases to zero."""
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)

    def _per_layer_caches(self, kv_cache, is_inference: bool):
        """Expand the ``kv_cache`` argument of ``forward`` into one entry per block."""
        n_layers = len(self.blocks)
        if kv_cache is None:
            return [None] * n_layers
        if isinstance(kv_cache, (list, tuple)):
            if len(kv_cache) != n_layers:
                raise ValueError(
                    f"expected {n_layers} KV caches (one per block), got {len(kv_cache)}"
                )
            return list(kv_cache)
        # A single cache object. Each block appends to the cache it is given
        # and advances its position, so sharing one cache between blocks would
        # mix their keys/values. That is only safe with exactly one block.
        if is_inference and n_layers > 1:
            raise ValueError(
                "a single KVCache cannot be shared by several blocks; "
                "pass a list with one KVCache per block"
            )
        return [kv_cache] * n_layers

    def forward(
        self,
        idx: torch.Tensor,
        targets: Optional[torch.Tensor] = None,
        kv_cache: Optional['KVCache'] = None,
        is_inference: bool = False,
        start_pos: int = 0
    ):
        """Compute logits, and the cross-entropy loss when ``targets`` is given.

        Args:
            idx: Token ids of shape ``(B, T)``. If ``T`` exceeds
                ``context_length`` only the last ``context_length`` tokens
                are used (``targets`` is cropped the same way).
            targets: Optional target ids aligned with ``idx``.
            kv_cache: ``None``, or a list/tuple holding one ``KVCache`` per
                block. A single ``KVCache`` is accepted only for a one-block
                model.
            is_inference: Forwarded to every block; enables cache use.
            start_pos: Absolute position of ``idx[:, 0]``, used to index the
                position embeddings during incremental decoding.

        Returns:
            ``(logits, loss)`` with logits of shape ``(B, T, vocab_size)`` and
            ``loss`` either a scalar tensor or ``None``.

        Raises:
            ValueError: If the positions ``[start_pos, start_pos + T)`` do not
                fit in ``context_length`` or the cache list has the wrong length.
        """
        B, T = idx.shape

        # Keep only the most recent context_length tokens (and matching targets).
        if T > self.context_length:
            idx = idx[:, -self.context_length:]
            if targets is not None:
                targets = targets[:, -self.context_length:]
            T = idx.size(1)

        if start_pos < 0 or start_pos + T > self.context_length:
            raise ValueError(
                f"positions [{start_pos}, {start_pos + T}) do not fit in "
                f"context_length={self.context_length}"
            )

        # Absolute positions of the tokens in this call: (1, T).
        pos = torch.arange(start_pos, start_pos + T, device=idx.device).unsqueeze(0)

        x = self.embedings(idx) + self.pos_embedings(pos)

        layer_caches = self._per_layer_caches(kv_cache, is_inference)
        for block, layer_cache in zip(self.blocks, layer_caches):
            x = block(x, is_inference=is_inference, kv_cache=layer_cache)

        x = self.lrNorm(x)
        logits = self.out_head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.flatten(0, 1), targets.flatten())

        return logits, loss

    def generate(
        self,
        prompt: str,
        tokenizer,
        max_new_tokens: int = 200,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
        stream: bool = False
    ):
        """Sample a continuation of ``prompt`` using per-layer KV caches.

        This method is deliberately NOT a generator function: a function that
        contains ``yield`` always returns a generator, so a ``stream=False``
        call could never hand back the decoded string.

        Args:
            prompt: Text to continue.
            tokenizer: Object with ``encode(str) -> list[int]`` and
                ``decode(list[int]) -> str``.
            max_new_tokens: Upper bound on sampled tokens. Generation also
                stops once ``context_length`` positions are in use.
            temperature: Logits are divided by ``max(temperature, 1e-6)``.
            top_k: If a positive int, sample only among the ``top_k`` most
                likely tokens.
            stream: If true, return a generator yielding token ids one by one.

        Returns:
            With ``stream=False``: the decoded prompt plus continuation (str).
            With ``stream=True``: a generator of newly sampled token ids (int).

        Raises:
            ValueError: If the prompt encodes to zero tokens.
        """
        prompt_ids = list(tokenizer.encode(prompt))
        if not prompt_ids:
            raise ValueError("prompt must encode to at least one token")

        token_stream = self._generate_token_ids(prompt_ids, max_new_tokens, temperature, top_k)
        if stream:
            return token_stream
        return tokenizer.decode(prompt_ids + list(token_stream))

    @torch.no_grad()
    def _generate_token_ids(self, prompt_ids, max_new_tokens, temperature, top_k):
        """Generator behind ``generate``: prefill the caches, then decode token by token."""
        if max_new_tokens <= 0:
            return

        self.eval()
        ref_param = next(self.parameters())

        # Feed at most context_length prompt tokens, matching the crop in forward.
        idx = torch.tensor(
            [prompt_ids[-self.context_length:]], dtype=torch.long, device=ref_param.device
        )

        # 1. One independent cache per block.
        head_dim = self.d_model // self.n_head
        caches = [
            KVCache(idx.size(0), self.context_length, self.n_head, head_dim, dtype=ref_param.dtype).to(ref_param.device)
            for _ in self.blocks
        ]

        # 2. Prefill: run the whole prompt once; this fills every cache.
        logits, _ = self(idx, is_inference=True, kv_cache=caches, start_pos=0)
        next_token = self._sample_next_token(logits, temperature, top_k)
        yield next_token.item()

        # 3. Decode: feed only the newest token; the caches hold the history.
        prompt_len = idx.size(1)
        for step in range(max_new_tokens - 1):
            # Position of the token about to be fed (prompt + tokens generated so far).
            current_pos = prompt_len + step
            if current_pos >= self.context_length:
                break

            logits, _ = self(next_token, is_inference=True, kv_cache=caches, start_pos=current_pos)
            next_token = self._sample_next_token(logits, temperature, top_k)
            yield next_token.item()

    def _sample_next_token(self, logits, temperature, top_k):
        """Sample one token id per batch row from the logits of the last position."""
        last_logits = logits[:, -1, :] / max(temperature, 1e-6)
        probs = self._sample_logits(last_logits, top_k)
        return torch.multinomial(probs, num_samples=1)

    def _sample_logits(self, logits, top_k):
        """Turn ``(B, vocab)`` logits into probabilities, optionally top-k filtered.

        Note: when ``top_k`` is active the filter writes ``-inf`` into
        ``logits`` in place.
        """
        if top_k is not None and top_k > 0:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            # Everything below the k-th largest logit gets zero probability.
            logits[logits < v[:, [-1]]] = -float('Inf')
        return torch.softmax(logits, dim=-1)

In [10]:
# Location of the plain-text corpus. Later cells alias it as DATA_PATH for
# create_Dataloader and pass it to byteTokenizer.
# NOTE(review): hard-coded absolute path — adjust when running elsewhere.
data_path = "/content/Ramayan.txt"

In [11]:
import torch
import torch.optim as optim

from tqdm.auto import tqdm
import time
from pathlib import Path

# Data / device settings
DATA_PATH = data_path  # corpus file handed to create_Dataloader
CONTEXT_LENGTH = 512  # sequence length for the data loader and the model's position table
BATCH_SIZE = 64
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Training settings
PRECISION = "float16"  # name of a torch dtype; resolved with getattr(torch, PRECISION) for autocast
MAX_ITERS = 1000  # number of optimizer steps
EVAL_ITERS = 50  # batches per evaluation; train() also uses it as the evaluation interval
LOG_INTERVAL = 10  # progress-bar stats are refreshed every this many steps
GRAD_CLIP = 1.0  # max gradient norm; clipping is skipped unless > 0
LEARNING_RATE = 6e-3  # peak learning rate, reached at the end of warmup
WARMUP_ITERS = 150  # steps of linear warmup
MIN_LR = 8e-5  # floor of the cosine decay
WEIGHT_DECAY = 0.1

# Model config
N_LAYER = 4
N_HEAD = 6
D_MODEL = 252  # must be divisible by N_HEAD (252 / 6 = 42 per head)
DROPOUT = 0.1

# runs/models path
CKPT_DIR = Path("/content/runs")
# Created without parents=True, so the parent directory must already exist.
CKPT_DIR.mkdir(exist_ok=True)

def get_lr(it: int):
    """Learning rate for step ``it``: linear warmup, then cosine decay to ``MIN_LR``.

    Args:
        it: Zero-based training step.

    Returns:
        A Python ``float`` in every phase:
        * ``it < WARMUP_ITERS``: ``LEARNING_RATE * (it + 1) / WARMUP_ITERS``;
        * ``it >= MAX_ITERS``: ``MIN_LR``;
        * otherwise a half-cosine from ``LEARNING_RATE`` down to ``MIN_LR``.
    """
    import math  # local import: the cell's import list does not include math

    if it < WARMUP_ITERS:
        return LEARNING_RATE * (it + 1) / WARMUP_ITERS
    # ">=" gives the same value as the formula at it == MAX_ITERS (cosine
    # term is 0) and avoids a zero division when WARMUP_ITERS == MAX_ITERS.
    if it >= MAX_ITERS:
        return MIN_LR
    decay_ratio = (it - WARMUP_ITERS) / (MAX_ITERS - WARMUP_ITERS)
    # math.cos keeps the result a plain float; the tensor-based version
    # returned a 0-dim torch.Tensor for this branch only.
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return MIN_LR + coeff * (LEARNING_RATE - MIN_LR)


@torch.no_grad()
def estimate_loss(model, train_loader, val_loader, device, eval_iters=EVAL_ITERS):
    """Average the model loss over ``eval_iters`` batches of each split.

    The model is switched to eval mode for the measurement and back to train
    mode before returning.

    Args:
        model: Callable as ``model(x, y) -> (logits, loss)``.
        train_loader: Iterable yielding ``(x, y)`` training batches.
        val_loader: Iterable yielding ``(x, y)`` validation batches.
        device: Device the batches are moved to; its type also selects the
            autocast backend.
        eval_iters: Number of batches averaged per split.

    Returns:
        ``{"train": mean_loss, "val": mean_loss}`` as Python floats.

    Raises:
        ValueError: If a loader yields no batches at all.
    """
    device_type = torch.device(device).type
    # Mixed precision only on CUDA; elsewhere the forward pass stays in full precision.
    use_amp = device_type == "cuda"

    model.eval()
    losses = {}
    for split, loader in [("train", train_loader), ("val", val_loader)]:
        # One iterator per split, so successive steps see successive batches.
        # Calling next(iter(loader)) every step rebuilds the iterator each time
        # and always returns the first batch of an unshuffled loader.
        batches = iter(loader)
        total_loss = 0.0
        for _ in range(eval_iters):
            try:
                x, y = next(batches)
            except StopIteration:
                # Loader exhausted before eval_iters batches: start over.
                batches = iter(loader)
                try:
                    x, y = next(batches)
                except StopIteration:
                    raise ValueError(f"{split} loader yielded no batches") from None
            x, y = x.to(device), y.to(device)
            with torch.autocast(device_type=device_type, dtype=getattr(torch, PRECISION), enabled=use_amp):
                _, loss = model(x, y)
            total_loss += loss.item()
        losses[split] = total_loss / eval_iters
    model.train()
    return losses


def train():
    """Train a ``TinyGPT`` on ``DATA_PATH`` using the module-level configuration.

    Steps: build the data loaders, create (and if possible ``torch.compile``)
    the model, then run ``MAX_ITERS`` AdamW steps with a warmup + cosine
    learning-rate schedule, CUDA mixed precision and gradient clipping.
    Every ``EVAL_ITERS`` steps the train/val loss is estimated and the best
    model so far is written to ``CKPT_DIR / "best_model.pt"``; every 200 steps
    a bare state dict is written as well.

    Returns:
        None. Progress is printed and checkpoints are written to ``CKPT_DIR``.
    """
    device_type = torch.device(DEVICE).type
    # Autocast and loss scaling are CUDA-only; on CPU training runs in full precision.
    use_amp = device_type == "cuda"
    amp_dtype = getattr(torch, PRECISION)
    checkpoint_interval = 200

    print(f"Using device: {DEVICE} | Precision: {PRECISION}")

    # Create data loaders
    train_loader, val_loader, dataset = create_Dataloader(
        path=DATA_PATH,
        batch_size=BATCH_SIZE,
        context_length=CONTEXT_LENGTH,
        train=0.9  # 90% train, 10% val
    )

    # The vocabulary size comes from the tokenizer attached to the dataset.
    vocab_size = dataset.tokenizer.vocab_size
    print(f"Vocab size: {vocab_size:,}")
    print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

    # Initialize model
    model = TinyGPT(
        vocab_size=vocab_size,
        context_length=CONTEXT_LENGTH,
        n_block=N_LAYER,
        n_head=N_HEAD,
        d_model=D_MODEL,
        dropout=DROPOUT
    ).to(DEVICE)

    # Compile when available. The compiled wrapper's state dict keys carry an
    # "_orig_mod." prefix, which load_best_model strips again.
    if hasattr(torch, "compile"):
        print("Compiling model..")
        model = torch.compile(model)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        betas=(0.9, 0.95),
        weight_decay=WEIGHT_DECAY
    )
    # Loss scaling is only needed for float16 autocast.
    scaler = torch.GradScaler(enabled=(use_amp and PRECISION == "float16"))

    # Reported in millions; switch to 1e9 / "B" for much larger models.
    print(f"Total parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")
    print("Starting training...\n")

    best_val_loss = float("inf")
    start_time = time.time()
    # Step index of the previous throughput report (-1: nothing reported yet).
    last_log_iter = -1

    data_iter = iter(train_loader)

    pbar = tqdm(range(MAX_ITERS), desc="Training")
    for iter_num in pbar:
        # Learning rate schedule. float() guards against a tensor-valued
        # schedule being written into the optimizer's param groups.
        lr = float(get_lr(iter_num))
        for g in optimizer.param_groups:
            g["lr"] = lr

        # Cycle through the training loader indefinitely.
        try:
            x, y = next(data_iter)
        except StopIteration:
            data_iter = iter(train_loader)
            x, y = next(data_iter)

        x, y = x.to(DEVICE), y.to(DEVICE)

        # Forward + backward with AMP
        with torch.autocast(device_type=device_type, dtype=amp_dtype, enabled=use_amp):
            _, loss = model(x, y)

        scaler.scale(loss).backward()

        if GRAD_CLIP > 0.0:
            # Gradients must be unscaled before their norm is clipped.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

        # Logging
        if iter_num % LOG_INTERVAL == 0:
            # Count the steps actually run since the last report: the first
            # report covers 1 step, not LOG_INTERVAL.
            steps_since_log = iter_num - last_log_iter
            elapsed = time.time() - start_time + 1e-8
            tokens_per_sec = (BATCH_SIZE * CONTEXT_LENGTH * steps_since_log) / elapsed
            last_log_iter = iter_num
            start_time = time.time()
            pbar.set_postfix({
                "loss": f"{loss.item():.4f}",
                "lr": f"{lr:.2e}",
                "tok/s": f"{tokens_per_sec:.0f}"
            })

        # Evaluation (EVAL_ITERS doubles as the evaluation interval here)
        if iter_num % EVAL_ITERS == 0 or iter_num == MAX_ITERS - 1:
            losses = estimate_loss(model, train_loader, val_loader, DEVICE)
            print(f"\nStep {iter_num:,} | "
                  f"Train: {losses['train']:.4f} | "
                  f"Val: {losses['val']:.4f} | "
                  f"LR: {lr:.2e}")

            # Save best model together with everything needed to rebuild it.
            if losses["val"] < best_val_loss:
                best_val_loss = losses["val"]
                torch.save({
                    "iter": iter_num,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scaler_state_dict": scaler.state_dict(),
                    "val_loss": best_val_loss,
                    "config": {
                        "vocab_size": vocab_size,
                        "context_length": CONTEXT_LENGTH,
                        "n_block": N_LAYER,
                        "n_head": N_HEAD,
                        "d_model": D_MODEL,
                    }
                }, CKPT_DIR / "best_model.pt")
                print("New best model saved!")

        # Periodic checkpoint (weights only)
        if iter_num % checkpoint_interval == 0 and iter_num > 0:
            torch.save(model.state_dict(), CKPT_DIR / f"checkpoint_iter{iter_num}.pt")

    print(f"\nTraining finished! Best validation loss: {best_val_loss:.4f}")


In [12]:
# Run the full training loop; the best checkpoint is written to CKPT_DIR / "best_model.pt".
train()

Using device: cuda | Precision: float16
Unique characters: ['\n', ' ', '!', '"', '&', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '}', 'Á', 'Æ', 'É', 'Ú', 'Ü', 'à', 'á', 'â', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ô', 'ö', 'ù', 'ú', 'û', 'ü', 'ń', 'Œ', 'œ', 'Ś', 'ś', 'ǹ', 'Α', 'Κ', 'Ο', 'Π', 'Σ', 'ά', 'έ', 'ή', 'ί', 'α', 'β', 'γ', 'δ', 'ε', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ω', 'ό', 'ύ', 'ώ', 'ה', 'ו', 'י', 'ע', 'צ', 'ḍ', 'ṅ', 'ṇ', 'ṛ', 'ṣ', 'ṭ', 'ἀ', 'ἐ', 'Ἐ', 'Ἕ', 'ἡ', 'ἤ', 'ἰ', 'ἵ', 'Ἰ', 'Ἴ', 'Ἵ', 'ὁ', 'Ὁ', 'Ὅ', 'ὐ', 'ὑ', 'ὰ', 'ὴ', 'ὺ', 'ᾶ', 'ῶ', '—', '‘', '’', '“', '”', '…', '\uf

Training:   0%|          | 0/1000 [00:00<?, ?it/s]

  return torch._C._get_cublas_allow_tf32()
W1124 11:00:35.376000 289 torch/_inductor/utils.py:1558] [0/0] Not enough SMs to use max_autotune_gemm mode



Step 0 | Train: 5.0009 | Val: 5.0043 | LR: 4.00e-05
New best model saved!

Step 50 | Train: 2.5208 | Val: 2.5350 | LR: 2.04e-03
New best model saved!

Step 100 | Train: 2.4681 | Val: 2.4840 | LR: 4.04e-03
New best model saved!

Step 150 | Train: 2.5007 | Val: 2.5190 | LR: 6.00e-03

Step 200 | Train: 2.4248 | Val: 2.4383 | LR: 5.95e-03
New best model saved!

Step 250 | Train: 2.3818 | Val: 2.3944 | LR: 5.80e-03
New best model saved!

Step 300 | Train: 2.0139 | Val: 2.0411 | LR: 5.56e-03
New best model saved!

Step 350 | Train: 1.7902 | Val: 1.8261 | LR: 5.23e-03
New best model saved!

Step 400 | Train: 1.6421 | Val: 1.6944 | LR: 4.82e-03
New best model saved!

Step 450 | Train: 1.5699 | Val: 1.6111 | LR: 4.36e-03
New best model saved!

Step 500 | Train: 1.4955 | Val: 1.5371 | LR: 3.85e-03
New best model saved!

Step 550 | Train: 1.4494 | Val: 1.4940 | LR: 3.31e-03
New best model saved!

Step 600 | Train: 1.4107 | Val: 1.4511 | LR: 2.77e-03
New best model saved!

Step 650 | Train: 1.382

In [15]:
import torch
from pathlib import Path

# Checkpoint written by train() (CKPT_DIR / "best_model.pt").
CKPT_PATH = Path("/content/runs/best_model.pt")
# Re-declared here; same expression as in the training configuration cell.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Generation Settings
PROMPT = "sumitra"  # seed text for the sample
MAX_NEW_TOKENS = 2000  # number of tokens to sample
TEMPERATURE = 0.8  # logits are divided by this before softmax
TOP_K = 50  # sample only among the 50 highest-scoring tokens

def load_best_model(ckpt_path, device):
    """Rebuild a ``TinyGPT`` from a checkpoint and return it in eval mode.

    The architecture comes from the checkpoint's ``'config'`` entry and the
    weights from ``'model_state_dict'``. Keys saved from a ``torch.compile``
    wrapper start with ``'_orig_mod.'``; that prefix is removed before loading.

    Args:
        ckpt_path: ``pathlib.Path`` of the checkpoint file.
        device: Device used both as ``map_location`` and as the model's device.

    Returns:
        The loaded model, moved to ``device`` and set to eval mode.

    Raises:
        FileNotFoundError: If ``ckpt_path`` does not exist.
    """
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found at {ckpt_path}")

    checkpoint = torch.load(ckpt_path, map_location=device)
    cfg = checkpoint['config']

    # Dropout is forced to 0.0: the model is only used for inference here.
    model = TinyGPT(
        vocab_size=cfg['vocab_size'],
        context_length=cfg['context_length'],
        n_block=cfg['n_block'],
        n_head=cfg['n_head'],
        d_model=cfg['d_model'],
        dropout=0.0
    )

    # Rename compiled-module keys in place (the dict object itself is kept).
    state_dict = checkpoint['model_state_dict']
    compile_prefix = '_orig_mod.'
    prefixed_keys = [key for key in state_dict if key.startswith(compile_prefix)]
    for key in prefixed_keys:
        state_dict[key[len(compile_prefix):]] = state_dict.pop(key)

    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model

@torch.no_grad()
def stream_generate(model, tokenizer, prompt, max_new_tokens, temperature=1.0, top_k=None):
    """Sample ``max_new_tokens`` tokens and print the text as it is produced.

    No KV cache is used: every step re-runs the model on the (cropped)
    sequence so far. After each step the whole sequence is decoded again and
    only the not-yet-printed suffix is written.

    Args:
        model: Callable as ``model(idx) -> (logits, _)`` with a
            ``context_length`` attribute and at least one parameter (its
            device decides where the input tensor lives).
        tokenizer: Object with ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        prompt: Seed text.
        max_new_tokens: Number of tokens to sample.
        temperature: Logits are divided by ``max(temperature, 1e-6)``.
        top_k: If a positive int, sample only among the ``top_k`` most likely tokens.

    Returns:
        None; the text goes to stdout.

    Raises:
        ValueError: If the prompt encodes to zero tokens.
    """
    # 1. Encode and setup. Use the model's own device rather than a global,
    #    so the function works wherever the model was placed.
    device = next(model.parameters()).device
    prompt_ids = tokenizer.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError("prompt must encode to at least one token")
    idx = torch.tensor([prompt_ids], dtype=torch.long, device=device)

    print(f"Prompt: {prompt}", end="", flush=True)

    # Number of characters of the decoded sequence already shown.
    current_text = tokenizer.decode(idx[0].tolist())
    len_printed = len(current_text)

    # Loop-invariant temperature, clamped to avoid division by zero.
    temp = max(temperature, 1e-6)

    # 2. Generation loop
    for _ in range(max_new_tokens):
        # The model only sees the last context_length tokens.
        idx_cond = idx if idx.size(1) <= model.context_length else idx[:, -model.context_length:]

        logits, _ = model(idx_cond)
        logits = logits[:, -1, :] / temp

        # Top-K filter: everything below the k-th largest logit gets probability 0.
        if top_k is not None and top_k > 0:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')

        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        idx = torch.cat([idx, next_token], dim=1)

        # Decode everything and print only the new suffix.
        full_text = tokenizer.decode(idx[0].tolist())
        new_text = full_text[len_printed:]

        print(new_text, end="", flush=True)
        len_printed += len(new_text)

    print("\n\n--- End of Generation ---")

# Driver: rebuild the best checkpointed model, construct the tokenizer from the
# training corpus path, and stream a sample to stdout.
# NOTE(review): the broad `except Exception` prints only the message, so the
# traceback is lost — consider letting errors propagate while debugging.
try:
    model = load_best_model(CKPT_PATH, DEVICE)
    # byteTokenizer is given the corpus path (data_path from an earlier cell).
    tokenizer = byteTokenizer(data_path)
    # Generation
    print(f"\n--- Streaming Generation (Temp: {TEMPERATURE}) ---\n")
    stream_generate(
        model=model,
        tokenizer=tokenizer,  # must provide encode() and decode()
        prompt=PROMPT,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_k=TOP_K
    )

except Exception as e:
    print(f"An error occurred: {e}")

Unique characters: ['\n', ' ', '!', '"', '&', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '}', 'Á', 'Æ', 'É', 'Ú', 'Ü', 'à', 'á', 'â', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ô', 'ö', 'ù', 'ú', 'û', 'ü', 'ń', 'Œ', 'œ', 'Ś', 'ś', 'ǹ', 'Α', 'Κ', 'Ο', 'Π', 'Σ', 'ά', 'έ', 'ή', 'ί', 'α', 'β', 'γ', 'δ', 'ε', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ω', 'ό', 'ύ', 'ώ', 'ה', 'ו', 'י', 'ע', 'צ', 'ḍ', 'ṅ', 'ṇ', 'ṛ', 'ṣ', 'ṭ', 'ἀ', 'ἐ', 'Ἐ', 'Ἕ', 'ἡ', 'ἤ', 'ἰ', 'ἵ', 'Ἰ', 'Ἴ', 'Ἵ', 'ὁ', 'Ὁ', 'Ὅ', 'ὐ', 'ὑ', 'ὰ', 'ὴ', 'ὺ', 'ᾶ', 'ῶ', '—', '‘', '’', '“', '”', '…', '\ufeff']
Length of unique chars: 188

--- S