
# Urdu Chatbot From Scratch (No Pretrained Models) — Colab Notebook

This notebook trains a **small Transformer encoder–decoder built from scratch** (pure PyTorch) for an Urdu chatbot.  
We synthesize dialogue pairs from your dataset by pairing consecutive sentences: `(utterance_t → utterance_{t+1})`.

**What you get:**
- Urdu text normalization/tokenization (no external tokenizers)
- Vocabulary built **only** from your dataset
- Transformer (multi-head attention, positional encoding, masking) **from scratch**
- Training with teacher forcing
- Evaluation: BLEU, ROUGE-L, chrF, Perplexity
- Greedy & beam search decoding
- Checkpoint saved by best **validation BLEU**

> ⚠️ **No pretrained models or embeddings are used.** Everything is trained from scratch.


In [None]:

# If running on Colab, you likely already have torch & numpy.
# Optional: upgrade pip packages if you get version warnings.
# !pip -q install torch pandas numpy

import os, math, re, unicodedata, random, json
from collections import Counter
from typing import List, Tuple
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# Seed the Python, NumPy and PyTorch random generators with one shared value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)



## 1) Load your dataset (`.tsv`)
Choose **one** of the following:
- **A. Upload from your computer** (recommended for this assignment)
- **B. Load from Google Drive** (if you placed the file there)


In [None]:

# (A) Upload from your computer
# After uploading, set DATA_PATH to the exact file name displayed (e.g., 'final_main_dataset.tsv')
# Path used when nothing is uploaded or when the notebook is not running on Colab.
DATA_PATH = "final_main_dataset.tsv"

try:
    from google.colab import files  # type: ignore
except ImportError:
    # A missing module is the only condition that means "not on Colab".
    files = None
    print("Not on Colab or files.upload not available. Set DATA_PATH manually if needed.")

if files is not None:
    print("Use the dialog to upload your TSV (e.g., final_main_dataset.tsv)")
    try:
        uploaded = files.upload()
    except Exception as e:
        # Best effort: keep the default path, but show why the upload failed
        # instead of silently swallowing the error.
        print(f"Upload failed ({e!r}). Set DATA_PATH manually if needed.")
    else:
        if uploaded:
            # Same choice as before: the first uploaded file name.
            DATA_PATH = next(iter(uploaded))

print("DATA_PATH:", DATA_PATH)


In [None]:

# (B) Load from Google Drive (optional)
# from google.colab import drive  # type: ignore
# drive.mount('/content/drive')
# DATA_PATH = '/content/drive/MyDrive/path/to/final_main_dataset.tsv'
# print("DATA_PATH:", DATA_PATH)



### Inspect the TSV
We expect a **`sentence`** column (as in Common Voice Urdu). If your column name differs, update `TEXT_COL` below.


In [None]:

# Parse the tab-separated file, list its column names, and preview the first rows.
df = pd.read_csv(DATA_PATH, sep="\t")
column_names = df.columns.tolist()
print("Columns:", column_names)
df.head()


In [None]:

# Pick the text column (update if needed)
# Prefer the 'sentence' column; otherwise fall back to the first column of the file.
TEXT_COL = 'sentence' if 'sentence' in df.columns else df.columns[0]
# Drop missing cells before converting: astype(str) would otherwise turn each NaN
# into the literal text "nan", which would then be tokenized and paired like a real sentence.
texts = df[TEXT_COL].dropna().astype(str).tolist()
print("Using text column:", TEXT_COL, " | Total lines:", len(texts))



## 2) Urdu normalization & tokenization
Simple, assignment-friendly rules:
- remove diacritics
- unify select Arabic/Urdu variants
- whitespace/punctuation token split


In [None]:

# Combining marks removed during normalization: U+064B..U+0655 plus U+0670.
DIACRITICS = [chr(cp) for cp in range(0x064B, 0x0656)] + ["\u0670"]
import re
DIACRITICS_RE = re.compile("|".join(re.escape(mark) for mark in DIACRITICS))
# Punctuation captured as a group so the tokenizer can pad each mark with spaces.
PUNCT = r"([\,\.\!\?\؛\:\(\)\[\]\{\}«»\"'،۔:؛؟])"

# (variant, canonical) letter pairs, applied in this order by normalize_urdu.
_LETTER_VARIANTS = (
    ("آ", "ا"),
    ("أ", "ا"),
    ("إ", "ا"),
    ("ي", "ی"),
    ("ى", "ی"),
    ("ك", "ک"),
)

def normalize_urdu(text: str) -> str:
    """Return ``text`` in the canonical form used by the tokenizer.

    Steps, in order: strip outer whitespace, apply Unicode NFC, delete every
    mark listed in ``DIACRITICS``, replace each letter variant with its
    canonical letter, and collapse every whitespace run to one space.
    """
    cleaned = unicodedata.normalize("NFC", text.strip())
    cleaned = DIACRITICS_RE.sub("", cleaned)
    for variant, canonical in _LETTER_VARIANTS:
        cleaned = cleaned.replace(variant, canonical)
    return re.sub(r"\s+", " ", cleaned)

def tokenize_urdu(text: str) -> List[str]:
    """Split ``text`` into tokens after normalization.

    Every character matched by ``PUNCT`` is surrounded by spaces so it becomes
    its own token; the result is then split on single spaces. Text that is
    empty after normalization yields an empty list.
    """
    spaced = re.sub(PUNCT, r" \1 ", normalize_urdu(text))
    collapsed = re.sub(r"\s+", " ", spaced).strip()
    if not collapsed:
        return []
    return collapsed.split(" ")

# Quick sanity check:
# Show the first three raw sentences next to (at most) their first ten tokens.
for sample in texts[:3]:
    preview = tokenize_urdu(sample)[:10]
    print(sample, " -> ", preview)



## 3) Build synthetic dialogue pairs
We create `(input → response)` pairs from consecutive sentences. A pair is dropped when either side tokenizes to nothing, and each side is truncated to `MAX_LEN - 2` tokens.


In [None]:

MAX_LEN = 80  # tokens (incl. SOS/EOS later)

# Tokenize every sentence exactly once (the pairwise loop used to tokenize each
# sentence twice: once as an input and once as a response), leaving room for SOS/EOS.
tokenized = [tokenize_urdu(t)[:MAX_LEN-2] for t in texts]

# Pair each sentence with the one that follows it; skip pairs with an empty side.
raw_pairs = [(a, b) for a, b in zip(tokenized, tokenized[1:]) if a and b]

# Preview: pair count and the first pair (None instead of an IndexError when there are no pairs).
len(raw_pairs), (raw_pairs[0][:2] if raw_pairs else None)



## 4) Build vocabulary (from scratch)
Special tokens: `<pad>`, `<s>`, `</s>`, `<unk>`


In [None]:

PAD, SOS, EOS, UNK = "<pad>", "<s>", "</s>", "<unk>"

class Vocab:
    """Token <-> id mapping built from tokenized sentences.

    Ids 0..3 are always ``<pad>``, ``<s>``, ``</s>``, ``<unk>``. The remaining
    entries are corpus tokens in descending frequency order.

    Args:
        tokens: iterable of token lists.
        min_freq: tokens seen fewer than this many times are left out.
        max_size: upper bound on the vocabulary size, special tokens included.
    """
    def __init__(self, tokens, min_freq=2, max_size=30000):
        freq = Counter(tok for sent in tokens for tok in sent)
        self.itos = [PAD, SOS, EOS, UNK]
        # most_common() yields each token once, so only the special tokens can
        # collide; a set lookup avoids scanning the growing list for every token.
        reserved = set(self.itos)
        for tok, c in freq.most_common():
            if c < min_freq: break  # counts are sorted descending: nothing later qualifies
            if tok in reserved: continue
            if len(self.itos) >= max_size: break
            self.itos.append(tok)
        self.stoi = {s:i for i,s in enumerate(self.itos)}
    def encode(self, toks, add_sos_eos=True):
        """Map tokens to ids (unknown tokens -> ``<unk>``), optionally wrapped in SOS/EOS."""
        unk = self.stoi[UNK]
        ids = [self.stoi.get(t, unk) for t in toks]
        if add_sos_eos:
            return [self.stoi[SOS]] + ids + [self.stoi[EOS]]
        return ids
    def decode(self, ids):
        """Map ids back to tokens; any id outside ``[0, len(self))`` becomes ``<unk>``."""
        n = len(self.itos)
        # The lower bound matters: a negative id would otherwise index from the end of the list.
        return [self.itos[i] if 0 <= i < n else UNK for i in ids]
    def __len__(self): return len(self.itos)

# Inputs and responses each get their own vocabulary, built from their side of the pairs.
src_tokens = [pair[0] for pair in raw_pairs]
tgt_tokens = [pair[1] for pair in raw_pairs]

src_vocab = Vocab(src_tokens, min_freq=2, max_size=30000)
tgt_vocab = Vocab(tgt_tokens, min_freq=2, max_size=30000)

# Preview: both vocabulary sizes and the first 20 source entries.
len(src_vocab), len(tgt_vocab), src_vocab.itos[:20]



## 5) Dataset & DataLoader


In [None]:

class PairDataset(Dataset):
    """Dataset of ``(source_tokens, target_tokens)`` pairs encoded to id tensors.

    Args:
        pairs: sequence of ``(source_tokens, target_tokens)`` token lists.
        src_vocab: object whose ``encode(tokens, add_sos_eos=True)`` returns source ids.
        tgt_vocab: same, for the target side.
        max_len: maximum encoded length *including* SOS/EOS; longer token lists
            are truncated to ``max_len - 2`` tokens. ``None`` disables truncation.
    """
    def __init__(self, pairs, src_vocab, tgt_vocab, max_len=80):
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len
    def __len__(self): return len(self.pairs)
    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D ``torch.long`` tensors wrapped in SOS/EOS."""
        a,b = self.pairs[idx]
        if self.max_len is not None:
            # max_len used to be stored but never applied; enforce it here so the
            # dataset honors its own parameter even when callers did not pre-truncate.
            keep = max(self.max_len - 2, 0)
            a, b = a[:keep], b[:keep]
        s = torch.tensor(self.src_vocab.encode(a, add_sos_eos=True), dtype=torch.long)
        t = torch.tensor(self.tgt_vocab.encode(b, add_sos_eos=True), dtype=torch.long)
        return s,t

def pad_batch(samples, pad_idx):
    """Collate ``(src, tgt)`` id sequences into two right-padded 2-D long tensors.

    Each output has one row per sample and is as wide as the longest sequence
    on its side; unused cells hold ``pad_idx``.
    """
    src_width = max(len(pair[0]) for pair in samples)
    tgt_width = max(len(pair[1]) for pair in samples)
    rows = len(samples)

    src_out = torch.full((rows, src_width), pad_idx, dtype=torch.long)
    tgt_out = torch.full((rows, tgt_width), pad_idx, dtype=torch.long)
    for row, (src_seq, tgt_seq) in enumerate(samples):
        src_out[row, :len(src_seq)] = src_seq
        tgt_out[row, :len(tgt_seq)] = tgt_seq
    return src_out, tgt_out

# Wrap the token pairs and split them 80/10/10 with a seeded generator.
full_ds = PairDataset(raw_pairs, src_vocab, tgt_vocab, max_len=MAX_LEN)
N = len(full_ds)
val_frac, test_frac = 0.10, 0.10
n_test = int(N*test_frac)
n_val  = int(N*val_frac)
n_train = N - n_val - n_test
train_ds, val_ds, test_ds = random_split(
    full_ds,
    [n_train, n_val, n_test],
    generator=torch.Generator().manual_seed(SEED),
)

def collate(batch):
    """Pad one batch, using the target vocabulary's ``<pad>`` id for both sides."""
    return pad_batch(batch, pad_idx=tgt_vocab.stoi[PAD])

# Only the training loader shuffles.
train_it = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate)
val_it   = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=collate)
test_it  = DataLoader(test_ds, batch_size=64, shuffle=False, collate_fn=collate)

len(train_ds), len(val_ds), len(test_ds)



## 6) Transformer (from scratch)
Encoder–decoder with multi-head attention, positional encodings, masking.


In [None]:

def subsequent_mask(sz: int) -> torch.Tensor:
    """Return a ``(1, sz, sz)`` boolean mask that is True where column > row.

    True marks the positions a query must not attend to (keys that come later).
    """
    positions = torch.arange(sz)
    # Entry [row, col] is True exactly when col lies strictly after row.
    future = positions.unsqueeze(0) > positions.unsqueeze(1)
    return future.unsqueeze(0)

def make_pad_mask(seq: torch.Tensor, pad_idx: int) -> torch.Tensor:
    """Return a boolean mask that is True wherever ``seq`` equals ``pad_idx``.

    Two singleton axes are inserted at positions 1 and 2, so a ``(B, T)`` input
    yields a ``(B, 1, 1, T)`` mask.
    """
    is_pad = seq == pad_idx
    with_head_axis = is_pad.unsqueeze(1)
    return with_head_axis.unsqueeze(2)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to embeddings, then apply dropout.

    Args:
        d_model: embedding width (odd values are supported).
        max_len: number of positions precomputed; longer inputs are not supported.
        dropout: dropout probability applied after the addition.
    """
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        # For odd d_model there is one fewer cosine column than sine columns, so
        # trim the frequencies; for even d_model the slice is the whole vector.
        pe[:, 1::2] = torch.cos(pos * div[: d_model // 2])
        # Buffer (not a parameter): it follows .to(device) and is stored in the state dict.
        self.register_buffer("pe", pe.unsqueeze(0))
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        """Add the first ``x.size(1)`` position rows to ``x`` of shape ``(B, T, d_model)``."""
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention computed in ``heads`` parallel heads.

    Args:
        d_model: model width; must be divisible by ``heads``.
        heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``heads``.
    """
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        # Explicit check instead of ``assert``, which is stripped under ``python -O``.
        if d_model % heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by heads ({heads})")
        self.h = heads; self.d_k = d_model // heads
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)
        self.o = nn.Linear(d_model, d_model)
        self.drop = nn.Dropout(dropout)
    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` (B, Tq, d_model) over ``k``/``v`` (B, Tk, d_model).

        ``mask`` is an optional boolean tensor broadcastable to
        ``(B, heads, Tq, Tk)``; True marks positions that must be ignored.
        Returns a ``(B, Tq, d_model)`` tensor.
        """
        B, Tq, _ = q.shape; Tk = k.shape[1]
        # Project, then split the model dimension into (heads, d_k) and move heads forward.
        q = self.q(q).view(B, Tq, self.h, self.d_k).transpose(1,2)
        k = self.k(k).view(B, Tk, self.h, self.d_k).transpose(1,2)
        v = self.v(v).view(B, Tk, self.h, self.d_k).transpose(1,2)
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Use the most negative finite value rather than -inf: masked entries
            # still get zero weight, but a row whose keys are ALL masked becomes a
            # uniform distribution instead of NaN (softmax over all -inf is NaN).
            scores = scores.masked_fill(mask, torch.finfo(scores.dtype).min)
        attn = torch.softmax(scores, dim=-1)
        # Merge the heads back into a single d_model-wide vector per position.
        ctx = (self.drop(attn) @ v).transpose(1,2).contiguous().view(B, Tq, self.h*self.d_k)
        return self.o(ctx)

class FeedForward(nn.Module):
    """Position-wise MLP: Linear(d_model -> d_ff), ReLU, Dropout, Linear(d_ff -> d_model)."""
    def __init__(self, d_model, d_ff=1024, dropout=0.1):
        super().__init__()
        # Stage order fixes the state-dict keys (net.0.*, net.3.*).
        stages = [
            nn.Linear(d_model, d_ff),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each followed by
    dropout, a residual addition and LayerNorm (post-norm)."""
    def __init__(self, d_model, heads, d_ff, dropout):
        super().__init__()
        self.sa = MultiHeadAttention(d_model, heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.n1 = nn.LayerNorm(d_model)
        self.n2 = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, x, src_mask):
        """Transform ``x``; ``src_mask`` is passed to self-attention as its mask."""
        attended = self.sa(x, x, x, mask=src_mask)
        x = self.n1(x + self.drop(attended))
        transformed = self.ff(x)
        return self.n2(x + self.drop(transformed))

class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention over the encoder
    memory, then feed-forward; each sub-layer is followed by dropout, a residual
    addition and LayerNorm (post-norm)."""
    def __init__(self, d_model, heads, d_ff, dropout):
        super().__init__()
        self.sa = MultiHeadAttention(d_model, heads, dropout)
        self.ca = MultiHeadAttention(d_model, heads, dropout)
        self.ff = FeedForward(d_model, d_ff, dropout)
        self.n1 = nn.LayerNorm(d_model)
        self.n2 = nn.LayerNorm(d_model)
        self.n3 = nn.LayerNorm(d_model)
        self.drop = nn.Dropout(dropout)

    def forward(self, x, mem, tgt_mask, tgt_pad_mask, src_mask):
        """Transform ``x`` given encoder output ``mem``.

        Self-attention is blocked wherever either ``tgt_mask`` or
        ``tgt_pad_mask`` is True; cross-attention uses ``src_mask``.
        """
        self_mask = tgt_mask | tgt_pad_mask
        x = self.n1(x + self.drop(self.sa(x, x, x, mask=self_mask)))
        cross = self.ca(x, mem, mem, mask=src_mask)
        x = self.n2(x + self.drop(cross))
        return self.n3(x + self.drop(self.ff(x)))

class Encoder(nn.Module):
    """Embedding + positional encoding followed by a stack of ``EncoderLayer``s.

    ``vocab`` is the source vocabulary size; embedding row 0 is the padding row.
    """
    def __init__(self, vocab, d_model, layers, heads, d_ff, dropout):
        super().__init__()
        self.emb = nn.Embedding(vocab, d_model, padding_idx=0)
        self.pe = PositionalEncoding(d_model, dropout=dropout)
        stack = [EncoderLayer(d_model, heads, d_ff, dropout) for _ in range(layers)]
        self.layers = nn.ModuleList(stack)

    def forward(self, src, src_mask):
        """Encode source ids ``src``; ``src_mask`` is handed to every layer."""
        # Embeddings are scaled by sqrt(d_model) before the position signal is added.
        scale = math.sqrt(self.emb.embedding_dim)
        hidden = self.pe(self.emb(src) * scale)
        for layer in self.layers:
            hidden = layer(hidden, src_mask)
        return hidden

class Decoder(nn.Module):
    """Embedding + positional encoding, a stack of ``DecoderLayer``s, and a final
    linear projection to vocabulary logits.

    ``vocab`` is the target vocabulary size; embedding row 0 is the padding row.
    """
    def __init__(self, vocab, d_model, layers, heads, d_ff, dropout):
        super().__init__()
        self.emb = nn.Embedding(vocab, d_model, padding_idx=0)
        self.pe = PositionalEncoding(d_model, dropout=dropout)
        stack = [DecoderLayer(d_model, heads, d_ff, dropout) for _ in range(layers)]
        self.layers = nn.ModuleList(stack)
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, tgt, mem, tgt_mask, tgt_pad_mask, src_mask):
        """Return logits over the target vocabulary for every position of ``tgt``."""
        # Embeddings are scaled by sqrt(d_model) before the position signal is added.
        scale = math.sqrt(self.emb.embedding_dim)
        hidden = self.pe(self.emb(tgt) * scale)
        for layer in self.layers:
            hidden = layer(hidden, mem, tgt_mask, tgt_pad_mask, src_mask)
        return self.proj(hidden)

class TransformerSeq2Seq(nn.Module):
    """Encoder-decoder Transformer that builds its own padding and causal masks.

    Padding is identified by token id 0 (``self.pad_idx``).
    """
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, enc_layers=2, dec_layers=2, heads=2, d_ff=1024, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(src_vocab_size, d_model, enc_layers, heads, d_ff, dropout)
        self.decoder = Decoder(tgt_vocab_size, d_model, dec_layers, heads, d_ff, dropout)
        self.pad_idx = 0

    def forward(self, src, tgt_inp):
        """Return decoder logits for ``tgt_inp`` conditioned on ``src``."""
        pad = self.pad_idx
        # Source side: hide padding, then encode.
        src_mask = make_pad_mask(src, pad)
        mem = self.encoder(src, src_mask)
        # Target side: hide padding and every later position.
        tgt_pad = make_pad_mask(tgt_inp, pad)
        causal = subsequent_mask(tgt_inp.size(1)).to(tgt_inp.device)
        return self.decoder(tgt_inp, mem, causal, tgt_pad, src_mask)



## 7) Metrics
BLEU, ROUGE-L, chrF, Perplexity.


In [None]:

def ngrams(seq, n):
    """Return every contiguous length-``n`` window of ``seq`` as a tuple, in order."""
    return [tuple(seq[i:i+n]) for i in range(len(seq)-n+1)]

def bleu_score(ref: List[str], hyp: List[str], max_n=4, smooth=1e-9):
    """Sentence-level BLEU of ``hyp`` against a single reference ``ref``.

    Args:
        ref: reference tokens.
        hyp: hypothesis tokens.
        max_n: highest n-gram order; orders 1..max_n are weighted equally.
        smooth: constant added to the numerator and denominator of each precision.

    Returns:
        Brevity penalty times the geometric mean of the smoothed clipped
        precisions; 0.0 for an empty hypothesis or when any precision is zero.
    """
    weights = [1.0/max_n]*max_n
    hyp_len, ref_len = len(hyp), len(ref)
    if hyp_len == 0: return 0.0
    precisions = []
    for n in range(1, max_n+1):
        h = Counter(ngrams(hyp, n)); r = Counter(ngrams(ref, n))
        # Clipped matches: a hypothesis n-gram counts at most as often as it occurs in the reference.
        overlap = sum(min(c, r[g]) for g,c in h.items())
        total = max(sum(h.values()), 1)
        p = (overlap + smooth) / (total + smooth)
        # With smooth=0 an order without matches has precision 0 and log(0) would
        # raise; the geometric mean is 0 in that case, so return it directly.
        if p <= 0.0: return 0.0
        precisions.append(p)
    geo = math.exp(sum(w*math.log(p) for w,p in zip(weights, precisions)))
    # Brevity penalty: only hypotheses that are not longer than the reference are penalized.
    bp = 1.0 if hyp_len > ref_len else math.exp(1 - ref_len/max(hyp_len,1))
    return bp * geo

def rouge_l(ref: List[str], hyp: List[str]):
    """ROUGE-L F1 between ``ref`` and ``hyp``, based on their longest common subsequence.

    Returns 0.0 when the two sequences share no token.
    """
    m, n = len(ref), len(hyp)
    # Rolling-row LCS: ``prev`` is the finished row for the previous reference
    # token, ``cur`` is the row being filled for the current one.
    prev = [0]*(n+1)
    for ref_tok in ref:
        cur = [0]
        for j, hyp_tok in enumerate(hyp):
            if ref_tok == hyp_tok:
                cur.append(prev[j] + 1)
            else:
                cur.append(max(prev[j+1], cur[j]))
        prev = cur
    lcs = prev[n]
    if lcs == 0:
        return 0.0
    prec, rec = lcs/max(n,1), lcs/max(m,1)
    if (prec+rec) == 0:
        return 0.0
    return (2*prec*rec)/(prec+rec)

def chrf(ref: str, hyp: str, n=6, beta=2.0):
    """Character n-gram F-score between ``ref`` and ``hyp`` (spaces ignored).

    For each order 1..n that occurs in BOTH strings, an F-beta score is computed
    from the n-gram overlap; the result is the mean over those orders.

    Args:
        ref: reference string.
        hyp: hypothesis string.
        n: highest character n-gram order.
        beta: recall weight of the F-score.

    Returns:
        A float in [0, 1]; 0.0 when no order occurs in both strings.
    """
    def counts(s, k):
        # Multiset of all length-k character windows of s.
        return Counter(s[i:i+k] for i in range(len(s)-k+1))

    ref_chars, hyp_chars = ref.replace(" ", ""), hyp.replace(" ", "")
    beta_sq = beta*beta
    Fs = []
    for k in range(1, n+1):
        R, H = counts(ref_chars, k), counts(hyp_chars, k)
        r_tot, h_tot = sum(R.values()), sum(H.values())
        # An order longer than one of the strings carries no evidence. Skipping it
        # (instead of scoring it 0) keeps identical short strings at 1.0 rather
        # than dragging the mean towards zero.
        if r_tot == 0 or h_tot == 0: continue
        overlap = sum((R & H).values())
        prec, rec = overlap / h_tot, overlap / r_tot
        Fs.append(0.0 if (prec+rec)==0 else (1+beta_sq)*prec*rec/(beta_sq*prec+rec))
    return float(np.mean(Fs)) if Fs else 0.0

def perplexity(loss):
    """Return ``exp(loss)`` as a float, with the exponent capped at 20."""
    capped = loss if loss < 20 else 20
    return float(math.exp(capped))



## 8) Training / Evaluation / Decoding


In [None]:

def evaluate(model, data_iter, tgt_vocab, device):
    """Score ``model`` on every batch of ``data_iter``.

    Perplexity comes from the teacher-forced cross-entropy summed over non-pad
    target tokens. BLEU, ROUGE-L and chrF compare each greedy-decoded
    hypothesis with its reference and are averaged over all samples.

    Returns:
        dict with keys ``"BLEU"``, ``"ROUGE_L"``, ``"chrF"`` and ``"Perplexity"``.
    """
    model.eval()
    pad = tgt_vocab.stoi[PAD]
    summed_ce = nn.CrossEntropyLoss(ignore_index=pad, reduction="sum")
    specials = (PAD, SOS, EOS)

    nll_sum, token_count = 0.0, 0
    bleu_vals, rouge_vals, chrf_vals = [], [], []

    with torch.no_grad():
        for src, tgt in data_iter:
            src = src.to(device)
            tgt = tgt.to(device)
            # Teacher forcing: feed tokens [0..T-1], predict tokens [1..T].
            inp = tgt[:, :-1]
            gold = tgt[:, 1:]

            logits = model(src, inp)
            batch, steps, vocab_size = logits.shape
            flat_logits = logits.view(batch*steps, vocab_size)
            nll_sum += summed_ce(flat_logits, gold.reshape(batch*steps)).item()
            token_count += (gold != pad).sum().item()

            hyps = greedy_decode(model, src, tgt_vocab, max_len=inp.size(1)+20, device=device)
            for row in range(src.size(0)):
                decoded_ref = tgt_vocab.decode(gold[row].tolist())
                ref_toks = [tok for tok in decoded_ref if tok not in specials]
                hyp_toks = hyps[row]
                bleu_vals.append(bleu_score(ref_toks, hyp_toks))
                rouge_vals.append(rouge_l(ref_toks, hyp_toks))
                chrf_vals.append(chrf("".join(ref_toks), "".join(hyp_toks)))

    def _mean(values):
        # Mean of a metric list, or 0.0 when nothing was scored.
        return float(np.mean(values)) if values else 0.0

    return {
        "BLEU": _mean(bleu_vals),
        "ROUGE_L": _mean(rouge_vals),
        "chrF": _mean(chrf_vals),
        "Perplexity": perplexity(nll_sum / max(token_count,1)),
    }

def train_loop(model, train_it, val_it, tgt_vocab, epochs=8, lr=3e-4, save_dir="./runs_urdu_bot"):
    """Train ``model`` with teacher forcing and checkpoint the best validation BLEU.

    Each epoch runs one pass over ``train_it`` with Adam, evaluates on
    ``val_it`` via ``evaluate``, prints the metrics, and saves the model state
    to ``<save_dir>/best_bleu.pt`` whenever validation BLEU strictly improves.

    Args:
        model: module called as ``model(src, tgt_input)`` that returns logits.
        train_it: iterable of ``(src, tgt)`` id batches used for training.
        val_it: iterable of ``(src, tgt)`` id batches used for validation.
        tgt_vocab: target vocabulary; its ``<pad>`` id is ignored by the loss.
        epochs: number of passes over ``train_it``.
        lr: Adam learning rate.
        save_dir: directory for the checkpoint (created if missing).

    Note:
        Batches are moved to the module-level ``device`` variable, which is not
        a parameter of this function.
    """
    os.makedirs(save_dir, exist_ok=True)
    pad = tgt_vocab.stoi[PAD]
    # Padding positions contribute nothing to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=pad)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    # Starts below any possible BLEU so the first epoch always saves a checkpoint.
    best_bleu = -1.0
    ckpt = os.path.join(save_dir, "best_bleu.pt")

    for ep in range(1, epochs+1):
        # evaluate() leaves the model in eval mode, so re-enable training mode each epoch.
        model.train()
        running, steps = 0.0, 0
        for src, tgt in train_it:
            src, tgt = src.to(device), tgt.to(device)
            # Teacher forcing: the decoder sees tokens [0..T-1] and predicts tokens [1..T].
            inp, gold = tgt[:, :-1], tgt[:, 1:]
            logits = model(src, inp)
            B,T,V = logits.shape
            loss = criterion(logits.view(B*T, V), gold.reshape(B*T))

            opt.zero_grad()
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()

            running += loss.item(); steps += 1

        val_metrics = evaluate(model, val_it, tgt_vocab, device)
        # train_loss is the mean of the per-batch losses of this epoch.
        print(f"[Epoch {ep}] train_loss={running/max(steps,1):.4f} | "
              f"val_BLEU={val_metrics['BLEU']:.4f} ROUGE_L={val_metrics['ROUGE_L']:.4f} "
              f"chrF={val_metrics['chrF']:.4f} PPL={val_metrics['Perplexity']:.2f}")

        # Strict comparison: a tie with the best score does not overwrite the checkpoint.
        if val_metrics["BLEU"] > best_bleu:
            best_bleu = val_metrics["BLEU"]
            torch.save({"model_state": model.state_dict()}, ckpt)
            print("  -> Saved new best checkpoint:", ckpt)

def greedy_decode(model, src, tgt_vocab, max_len=60, device="cpu"):
    """Decode a batch by taking the most likely next token at every step.

    Args:
        model: object exposing ``encoder(src, src_mask)`` and
            ``decoder(tgt, mem, causal_mask, tgt_pad_mask, src_mask)``.
        src: ``(B, S)`` tensor of source ids.
        tgt_vocab: target vocabulary (provides the special-token ids and ``itos``).
        max_len: maximum number of decoding steps.
        device: device on which the growing target tensor is created.

    Returns:
        A list of ``B`` token lists. Each list stops before that sample's first
        ``</s>`` (or after ``max_len`` tokens) and never contains ``</s>``.
    """
    model.eval()
    pad = tgt_vocab.stoi[PAD]; sos = tgt_vocab.stoi[SOS]; eos = tgt_vocab.stoi[EOS]
    itos = tgt_vocab.itos
    with torch.no_grad():
        src_mask = make_pad_mask(src, pad)
        mem = model.encoder(src, src_mask)
        B = src.size(0)
        ys = torch.full((B,1), sos, dtype=torch.long, device=device)
        # Plain Python flags: reading a bool tensor element per sample inside the
        # loop forces a host/device synchronization for every read on CUDA.
        finished = [False] * B
        outs = [[] for _ in range(B)]
        for _ in range(max_len):
            tgt_pad = make_pad_mask(ys, pad)
            causal = subsequent_mask(ys.size(1)).to(device)
            # The whole prefix is re-decoded each step; only the last position is used.
            logits = model.decoder(ys, mem, causal, tgt_pad, src_mask)
            nxt = logits[:,-1,:].argmax(-1)
            ys = torch.cat([ys, nxt.unsqueeze(1)], dim=1)
            # A single .tolist() transfers all next tokens at once.
            for i,t in enumerate(nxt.tolist()):
                if finished[i]:
                    continue
                if t == eos:
                    finished[i] = True
                else:
                    outs[i].append(itos[t] if t < len(itos) else UNK)
            if all(finished): break
        return outs

def beam_search_decode(model, src, tgt_vocab, beam=4, max_len=60, device="cpu"):
    """Decode each sample of ``src`` independently with beam search.

    Hypotheses are ranked by summed token log-probability (no length
    normalization). A hypothesis that has produced ``</s>`` is carried forward
    unchanged, and the search for a sample stops once every kept hypothesis has
    ended or after ``max_len`` steps.

    Args:
        model: object exposing ``encoder(src, src_mask)`` and
            ``decoder(tgt, mem, causal_mask, tgt_pad_mask, src_mask)``.
        src: ``(B, S)`` tensor of source ids.
        tgt_vocab: target vocabulary (provides the special-token ids and ``itos``).
        beam: number of hypotheses kept per step; must be at least 1.
        max_len: maximum number of expansion steps.
        device: device on which the candidate prefixes are created.

    Returns:
        A list of ``B`` token lists (best hypothesis per sample, without ``<s>``/``</s>``).

    Raises:
        ValueError: if ``beam`` is smaller than 1.
    """
    if beam < 1:
        raise ValueError(f"beam must be >= 1, got {beam}")
    pad = tgt_vocab.stoi[PAD]; sos = tgt_vocab.stoi[SOS]; eos = tgt_vocab.stoi[EOS]
    model.eval()
    with torch.no_grad():
        src_mask = make_pad_mask(src, pad)
        mem = model.encoder(src, src_mask)
        B = src.size(0)
        results = [[] for _ in range(B)]
        for b in range(B):
            # Each beam entry is (cumulative log-probability, token-id list).
            beams = [(0.0, [sos])]
            for _ in range(max_len):
                new_beams = []
                for lp, seq in beams:
                    if seq[-1] == eos:
                        # Finished hypotheses compete unchanged with the expansions.
                        new_beams.append((lp, seq)); continue
                    ys = torch.tensor(seq, dtype=torch.long, device=device).unsqueeze(0)
                    tgt_pad = make_pad_mask(ys, pad)
                    causal = subsequent_mask(ys.size(1)).to(device)
                    logits = model.decoder(ys, mem[b:b+1], causal, tgt_pad, src_mask[b:b+1])
                    logp = torch.log_softmax(logits[:,-1,:], dim=-1).squeeze(0)
                    # topk fails when k exceeds the number of classes, so clamp it
                    # for vocabularies smaller than the requested beam width.
                    topk = torch.topk(logp, min(beam, logp.size(-1)))
                    for kprob, idx in zip(topk.values.tolist(), topk.indices.tolist()):
                        new_beams.append((lp + kprob, seq + [idx]))
                beams = sorted(new_beams, key=lambda x:x[0], reverse=True)[:beam]
                if all(s[-1]==eos for _,s in beams): break
            best = max(beams, key=lambda x:x[0])[1]
            # Drop the leading <s> and any </s>.
            toks = [t for t in best[1:] if t != eos]
            results[b] = [tgt_vocab.itos[t] if t < len(tgt_vocab.itos) else UNK for t in toks]
        return results



## 9) Train
You can tweak hyperparameters below for speed vs. quality.


In [None]:

# Hyperparameters
D_MODEL = 256
HEADS = 2
ENC_LAYERS = 2
DEC_LAYERS = 2
D_FF = 1024
DROPOUT = 0.1
EPOCHS = 8      # Increase for better results if you have time/GPU
BATCH = 64
LR = 3e-4
SAVE_DIR = "./urdu_runs"

# The loaders built in the DataLoader cell hard-code batch_size=64, so BATCH had
# no effect. Rebuild the training loader here so that changing BATCH actually
# changes the training batch size (with BATCH = 64 nothing changes).
train_it = DataLoader(train_ds, batch_size=BATCH, shuffle=True, collate_fn=collate)

model = TransformerSeq2Seq(
    src_vocab_size=len(src_vocab),
    tgt_vocab_size=len(tgt_vocab),
    d_model=D_MODEL, enc_layers=ENC_LAYERS, dec_layers=DEC_LAYERS,
    heads=HEADS, d_ff=D_FF, dropout=DROPOUT
).to(device)

train_loop(model, train_it, val_it, tgt_vocab, epochs=EPOCHS, lr=LR, save_dir=SAVE_DIR)



## 10) Evaluate on Test Set
(Loads best checkpoint by validation BLEU if present.)


In [None]:

# Restore the best-validation-BLEU weights when a checkpoint file exists.
ckpt = os.path.join(SAVE_DIR, "best_bleu.pt")
if os.path.exists(ckpt):
    state = torch.load(ckpt, map_location=device)
    model.load_state_dict(state["model_state"])
    print("Loaded best checkpoint.")

# Score the (possibly restored) model on the held-out test split.
test_metrics = evaluate(model, test_it, tgt_vocab, device)
report_keys = ("BLEU", "ROUGE_L", "chrF", "Perplexity")
print("[TEST] BLEU={:.4f} ROUGE_L={:.4f} chrF={:.4f} PPL={:.2f}".format(
    *(test_metrics[key] for key in report_keys)
))



## 11) Try chatting (Greedy / Beam)
Enter an Urdu sentence; the model will generate the next-utterance reply.


In [None]:

def encode_single(sentence, src_vocab, max_len=80):
    """Tokenize one sentence and return it as a ``(1, T)`` long tensor of source ids.

    At most ``max_len - 2`` tokens are kept before SOS/EOS are added. The tensor
    is moved to the module-level ``device``.
    """
    tokens = tokenize_urdu(sentence)
    token_ids = src_vocab.encode(tokens[:max_len-2], add_sos_eos=True)
    batch = torch.tensor([token_ids], dtype=torch.long)
    return batch.to(device)

def reply(text, beam=0):
    """Generate a response to ``text`` with the module-level ``model``.

    Beam search of width ``beam`` is used when ``beam`` is a positive number;
    otherwise decoding is greedy. Returns the tokens joined by single spaces.
    """
    src = encode_single(text, src_vocab)
    if not (beam and beam > 0):
        decoded = greedy_decode(model, src, tgt_vocab, device=device)
    else:
        decoded = beam_search_decode(model, src, tgt_vocab, beam=beam, device=device)
    first_reply = decoded[0]
    return " ".join(first_reply)

# One demo exchange, decoded with beam width 4.
prompt = "آپ کیسے ہیں؟"
print("Example:")
print("You:", prompt)
print("Bot:", reply(prompt, beam=4))



## 12) Save vocab (optional)
To reuse the same vocabulary later.


In [None]:

import pickle, pathlib

# Make sure the output directory exists, then dump both vocabularies as
# {"itos": [...], "stoi": {...}} dictionaries.
pathlib.Path(SAVE_DIR).mkdir(parents=True, exist_ok=True)
for fname, vocab in (("src_vocab.pkl", src_vocab), ("tgt_vocab.pkl", tgt_vocab)):
    with open(os.path.join(SAVE_DIR, fname), "wb") as f:
        pickle.dump({"itos": vocab.itos, "stoi": vocab.stoi}, f)
print("Saved:", os.path.join(SAVE_DIR, "src_vocab.pkl"), "and tgt_vocab.pkl")
