# Evaluate Transformer BLEU
This notebook loads the saved vocab and model, runs generation on a subset of the dataset, and computes corpus BLEU using utils.bleu_score.

In [1]:
import os
import torch
import pandas as pd
from utils import pickle as load_pickle, bleu_score
from transformer import TransformerWithPhrase, TransformerConfig
from preprocessing import preprocess_with_phrases, extract_7_phrases

# Evaluation settings shared by the cells below.
sequence_len = 128  # length that encoded inputs are truncated / padded to
min_len = 5         # forwarded to preprocess_with_phrases
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [3]:
# Load vocab pickles (must match the model used during training).
# NOTE(security): torch.load unpickles its input (and load_pickle presumably
# does too), which can execute arbitrary code -- only load files you trust.
assert os.path.exists('ch2i.pkl'), "ch2i.pkl not found"
assert os.path.exists('phrase2idx.pkl'), "phrase2idx.pkl not found"
ch2i = load_pickle('ch2i.pkl')
phrase2idx = load_pickle('phrase2idx.pkl')
# Inverse mapping (index -> character) used to decode model output.
i2ch = {i: c for c, i in ch2i.items()}

# Instantiate model config and model. These hyperparameters have to describe
# the same architecture as the checkpoint, otherwise load_state_dict fails.
mconfig = TransformerConfig(
    vocab_size=len(ch2i),
    sequence_len=sequence_len,
    nblock=4,
    nhead=8,
    embed_dim=256,
    phrase_emb_dim=16,
)
model = TransformerWithPhrase(mconfig, phrase_vocab_size=len(phrase2idx))

# Try the candidate checkpoint paths in order and stop at the first one that
# actually loads. A checkpoint that exists but is incompatible is reported and
# skipped so the remaining candidates still get a chance.
ckpt_paths = ['model.pth', 'model_checkpoint.pth', 'model/transformer.pth']
loaded_ckpt = None
for p in ckpt_paths:
    if not os.path.exists(p):
        continue
    state = torch.load(p, map_location=device)
    # The weights may be saved directly, or wrapped under a
    # 'model_state_dict' / 'state_dict' key of a training checkpoint.
    if isinstance(state, dict) and 'model_state_dict' in state:
        state = state['model_state_dict']
    elif isinstance(state, dict) and 'state_dict' in state:
        state = state['state_dict']
    try:
        model.load_state_dict(state)
    except Exception as err:
        # best-effort: skip if incompatible, but say why
        print(f'Could not load checkpoint {p} into model; skipping. ({err})')
        continue
    # Only report success when load_state_dict really succeeded.
    print('Loaded checkpoint:', p)
    loaded_ckpt = p
    break

if loaded_ckpt is None:
    # Without trained weights every score computed below is meaningless.
    print('WARNING: no checkpoint could be loaded; the model has randomly initialised weights.')

model.to(device)
model.eval()

Loaded checkpoint: model.pth


TransformerWithPhrase(
  (token_embedding): Embedding(790, 240)
  (phrase_embedding): Embedding(8, 16)
  (pos_embedding): Embedding(128, 256)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
         

In [4]:
# Load dataset and preprocess (use same preprocessing as training)
df = pd.read_csv('hindi_english_parallel.csv')

# Optionally limit number of evaluation samples to speed up evaluation
max_eval = 200

# Drop rows where either side is missing BEFORE converting to str: astype(str)
# turns a missing cell into the literal text 'nan', which would otherwise be
# evaluated as if it were a real sentence pair. `df` itself is left untouched.
eval_pairs = df.dropna(subset=['english', 'hindi']).head(max_eval)
en_texts = eval_pairs['english'].astype(str).tolist()
hi_texts = eval_pairs['hindi'].astype(str).tolist()

en_proc, hi_proc, phrase_tags = preprocess_with_phrases(en_texts, hi_texts, min_len, sequence_len)


In [5]:
# Generation + BLEU collection
def encode_source(s, ch2i, seq_len):
    """Turn the characters of ``s`` into a fixed-length list of ids.

    Each character is looked up in ``ch2i``; characters that are not in the
    mapping become 0. The result is cut to ``seq_len`` entries when longer and
    right-padded with 0 when shorter.
    """
    token_ids = []
    for ch in s:
        token_ids.append(ch2i.get(ch, 0))
    # A non-positive pad count multiplies to an empty list, i.e. no padding.
    pad_count = seq_len - len(token_ids)
    return token_ids[:seq_len] + [0] * pad_count

def encode_phrases(tags, phrase2idx, seq_len):
    """Turn a sequence of phrase tags into a fixed-length list of ids.

    Tags missing from ``phrase2idx`` map to 0. The output is truncated to
    ``seq_len`` entries or right-padded with 0 up to ``seq_len``.
    """
    tag_ids = list(map(lambda tag: phrase2idx.get(tag, 0), tags))
    # Append seq_len zeros and keep the first seq_len entries: long inputs are
    # truncated, short ones end up zero-padded.
    padded = tag_ids + [0] * seq_len
    return padded[:seq_len]

def decode_preds(pred_indices, i2ch):
    """Convert predicted indices back into a string.

    ``pred_indices`` is a list or 1D numpy array of ints. Index 0 (the value
    the encoders in this notebook pad with) is dropped, and indices that are
    not in ``i2ch`` contribute nothing to the output.
    """
    pieces = []
    for raw in pred_indices:
        idx = int(raw)
        if idx == 0:
            continue
        pieces.append(i2ch.get(idx, ''))
    return ''.join(pieces)

references = []
candidates = []

# Pair every preprocessed source with the target returned by the SAME
# preprocess_with_phrases call. Zipping en_proc with the raw hi_texts silently
# misaligns (and truncates) source/reference pairs whenever preprocessing
# drops examples.
# NOTE(review): assumes hi_proc is a list of plain-text targets parallel to
# en_proc -- confirm against preprocessing.preprocess_with_phrases.
assert len(en_proc) == len(hi_proc) == len(phrase_tags), (
    'preprocess_with_phrases returned lists of different lengths'
)

# Inference only: one no_grad context around the whole loop.
with torch.no_grad():
    for src_proc, tgt_ref, tags in zip(en_proc, hi_proc, phrase_tags):
        src_ids = encode_source(src_proc, ch2i, sequence_len)
        p_ids = encode_phrases(tags, phrase2idx, sequence_len)
        # Batch of one: wrap the id lists in an outer list.
        src_tensor = torch.tensor([src_ids], dtype=torch.long, device=device)
        p_tensor = torch.tensor([p_ids], dtype=torch.long, device=device)
        pred = model.generate(src_tensor, p_tensor, max_len=sequence_len, start_token=1)  # shape (1, L)
        pred_np = pred[0].cpu().numpy()
        candidates.append(decode_preds(pred_np, i2ch))
        # utils.bleu_score expects references as list-of-lists of reference sentences
        references.append([tgt_ref])


In [6]:
# Compute BLEU using utils.bleu_score
bleu1, bleu2, bleu3, bleu4 = bleu_score(references, candidates)
print(f'BLEU-1: {bleu1:.2f}, BLEU-2: {bleu2:.2f}, BLEU-3: {bleu3:.2f}, BLEU-4: {bleu4:.2f}')

# Optionally print a few examples.
# candidates[i] was generated from en_proc[i] and scored against
# references[i], so the examples are read from those lists. Indexing the raw
# en_texts / hi_texts would show the wrong pair whenever preprocessing dropped
# an example. SRC therefore shows the preprocessed source text.
for i in range(min(10, len(candidates))):
    print('SRC:', en_proc[i])
    print('REF:', references[i][0])
    print('PRED:', candidates[i])
    print('---')

BLEU-1: 0.00, BLEU-2: 0.00, BLEU-3: 0.00, BLEU-4: 0.00
SRC: Give your application an accessibility workout
REF: अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
PRED: --------------------------------------------------------------------------------------------------------------------------------
---
SRC: Accerciser Accessibility Explorer
REF: एक्सेर्साइसर पहुंचनीयता अन्वेषक
PRED: --------------------------------------------------------------------------------------------------------------------------------
---
SRC: The default plugin layout for the bottom panel
REF: निचले पटल के लिए डिफोल्ट प्लग-इन खाका
PRED: --------------------------------------------------------------------------------------------------------------------------------
---
SRC: The default plugin layout for the top panel
REF: ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
PRED: --------------------------------------------------------------------------------------------------------------------------------
---
SRC: A list of plugins