**If you are using Colab, please run the code cell below this text first.**

In [None]:
# Install PyTorch and NLP-related libraries (datasets, transformers, metrics) for the notebook.
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip -q install datasets sentencepiece sacrebleu rouge-score transformers evaluate tqdm numpy pandas pyyaml triton

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
# Import core libraries (PyTorch, datasets, tqdm, etc.) and select CUDA or CPU device.
import os, time, math, random, json
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from dataclasses import dataclass
from typing import List, Dict
from tqdm.auto import tqdm

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)

Device: cuda


In [None]:
# Utility helpers: seeding for reproducibility, timing, GPU memory tracking, parameter counting, and loss statistics.
def set_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (CPU and all CUDA devices), then sets ``cudnn.deterministic = True`` and
    ``cudnn.benchmark = False``.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seed_fn in seeders:
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def timeit():
    """Start a stopwatch and return a zero-argument callable.

    Each call of the returned function gives the seconds elapsed (per
    ``time.perf_counter``) since ``timeit()`` itself was called.
    """
    started_at = time.perf_counter()

    def elapsed():
        return time.perf_counter() - started_at

    return elapsed

def mem_mb():
    """Return ``torch.cuda.max_memory_allocated()`` in MiB, rounded to 2 decimals.

    Returns 0.0 when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return 0.0
    peak_bytes = torch.cuda.max_memory_allocated()
    return round(peak_bytes / (1024 * 1024), 2)

def count_params(m):
    """Return the total element count of the parameters of ``m`` that require gradients."""
    total = 0
    for param in m.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

class LossTracker:
    """Collects scalar values and reports their mean and variance via NumPy."""

    def __init__(self):
        # Raw history of every value passed to update(), stored as Python floats.
        self.v = []

    def update(self, x):
        """Append ``x`` to the history after converting it to ``float``."""
        self.v.append(float(x))

    @property
    def mean(self):
        """``np.mean`` of the recorded values, or 0.0 when nothing was recorded."""
        if not self.v:
            return 0.0
        return float(np.mean(self.v))

    @property
    def var(self):
        """``np.var`` of the recorded values, or 0.0 when nothing was recorded."""
        if not self.v:
            return 0.0
        return float(np.var(self.v))

In [None]:
# Data pipeline: SentencePiece BPE tokenizer, MTExample dataclass, dataset wrapper, collation, and DataLoader builders.
import datasets as hfds, sentencepiece as spm

class JointBPETokenizer:
    """SentencePiece BPE tokenizer with a single vocabulary for all texts it is trained on.

    The special ids are fixed: pad=0, bos=1, eos=2 (and unk=3 when training).
    """

    def __init__(self, work='work', vocab_size=8000):
        """Create the working directory and an empty SentencePiece processor.

        Args:
            work: Directory where the corpus file and the trained model are written.
            vocab_size: Requested vocabulary size; also part of the model file name.
        """
        os.makedirs(work, exist_ok=True)
        self.work = work
        self.vocab = vocab_size
        self.model_file = os.path.join(work, f'spm_{vocab_size}.model')
        self.sp = spm.SentencePieceProcessor()
        self._sp = {'pad': 0, 'bos': 1, 'eos': 2}

    def train(self, texts):
        """Load the model from ``self.model_file`` if it exists, otherwise train it on ``texts``.

        NOTE: an existing model file is reused as-is; it is not checked against
        ``texts``, so delete the file to force retraining on a different corpus.
        """
        if os.path.exists(self.model_file):
            self.sp.load(self.model_file)
            return
        corpus = os.path.join(self.work, 'corpus.txt')
        with open(corpus, 'w', encoding='utf-8') as f:
            for t in texts:
                f.write((t or '').strip() + '\n')
        spm.SentencePieceTrainer.Train(
            input=corpus, model_prefix=os.path.join(self.work, f'spm_{self.vocab}'),
            vocab_size=self.vocab, character_coverage=0.9995, model_type='bpe',
            pad_id=self._sp['pad'], bos_id=self._sp['bos'], eos_id=self._sp['eos'], unk_id=3
        )
        self.sp.load(self.model_file)

    def encode(self, text, add_bos=True, add_eos=True):
        """Encode ``text`` (``None`` is treated as '') to a list of ids, optionally wrapped in BOS/EOS."""
        ids = self.sp.encode(text or '', out_type=int)
        if add_bos:
            ids = [self._sp['bos']] + ids
        if add_eos:
            ids = ids + [self._sp['eos']]
        return ids

    def decode(self, ids):
        """Decode a sequence of ids to text.

        Decoding stops at the first EOS id: batched greedy decoding keeps
        producing tokens for a sentence until every sentence in the batch has
        finished, and those post-EOS tokens must not leak into the text.
        Pad/BOS ids before the EOS are dropped.
        """
        eos = self._sp['eos']
        special = set(self._sp.values())
        kept = []
        for i in ids:
            if i == eos:
                break
            if i not in special:
                kept.append(i)
        return self.sp.decode(kept)

    @property
    def pad_id(self):
        """Id of the padding token."""
        return self._sp['pad']

    @property
    def bos_id(self):
        """Id of the beginning-of-sentence token."""
        return self._sp['bos']

    @property
    def eos_id(self):
        """Id of the end-of-sentence token."""
        return self._sp['eos']

    @property
    def vocab_size(self):
        """Number of pieces in the loaded SentencePiece model."""
        return self.sp.get_piece_size()

@dataclass
class MTExample:
    """One parallel sentence pair: source text and target text."""
    src: str
    tgt: str

class MTDataset(Dataset):
    """Map-style dataset that tokenizes sentence pairs on access.

    Args:
        pairs: Indexable collection of objects with ``src`` and ``tgt`` text attributes.
        tok: Tokenizer providing ``encode(text)`` and ``eos_id``.
        max_len: Maximum number of ids kept per sequence (BOS/EOS included).
    """

    def __init__(self, pairs, tok, max_len=128):
        self.pairs = pairs
        self.tok = tok
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def _encode(self, text):
        """Encode ``text`` and clip it to ``max_len`` ids while keeping the final EOS."""
        ids = self.tok.encode(text)
        if len(ids) > self.max_len:
            # A plain slice would cut off the trailing EOS, so the model would
            # never be trained to terminate long sentences; re-append it.
            ids = ids[:max(self.max_len - 1, 0)] + [self.tok.eos_id]
        # Explicit dtype: ids feed nn.Embedding, which needs integer indices.
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, i):
        """Return ``(src_ids, tgt_ids)`` as 1-D ``torch.long`` tensors for pair ``i``."""
        ex = self.pairs[i]
        return self._encode(ex.src), self._encode(ex.tgt)

def collate_fn(batch, pad):
    """Turn a list of ``(src, tgt)`` tensor pairs into two right-padded batch tensors.

    Each side is padded independently with ``pad`` to the longest sequence on
    that side, batch dimension first.
    """
    src_seqs, tgt_seqs = map(list, zip(*batch))

    def pad_to_batch(seqs):
        return nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=pad)

    return pad_to_batch(src_seqs), pad_to_batch(tgt_seqs)

def load_pairs(dataset='multi30k', src_lang='en', tgt_lang='de', split='train'):
    """Load one split of a translation dataset as a list of ``MTExample``.

    Args:
        dataset: ``'multi30k'`` (loads ``bentrevett/multi30k``) or ``'iwslt14'``.
        src_lang: Language code of the source side.
        tgt_lang: Language code of the target side.
        split: Name of the split to index into the loaded dataset dict.

    Raises:
        ValueError: If ``dataset`` is not one of the two supported names.
    """
    if dataset == 'multi30k':
        ds = hfds.load_dataset('bentrevett/multi30k')[split]
        # Honour the requested direction instead of always returning en->de.
        # The previous code read the 'en' and 'de' columns; a language code the
        # dataset has no column for now fails with KeyError rather than being
        # silently ignored.
        return [MTExample(src=x[src_lang], tgt=x[tgt_lang]) for x in ds]
    if dataset == 'iwslt14':
        # NOTE: despite the 'iwslt14' key, the corpus loaded here is 'iwslt2017'.
        ds = hfds.load_dataset('iwslt2017', f'iwslt2017-{src_lang}-{tgt_lang}')[split]
        return [MTExample(src=x['translation'][src_lang], tgt=x['translation'][tgt_lang]) for x in ds]
    raise ValueError('dataset must be multi30k or iwslt14')

def build_loaders(dataset, src_lang, tgt_lang, sp_vocab, batch_size, max_len, work='./work'):
    """Load the three splits, fit the shared tokenizer, and wrap each split in a DataLoader.

    The tokenizer is trained on the source and target texts of the train
    split only and stored under ``<work>/spm``. Only the training loader shuffles.

    Returns:
        ``(train_loader, valid_loader, test_loader, tokenizer)``.
    """
    tr, va, te = (
        load_pairs(dataset, src_lang, tgt_lang, split)
        for split in ('train', 'validation', 'test')
    )

    tok = JointBPETokenizer(os.path.join(work, 'spm'), sp_vocab)
    tok.train([p.src for p in tr] + [p.tgt for p in tr])

    def pad_batch(b):
        return collate_fn(b, tok.pad_id)

    def make_loader(pairs, shuffle):
        return DataLoader(
            MTDataset(pairs, tok, max_len),
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=pad_batch,
        )

    TR = make_loader(tr, True)
    VA = make_loader(va, False)
    TE = make_loader(te, False)
    return TR, VA, TE, tok

In [None]:
# Seq2Seq model with additive attention: encoder, decoder, attention module, and greedy decoding helper.
class AdditiveAttention(nn.Module):
    """Additive attention: ``score = v(tanh(W_h(enc) + W_s(dec)))`` followed by softmax."""

    def __init__(self, enc_dim, dec_dim, attn_dim):
        super().__init__()
        self.W_h = nn.Linear(enc_dim, attn_dim, bias=False)
        self.W_s = nn.Linear(dec_dim, attn_dim, bias=False)
        self.v = nn.Linear(attn_dim, 1, bias=False)

    def forward(self, enc_outs, dec_state, mask=None):
        """Return ``(context, weights)`` for one decoder state.

        ``enc_outs`` must be 3-D (it is fed to ``torch.bmm``); ``dec_state`` is
        projected and broadcast along dim 1. Positions where ``mask == 0`` get
        the most negative representable score before the softmax.
        """
        projected_state = self.W_s(dec_state).unsqueeze(1)
        projected_enc = self.W_h(enc_outs)
        scores = self.v(torch.tanh(projected_enc + projected_state)).squeeze(-1)
        if mask is not None:
            lowest = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask == 0, lowest)
        weights = torch.softmax(scores, dim=-1)
        context = torch.bmm(weights.unsqueeze(1), enc_outs).squeeze(1)
        return context, weights

class Encoder(nn.Module):
    """Embedding + dropout + bidirectional GRU encoder.

    Args:
        V: Vocabulary size.  E: Embedding size.  H: GRU hidden size per direction.
        L: Number of GRU layers.  drop: Dropout probability applied to embeddings.

    ``out_dim`` is ``2 * H`` because the GRU is bidirectional.
    """

    def __init__(self, V, E, H, L=1, drop=0.1):
        super().__init__()
        self.embed = nn.Embedding(V, E, padding_idx=0)
        self.rnn = nn.GRU(E, H, num_layers=L, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(drop)
        self.out_dim = 2 * H

    def forward(self, src):
        """Return the GRU's ``(outputs, final_hidden)`` for the token ids in ``src``."""
        embedded = self.embed(src)
        return self.rnn(self.drop(embedded))

class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        V: Vocabulary size.  E: Embedding size.  enc_dim: Encoder output size.
        H: GRU hidden size.  L: Number of GRU layers.  drop: Embedding dropout.
    """

    def __init__(self, V, E, enc_dim, H, L=1, drop=0.1):
        super().__init__()
        self.embed = nn.Embedding(V, E, padding_idx=0)
        self.attn = AdditiveAttention(enc_dim, H, H)
        self.rnn = nn.GRU(E + enc_dim, H, num_layers=L, batch_first=True)
        self.fc = nn.Linear(H, V)
        self.drop = nn.Dropout(drop)

    def forward(self, y_prev, h, enc_outs, mask):
        """Run one decoding step.

        The attention query is ``h[-1]``; the GRU input is the previous-token
        embedding concatenated with the attention context.

        Returns:
            ``(logits, new_hidden)``.
        """
        token_emb = self.drop(self.embed(y_prev)).unsqueeze(1)
        context, _ = self.attn(enc_outs, h[-1], mask)
        step_input = torch.cat([token_emb, context.unsqueeze(1)], dim=-1)
        step_out, new_h = self.rnn(step_input, h)
        return self.fc(step_out.squeeze(1)), new_h

class Seq2Seq(nn.Module):
    """GRU encoder-decoder with additive attention.

    Args:
        V: Vocabulary size.  E: Embedding size.  H: Hidden size.
        Le / Ld: Number of encoder / decoder GRU layers.  drop: Dropout probability.
    """

    def __init__(self, V, E=256, H=512, Le=1, Ld=1, drop=0.1):
        super().__init__()
        self.enc = Encoder(V, E, H, Le, drop)
        self.dec = Decoder(V, E, self.enc.out_dim, H, Ld, drop)

    def _init_decoder_state(self, h):
        """Build the decoder's initial hidden state from the encoder's final hidden state.

        Takes the last slice ``h[-1:]`` and repeats it once per decoder layer,
        so decoders with more than one layer receive a correctly shaped state.
        With a single decoder layer this is exactly ``h[-1:].contiguous()``.
        """
        return h[-1:].expand(self.dec.rnn.num_layers, -1, -1).contiguous()

    def forward(self, src, tgt, pad, teacher_forcing=0.5, max_len=128):
        """Decode ``tgt.size(1) - 1`` steps and return the stacked logits.

        At every step each sequence independently uses the gold token with
        probability ``teacher_forcing``, else the model's own argmax.
        ``max_len`` is accepted for interface compatibility but not used here.

        Returns:
            Tensor of logits with one slice along dim 1 per predicted position
            (positions 1..T-1 of ``tgt``).
        """
        B, T = tgt.size()
        enc_outs, h = self.enc(src)
        mask = (src != pad).int()
        dec_h = self._init_decoder_state(h)
        y = tgt[:, 0]
        outs = []
        for t in range(1, T):
            logits, dec_h = self.dec(y, dec_h, enc_outs, mask)
            outs.append(logits.unsqueeze(1))
            use_gold = torch.rand(B, device=src.device) < teacher_forcing
            y = torch.where(use_gold, tgt[:, t], torch.argmax(logits, dim=-1))
        return torch.cat(outs, dim=1)

    @torch.no_grad()
    def greedy_decode(self, src, pad, bos, eos, max_len=128):
        """Greedy-decode up to ``max_len`` tokens (BOS included) for each row of ``src``.

        Once a row has emitted ``eos`` it is frozen: every later position is
        filled with ``pad`` instead of whatever the model would keep generating
        while other rows in the batch are still running. Decoding stops as soon
        as every row has finished.

        Returns:
            Long tensor of token ids, one row per source sentence, starting with ``bos``.
        """
        B = src.size(0)
        enc_outs, h = self.enc(src)
        mask = (src != pad).int()
        dec_h = self._init_decoder_state(h)
        y = torch.full((B,), bos, dtype=torch.long, device=src.device)
        finished = torch.zeros(B, dtype=torch.bool, device=src.device)
        outs = [y]
        for _ in range(max_len - 1):
            logits, dec_h = self.dec(y, dec_h, enc_outs, mask)
            # Rows that already produced EOS only ever emit padding afterwards.
            y = torch.argmax(logits, dim=-1).masked_fill(finished, pad)
            outs.append(y)
            finished = finished | (y == eos)
            if finished.all():
                break
        return torch.stack(outs, dim=1)

In [None]:
# Transformer-based MT model: positional encoding, Transformer encoder–decoder, and inference utility.
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch-first sequence tensor.

    Args:
        d_model: Feature size of the inputs (even or odd).
        max_len: Number of positions precomputed; inputs may not be longer than this.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) * (-math.log(10000.0) / d_model))
        angles = pos * div
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # trim the angles; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Buffer: moves with the module across devices but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

class TransformerMT(nn.Module):
    """Encoder-decoder Transformer for translation built on ``nn.Transformer``.

    Args:
        V: Vocabulary size.  d: Model width.  h: Number of attention heads.
        Le / Ld: Number of encoder / decoder layers.  ff: Feed-forward width.
        drop: Dropout probability inside the Transformer.
    """

    def __init__(self, V, d=256, h=4, Le=3, Ld=3, ff=1024, drop=0.1):
        super().__init__()
        self.src_embed = nn.Embedding(V, d, padding_idx=0)
        self.tgt_embed = nn.Embedding(V, d, padding_idx=0)
        self.pos = PositionalEncoding(d)
        self.tf = nn.Transformer(d_model=d, nhead=h, num_encoder_layers=Le, num_decoder_layers=Ld,
                                 dim_feedforward=ff, dropout=drop, batch_first=True)
        self.gen = nn.Linear(d, V)

    def forward(self, src, tgt, pad):
        """Return vocabulary logits for every position of ``tgt``.

        A causal mask is applied over ``tgt`` and positions equal to ``pad``
        are masked as keys on both the source and the target side.
        """
        src_k = (src == pad)
        tgt_k = (tgt == pad)
        tgt_m = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        s = self.pos(self.src_embed(src))
        t = self.pos(self.tgt_embed(tgt))
        out = self.tf(s, t, tgt_mask=tgt_m, src_key_padding_mask=src_k,
                      tgt_key_padding_mask=tgt_k, memory_key_padding_mask=src_k)
        return self.gen(out)

    @torch.no_grad()
    def greedy_decode(self, src, pad, bos, eos, max_len=128):
        """Greedy-decode up to ``max_len`` tokens (BOS included) for each row of ``src``.

        The source is encoded once; the decoder is re-run on the growing prefix
        at every step. Once a row has emitted ``eos`` it is frozen and padded
        with ``pad``, so text generated while waiting for slower rows in the
        batch never ends up in the hypothesis. Decoding stops when all rows
        have finished.

        Returns:
            Long tensor of token ids, one row per source sentence, starting with ``bos``.
        """
        src_k = (src == pad)
        mem = self.tf.encoder(self.pos(self.src_embed(src)), src_key_padding_mask=src_k)
        ys = torch.full((src.size(0), 1), bos, dtype=torch.long, device=src.device)
        finished = torch.zeros(src.size(0), dtype=torch.bool, device=src.device)
        for _ in range(max_len - 1):
            m = nn.Transformer.generate_square_subsequent_mask(ys.size(1)).to(src.device)
            dec = self.tf.decoder(self.pos(self.tgt_embed(ys)), mem, tgt_mask=m,
                                  tgt_key_padding_mask=(ys == pad), memory_key_padding_mask=src_k)
            nxt = self.gen(dec[:, -1:, :]).squeeze(1).argmax(dim=-1)
            # Rows that already produced EOS only ever emit padding afterwards.
            nxt = nxt.masked_fill(finished, pad)
            ys = torch.cat([ys, nxt.unsqueeze(1)], dim=1)
            finished = finished | (nxt == eos)
            if finished.all():
                break
        return ys

In [None]:
# Training / evaluation utilities: BLEU & ROUGE-L metrics, train/validation loops, and shared decode helper.
from rouge_score import rouge_scorer
import sacrebleu

def compute_metrics(hyps, refs):
    """Score hypotheses against references.

    Returns:
        Dict with ``'BLEU'`` (sacreBLEU corpus score) and ``'ROUGE_L'`` (mean
        per-sentence ``rougeLsum`` F-measure, 0.0 when there are no pairs).
    """
    bleu_score = sacrebleu.corpus_bleu(hyps, [refs]).score
    scorer = rouge_scorer.RougeScorer(['rougeLsum'], use_stemmer=True)

    f_measures = []
    for hyp, ref in zip(hyps, refs):
        # rouge_score's argument order is (target, prediction).
        f_measures.append(scorer.score(ref, hyp)['rougeLsum'].fmeasure)

    if f_measures:
        rouge_l = sum(f_measures) / len(f_measures)
    else:
        rouge_l = 0.0
    return {'BLEU': float(bleu_score), 'ROUGE_L': float(rouge_l)}

def train_epoch(model, loader, tok, cfg):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Model to train; called as a seq2seq or a Transformer model
            depending on ``cfg['model']``.
        loader: Iterable of ``(src, tgt)`` batches.
        tok: Tokenizer; only ``pad_id`` is used (ignored in the loss).
        cfg: Reads ``'_opt'`` (optimizer), ``'amp'``, ``'model'``, ``'clip'`` and,
            for seq2seq, ``'teacher_forcing'`` and ``'max_len'``.

    Returns:
        Dict with ``'loss_mean'`` and ``'loss_var'`` over the per-batch losses and
        ``'grad_norm'``, the mean total gradient L2 norm measured after
        unscaling and before clipping.
    """
    model.train()
    ce = nn.CrossEntropyLoss(ignore_index=tok.pad_id)
    losses, grad = LossTracker(), LossTracker()
    opt = cfg['_opt']
    # Mixed precision is only meaningful on CUDA; keep it off elsewhere.
    use_amp = bool(cfg['amp']) and device.type == 'cuda'
    scaler = torch.amp.GradScaler(device.type, enabled=use_amp)
    for src, tgt in tqdm(loader, leave=False):
        src, tgt = src.to(device), tgt.to(device)
        opt.zero_grad(set_to_none=True)
        with torch.autocast(device_type=device.type, enabled=use_amp):
            if cfg['model'] == 'seq2seq':
                logits = model(src, tgt, pad=tok.pad_id, teacher_forcing=cfg['teacher_forcing'], max_len=cfg['max_len'])
            else:
                logits = model(src, tgt[:, :-1], pad=tok.pad_id)
            gold = tgt[:, 1:]
            loss = ce(logits.reshape(-1, logits.size(-1)), gold.reshape(-1))
        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale here. Unscale them
        # first so the clip threshold (and the logged norm) refer to the true
        # gradients; clipping the scaled gradients shrank them to ~clip/scale.
        scaler.unscale_(opt)
        total_norm = float(nn.utils.clip_grad_norm_(model.parameters(), cfg['clip']))
        scaler.step(opt)
        scaler.update()
        losses.update(loss.item())
        # Skip inf/nan norms (AMP overflow steps) so they do not poison the mean.
        if math.isfinite(total_norm):
            grad.update(total_norm)
    return {'loss_mean': losses.mean, 'loss_var': losses.var, 'grad_norm': grad.mean}

@torch.no_grad()
def valid_epoch(model, loader, tok, cfg):
    """Compute the mean per-batch cross-entropy of ``model`` over ``loader`` without gradients.

    For ``cfg['model'] == 'seq2seq'`` the model is run with
    ``teacher_forcing=0.0``; otherwise it receives the target shifted right.
    Padding positions are ignored in the loss.

    Returns:
        ``{'val_loss': mean of the batch losses}``.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index=tok.pad_id)
    losses = LossTracker()
    for src, tgt in tqdm(loader, leave=False):
        src = src.to(device)
        tgt = tgt.to(device)
        if cfg['model'] == 'seq2seq':
            logits = model(src, tgt, pad=tok.pad_id, teacher_forcing=0.0, max_len=cfg['max_len'])
        else:
            logits = model(src, tgt[:, :-1], pad=tok.pad_id)
        gold = tgt[:, 1:]
        flat_logits = logits.reshape(-1, logits.size(-1))
        batch_loss = criterion(flat_logits, gold.reshape(-1))
        losses.update(batch_loss.item())
    return {'val_loss': losses.mean}

@torch.no_grad()
def decode_and_score(model, loader, tok, cfg, max_batches=None):
    """Greedy-decode ``loader`` with ``model`` and score the output against the targets.

    Args:
        max_batches: If truthy, stop after this many batches.

    Returns:
        ``(metrics, first_5_hypotheses, first_5_references)``.
    """
    model.eval()
    hyps = []
    refs = []
    for batch_no, (src, tgt) in enumerate(loader, start=1):
        src = src.to(device)
        tgt = tgt.to(device)
        out = model.greedy_decode(src, tok.pad_id, tok.bos_id, tok.eos_id, cfg['max_len'])
        for hyp_ids, ref_ids in zip(out.tolist(), tgt.tolist()):
            hyps.append(tok.decode(hyp_ids))
            refs.append(tok.decode(ref_ids))
        if max_batches and batch_no >= max_batches:
            break
    metrics = compute_metrics(hyps, refs)
    return metrics, hyps[:5], refs[:5]

In [None]:
# Experiment configuration: dataset choice, tokenization settings, model hyperparameters, and DataLoader construction.
# Shared settings; the per-model cells copy this dict and add their own keys.
cfg = {
    'dataset': 'multi30k',
    'src_lang': 'en',
    'tgt_lang': 'de',
    'sp_vocab': 8000,
    'batch_size': 64,
    'max_len': 128,
    'amp': True,
    'lr': 3e-4,
    'clip': 1.0,
    'teacher_forcing': 0.5,
    'seed': 13,
}
set_seed(cfg['seed'])

# Checkpoints and tokenizer files are written below this directory.
work = './outputs'
os.makedirs(work, exist_ok=True)

TR, VA, TE, tok = build_loaders(
    cfg['dataset'],
    cfg['src_lang'],
    cfg['tgt_lang'],
    cfg['sp_vocab'],
    cfg['batch_size'],
    cfg['max_len'],
    work,
)
print('Vocab:', tok.vocab_size)

README.md: 0.00B [00:00, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

val.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Vocab: 8000


In [None]:
# Train Seq2Seq
# Seq2Seq training loop: instantiate model, optimizer, run epochs, track best validation loss, and save checkpoint.
# Model-specific settings layered on top of the shared config.
cfg_s2s = {
    **cfg,
    'model': 'seq2seq',
    'embed_dim': 256,
    'hidden_dim': 512,
    'enc_layers': 1,
    'dec_layers': 1,
    'dropout': 0.1,
    'epochs': 8,
}
s2s = Seq2Seq(
    tok.vocab_size,
    cfg_s2s['embed_dim'],
    cfg_s2s['hidden_dim'],
    cfg_s2s['enc_layers'],
    cfg_s2s['dec_layers'],
    cfg_s2s['dropout'],
).to(device)
opt = torch.optim.Adam(s2s.parameters(), lr=cfg_s2s['lr'])
cfg_s2s['_opt'] = opt

# Reset the peak-memory counter so mem_mb() reflects this model's training only.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

wall = timeit()
best = 1e9
for ep in range(1, cfg_s2s['epochs'] + 1):
    tr = train_epoch(s2s, TR, tok, cfg_s2s)
    va = valid_epoch(s2s, VA, tok, cfg_s2s)
    # Note: the value printed after "±" is the variance of the batch losses.
    print(f"[Seq2Seq] ep{ep} train={tr['loss_mean']:.3f}±{tr['loss_var']:.3f} grad≈{tr['grad_norm']:.2f} val={va['val_loss']:.3f}")
    # Keep only the checkpoint with the lowest validation loss so far.
    if va['val_loss'] < best:
        best = va['val_loss']
        torch.save(s2s.state_dict(), os.path.join(work, 's2s_best.pt'))

s2s_time = wall()
s2s_mem = mem_mb()
print({'time_s': round(s2s_time, 2), 'mem_mb': s2s_mem, 'params': count_params(s2s)})

  model.train(); ce=nn.CrossEntropyLoss(ignore_index=tok.pad_id); losses,grad=LossTracker(),LossTracker(); opt=cfg['_opt']; scaler=torch.cuda.amp.GradScaler(enabled=cfg['amp'])


  0%|          | 0/454 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=cfg['amp']):


  0%|          | 0/16 [00:00<?, ?it/s]

[Seq2Seq] ep1 train=5.873±0.573 grad≈0.00 val=5.657


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Seq2Seq] ep2 train=5.128±0.031 grad≈0.00 val=5.288


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Seq2Seq] ep3 train=4.782±0.022 grad≈0.00 val=5.045


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Seq2Seq] ep4 train=4.546±0.027 grad≈0.00 val=4.891


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Seq2Seq] ep5 train=4.361±0.027 grad≈0.00 val=4.744


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Seq2Seq] ep6 train=4.209±0.028 grad≈0.00 val=4.644


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Seq2Seq] ep7 train=4.085±0.030 grad≈0.00 val=4.548


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Seq2Seq] ep8 train=3.971±0.028 grad≈0.00 val=4.487
{'time_s': 453.92, 'mem_mb': 1349.58, 'params': 14107968}


In [None]:
# Evaluate Seq2Seq
# Seq2Seq evaluation: reload best checkpoint, decode test set, compute metrics, and log sample translations.
# Restore the best checkpoint. weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state_dict holds, so a tampered
# checkpoint file cannot execute arbitrary code on load.
s2s_state = torch.load(os.path.join(work, 's2s_best.pt'), map_location=device, weights_only=True)
s2s.load_state_dict(s2s_state)

# Decode the full test set and report metrics plus the first five translations.
m_s2s, hy_s2s, rf_s2s = decode_and_score(s2s, TE, tok, cfg_s2s)
print("Seq2Seq:", m_s2s)
for i, (h, r) in enumerate(zip(hy_s2s, rf_s2s), 1):
    print(f"[{i}] HYP: {h}\n    REF: {r}\n")

Seq2Seq: {'BLEU': 18.132342704445286, 'ROUGE_L': 0.501615686529618}
[1] HYP: Ein Mann in einem orangefarbenen Hut, etwas etwas etwas etwas.
    REF: Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.

[2] HYP: Ein Mann, der auf einem auf einem grünen-Bahn-Shirt auf einem weißen..
    REF: Ein Boston Terrier läuft über saftig-grünes Gras vor einem weißen Zaun.

[3] HYP: Ein Mädchen in einer Kleidung mit einem mit einem mit einem vor einem vor..
    REF: Ein Mädchen in einem Karateanzug bricht ein Brett mit einem Tritt.

[4] HYP: Fünf Personen in die und stehen stehen stehen im Schnee, Schnee im Schnee. im Hintergrund.
    REF: Fünf Leute in Winterjacken und mit Helmen stehen im Schnee mit Schneemobilen im Hintergrund.

[5] HYP: Leute gehen die eines eines eines..
    REF: Leute Reparieren das Dach eines Hauses.



In [None]:
# Train Transformer
# Transformer training loop: instantiate Transformer model, optimizer, run epochs, track best validation loss, and save checkpoint.
# Model-specific settings layered on top of the shared config.
cfg_tf = {
    **cfg,
    'model': 'transformer',
    'embed_dim': 256,
    'n_heads': 4,
    'n_layers': 3,
    'ff': 1024,
    'dropout': 0.1,
    'epochs': 8,
}
tf = TransformerMT(
    tok.vocab_size,
    d=cfg_tf['embed_dim'],
    h=cfg_tf['n_heads'],
    Le=cfg_tf['n_layers'],
    Ld=cfg_tf['n_layers'],
    ff=cfg_tf['ff'],
    drop=cfg_tf['dropout'],
).to(device)
opt = torch.optim.Adam(tf.parameters(), lr=cfg_tf['lr'])
cfg_tf['_opt'] = opt

# Reset the peak-memory counter so mem_mb() reflects this model's training only.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()

wall = timeit()
best = 1e9
for ep in range(1, cfg_tf['epochs'] + 1):
    tr = train_epoch(tf, TR, tok, cfg_tf)
    va = valid_epoch(tf, VA, tok, cfg_tf)
    # Note: the value printed after "±" is the variance of the batch losses.
    print(f"[Transformer] ep{ep} train={tr['loss_mean']:.3f}±{tr['loss_var']:.3f} grad≈{tr['grad_norm']:.2f} val={va['val_loss']:.3f}")
    # Keep only the checkpoint with the lowest validation loss so far.
    if va['val_loss'] < best:
        best = va['val_loss']
        torch.save(tf.state_dict(), os.path.join(work, 'tf_best.pt'))

tf_time = wall()
tf_mem = mem_mb()
print({'time_s': round(tf_time, 2), 'mem_mb': tf_mem, 'params': count_params(tf)})

  model.train(); ce=nn.CrossEntropyLoss(ignore_index=tok.pad_id); losses,grad=LossTracker(),LossTracker(); opt=cfg['_opt']; scaler=torch.cuda.amp.GradScaler(enabled=cfg['amp'])


  0%|          | 0/454 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=cfg['amp']):


  0%|          | 0/16 [00:00<?, ?it/s]

  output = torch._nested_tensor_from_mask(


[Transformer] ep1 train=5.478±0.641 grad≈0.00 val=4.781


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Transformer] ep2 train=4.474±0.046 grad≈0.00 val=4.202


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Transformer] ep3 train=4.020±0.038 grad≈0.00 val=3.833


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Transformer] ep4 train=3.688±0.032 grad≈0.00 val=3.551


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Transformer] ep5 train=3.434±0.029 grad≈0.00 val=3.340


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Transformer] ep6 train=3.230±0.025 grad≈0.00 val=3.182


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Transformer] ep7 train=3.049±0.027 grad≈0.00 val=3.040


  0%|          | 0/454 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

[Transformer] ep8 train=2.896±0.026 grad≈0.00 val=2.924
{'time_s': 158.57, 'mem_mb': 974.97, 'params': 11682624}


In [None]:
# Evaluate Transformer
# Transformer evaluation: reload best checkpoint, decode test set, compute metrics, and log sample translations.
# Restore the best checkpoint. weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state_dict holds, so a tampered
# checkpoint file cannot execute arbitrary code on load.
tf_state = torch.load(os.path.join(work, 'tf_best.pt'), map_location=device, weights_only=True)
tf.load_state_dict(tf_state)

# Decode the full test set and report metrics plus the first five translations.
m_tf, hy_tf, rf_tf = decode_and_score(tf, TE, tok, cfg_tf)
print("Transformer:", m_tf)
for i, (h, r) in enumerate(zip(hy_tf, rf_tf), 1):
    print(f"[{i}] HYP: {h}\n    REF: {r}\n")

Transformer: {'BLEU': 3.888792176303914, 'ROUGE_L': 0.511691750011333}
[1] HYP: Ein Mann mit orangefarbenen Hut macht etwas auf etwas...........................................................
    REF: Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.

[2] HYP: Ein paar Rasischer Kleidung läuft auf einem grünen Zaun vor einem weißen Zaun.............................................. auf einem weißen Zaun.. auf einem weißen Zaun....
    REF: Ein Boston Terrier läuft über saftig-grünes Gras vor einem weißen Zaun.

[3] HYP: Ein Mädchen in Uniform wirft einen Stock mit einem Stock...........................................................
    REF: Ein Mädchen in einem Karateanzug bricht ein Brett mit einem Tritt.

[4] HYP: Fünf Personen in Winter und mit Helm stehen im Schnee, im Hintergrund sind Schnee. Schnee. im Hintergrund............ im Hintergrund.. Schnee. im Hintergrund. im Hintergrund... Schnee. Schnee. im Hintergrund. im Hintergrund. Schnee.... im Hintergrund sind im Hin

In [None]:
# Collect experiment summary (metrics, runtime, GPU memory, parameter count) into a pandas DataFrame for comparison.
import pandas as pd

# One row per model: quality metrics first, then training cost and model size.
summary = pd.DataFrame([
    {
        'model': 'Seq2Seq+AddAttn',
        **m_s2s,
        'train_time_s': round(s2s_time, 2),
        'max_gpu_mem_mb': s2s_mem,
        'params': count_params(s2s),
    },
    {
        'model': 'Transformer',
        **m_tf,
        'train_time_s': round(tf_time, 2),
        'max_gpu_mem_mb': tf_mem,
        'params': count_params(tf),
    },
])
summary

Unnamed: 0,model,BLEU,ROUGE_L,train_time_s,max_gpu_mem_mb,params
0,Seq2Seq+AddAttn,18.132343,0.501616,453.92,1349.58,14107968
1,Transformer,3.888792,0.511692,158.57,974.97,11682624
