# Environment setup

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

import sacrebleu

# Report the runtime environment once, then seed every RNG used by the notebook.
has_gpu = torch.cuda.is_available()

print("PyTorch version:", torch.__version__)
print("GPU available:", has_gpu)
if has_gpu:
    print("GPU name:", torch.cuda.get_device_name(0))

SEED = 42
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(SEED)
if has_gpu:
    # Seed the current CUDA device's generator as well.
    torch.cuda.manual_seed(SEED)


# Dataset loading

In [None]:
import kagglehub

# Download the latest version of the Hindi-English parallel corpus via kagglehub.
path = kagglehub.dataset_download("vaibhavkumar11/hindi-english-parallel-corpus")

# NOTE(review): `path` is reassigned to a hard-coded CSV location in the next cell,
# so the value printed here is informational only.
print("Path to dataset files:", path)

In [None]:
# Hard-coded CSV location (Kaggle-style input directory).
# NOTE(review): this overwrites the `path` returned by kagglehub.dataset_download;
# confirm the two locations agree when running outside Kaggle.
path = "/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv"
df = pd.read_csv(path)

print(df.columns)
# In a notebook only the final expression of a cell is displayed, so the
# head() result below is computed but not shown.
df.head()
df.describe()

In [None]:
# Keep only the two text columns, drop incomplete rows and use short column names.
df = (
    df[['english', 'hindi']]
    .dropna()
    .rename(columns={'english': 'en', 'hindi': 'hi'})
)

# Normalise whitespace and case on both sides of each pair.
for _col in ('en', 'hi'):
    df[_col] = df[_col].str.strip().str.lower()

# Remove extremely long sentences (RAM safety): both sides must be < 200 characters.
_short_enough = (df['en'].str.len() < 200) & (df['hi'].str.len() < 200)

# Keep only the first 10,000 remaining examples for smooth training.
df = df[_short_enough].head(10000).reset_index(drop=True)

print(df.head())
print("Total usable pairs:", len(df))

# Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed random_state; each part gets a fresh 0..n-1 index.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df[['en', 'hi']], test_size=0.2, random_state=42)
)

print("Train size:", len(train_df))
print("Validation size:", len(val_df))
train_df.head()


# Tokenizer, vocabulary, encoding, and dataloaders

In [None]:
from collections import Counter

def tokenize(text):
    """Split *text* on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

# Build vocabulary
def build_vocab(sentences, max_size=8000):
    """Build a token -> id mapping from an iterable of sentences.

    Ids 0-3 are reserved for ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``.
    The remaining slots (up to ``max_size`` entries in total) are filled
    with the most frequent tokens, most frequent first.

    A corpus token that is spelled exactly like a special token keeps the
    reserved id instead of overwriting it, so the special ids stay stable.
    """
    counter = Counter(tok for s in sentences for tok in tokenize(s))

    # Reserve 4 special tokens
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}

    for tok, _ in counter.most_common():
        if len(vocab) >= max_size:
            break
        if tok in vocab:
            # Never let corpus text clobber a reserved special-token id.
            continue
        vocab[tok] = len(vocab)

    return vocab

# Build one vocabulary per language from the training split only.
vocab_en, vocab_hi = (build_vocab(train_df[lang]) for lang in ('en', 'hi'))

print("English vocab size:", len(vocab_en))
print("Hindi vocab size:", len(vocab_hi))

# Encoding function
def encode(sentence, vocab):
    """Turn *sentence* into a list of ids wrapped in <sos> ... <eos>.

    Tokens missing from *vocab* are mapped to id 3 (<unk>).
    """
    sos_id, eos_id, unk_id = 1, 2, 3
    body = [vocab.get(tok, unk_id) for tok in tokenize(sentence)]
    return [sos_id, *body, eos_id]

# Padding
def pad_batch(seqs, pad_id=0):
    """Right-pad every id list in *seqs* to the longest length and stack them.

    Returns a tensor with one row per sequence.
    """
    target_len = max(map(len, seqs))
    padded = []
    for seq in seqs:
        filler = [pad_id] * (target_len - len(seq))
        padded.append(seq + filler)
    return torch.tensor(padded)

# Dataset class
class TranslationDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding (english_ids, hindi_ids) pairs.

    Sentences are read from the ``en`` and ``hi`` columns of *df* and are
    encoded lazily, on every ``__getitem__`` call.
    """

    def __init__(self, df, vocab_en, vocab_hi):
        self.en, self.hi = df['en'].tolist(), df['hi'].tolist()
        self.vocab_en, self.vocab_hi = vocab_en, vocab_hi

    def __len__(self):
        return len(self.en)

    def __getitem__(self, idx):
        source_ids = encode(self.en[idx], self.vocab_en)
        target_ids = encode(self.hi[idx], self.vocab_hi)
        return source_ids, target_ids

# Collate for DataLoader
def collate_fn(batch):
    """Pad the source and target id lists of *batch* independently.

    Each element of *batch* is an (en_ids, hi_ids) pair; the result is a
    pair of padded tensors (en_pad, hi_pad), both padded with id 0.
    """
    sources, targets = [], []
    for pair in batch:
        sources.append(pair[0])
        targets.append(pair[1])

    return pad_batch(sources, pad_id=0), pad_batch(targets, pad_id=0)

# DataLoaders: same batch size and collate function, only the training loader shuffles.
_loader_kwargs = {"batch_size": 32, "collate_fn": collate_fn}

train_loader = torch.utils.data.DataLoader(
    TranslationDataset(train_df, vocab_en, vocab_hi),
    shuffle=True,
    **_loader_kwargs,
)

val_loader = torch.utils.data.DataLoader(
    TranslationDataset(val_df, vocab_en, vocab_hi),
    shuffle=False,
    **_loader_kwargs,
)

# Smoke test: look at the shapes of the first training batch only.
for en, hi in train_loader:
    print("English batch shape:", en.shape)
    print("Hindi batch shape:", hi.shape)
    break


# Positional Embeddings(Sinusoidal or learned)

In [None]:
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table of shape (1, max_len, d_model) is precomputed and stored as the
    buffer ``pe``: even feature indices hold sines, odd indices hold cosines.
    Both even and odd ``d_model`` values are supported.
    """

    def __init__(self, d_model, max_len=200):
        super().__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        angles = position * div_term   # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)                     # even indices
        # For odd d_model there is one fewer odd slot than even slots, so
        # trim the angle table to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])  # odd indices

        pe = pe.unsqueeze(0)   # shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        x: (batch, seq_len, d_model); seq_len must not exceed ``max_len``.
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]


# Scaled Dot-Product attention

In [None]:

class ScaledDotProductAttention(nn.Module):
    """Compute softmax(Q K^T / sqrt(d_k)) V with optional masking."""

    def __init__(self, d_k):
        super().__init__()
        self.d_k = d_k

    def forward(self, Q, K, V, mask=None):
        """Return (output, attention_weights).

        Q, K, V: (batch, heads, seq_len, d_k). Positions where ``mask == 0``
        get a score of -1e9 before the softmax, so they receive
        (numerically) zero attention weight.
        """
        scale = math.sqrt(self.d_k)
        raw_scores = torch.matmul(Q, K.transpose(-2, -1)) / scale

        masked_scores = raw_scores
        if mask is not None:
            masked_scores = raw_scores.masked_fill(mask == 0, -1e9)

        weights = torch.softmax(masked_scores, dim=-1)
        return torch.matmul(weights, V), weights


# Multihead attention 

In [None]:
!pip install einops sacrebleu --quiet


In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ScaledDotProductAttention.

    Queries, keys and values are linearly projected, split into
    ``num_heads`` heads of size ``d_model // num_heads``, attended
    independently, merged back and projected with ``W_o``.

    Head splitting and merging use plain tensor reshapes, so the module does
    not depend on ``einops.rearrange`` (which this notebook never imports).
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        self.attention = ScaledDotProductAttention(self.d_k)
        self.W_o = nn.Linear(d_model, d_model)

    def _split_heads(self, x):
        """Reshape (B, L, H * d_k) into (B, H, L, d_k)."""
        batch, length, _ = x.size()
        return x.reshape(batch, length, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, x_q, x_k, x_v, mask=None):
        """Attend from ``x_q`` over ``x_k`` / ``x_v``.

        Args:
            x_q, x_k, x_v: (B, L, d_model) tensors; the key/value length may
                differ from the query length (cross-attention).
            mask: optional mask where 0/False marks blocked positions. A 3-D
                mask (B, Lq, Lk) gets a head axis inserted; a 4-D mask such as
                (B, 1, 1, Lk) or (B, 1, Lq, Lk) is used as-is and broadcasts
                over heads.

        Returns:
            (output, attn): output is (B, Lq, d_model), attn is (B, H, Lq, Lk).
        """
        if mask is not None and mask.dim() == 3:
            mask = mask.unsqueeze(1)   # (B, Lq, Lk) -> (B, 1, Lq, Lk)

        # Linear projections, then split into heads
        Q = self._split_heads(self.W_q(x_q))
        K = self._split_heads(self.W_k(x_k))
        V = self._split_heads(self.W_v(x_v))

        # Apply attention
        context, attn = self.attention(Q, K, V, mask)

        # Merge heads: (B, H, Lq, d_k) -> (B, Lq, H * d_k)
        batch, _, q_len, _ = context.size()
        context = context.transpose(1, 2).reshape(batch, q_len, self.num_heads * self.d_k)
        output = self.W_o(context)

        return output, attn


# Feed-forward neural network

In [None]:
class FeedForwardNetwork(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expand to the hidden width
        self.fc2 = nn.Linear(d_ff, d_model)   # project back to the model width

    def forward(self, x):
        hidden = self.fc1(x)
        activated = torch.relu(hidden)
        return self.fc2(activated)

# Layer norm and residual connections

In [None]:
class ResidualConnection(nn.Module):
    """Post-norm residual wrapper: LayerNorm(x + Dropout(sublayer_output))."""

    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer_output):
        regularised = self.dropout(sublayer_output)
        summed = x + regularised
        return self.norm(summed)

# Encoder Block

In [None]:
# Self attention + FFN
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention, then a feed-forward network.

    Each sub-layer is wrapped in its own ResidualConnection.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        # Sub-layer 1: self-attention
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.res1 = ResidualConnection(d_model, dropout)

        # Sub-layer 2: position-wise feed-forward network
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.res2 = ResidualConnection(d_model, dropout)

    def forward(self, x, mask):
        """Run both sub-layers on ``x``; ``mask`` is handed to self-attention."""
        attended, _unused_weights = self.self_attn(x, x, x, mask)
        after_attention = self.res1(x, attended)

        transformed = self.ffn(after_attention)
        return self.res2(after_attention, transformed)


# Stacked Encoder (N layers)

In [None]:
class Encoder(nn.Module):
    """Token embedding + positional encoding followed by ``num_layers`` EncoderBlocks."""

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, dropout=0.1, max_len=200):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)        # token embeddings
        self.pos_encoding = PositionalEncoding(d_model, max_len)  # positional encodings

        # Stack of identical encoder blocks
        blocks = [
            EncoderBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode token ids ``x`` of shape (batch, seq_len) into (B, L, D) features.

        The same ``mask`` is passed to every block.
        """
        hidden = self.dropout(self.pos_encoding(self.embedding(x)))

        for block in self.layers:
            hidden = block(hidden, mask)

        return hidden


# Decoder block

In [None]:
# Masked self-attention + cross-attention + feed-forward network
class DecoderBlock(nn.Module):
    """One decoder layer made of three residual-wrapped sub-layers.

    1. masked self-attention over the decoder input,
    2. cross-attention with decoder states as queries and the encoder
       output as keys/values,
    3. a position-wise feed-forward network.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()

        # 1) Masked self-attention
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.res1 = ResidualConnection(d_model, dropout)

        # 2) Cross-attention (encoder -> decoder)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.res2 = ResidualConnection(d_model, dropout)

        # 3) Feed-forward network
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.res3 = ResidualConnection(d_model, dropout)

    def forward(self, x, enc_out, trg_mask, src_mask):
        """Return (decoder_states, cross_attention_weights).

        x is the decoder input and enc_out the encoder output; trg_mask is
        used by self-attention and src_mask by cross-attention.
        """
        self_attended, _unused_weights = self.self_attn(x, x, x, trg_mask)
        state = self.res1(x, self_attended)

        cross_attended, cross_weights = self.cross_attn(state, enc_out, enc_out, src_mask)
        state = self.res2(state, cross_attended)

        state = self.res3(state, self.ffn(state))

        return state, cross_weights


# Stacked Decoder

In [None]:
class Decoder(nn.Module):
    """Embedding + positional encoding, ``num_layers`` DecoderBlocks, final LayerNorm."""

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, dropout=0.1, max_len=200):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.dropout = nn.Dropout(dropout)

        blocks = [
            DecoderBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ]
        self.layers = nn.ModuleList(blocks)

        self.final_norm = nn.LayerNorm(d_model)

    def forward(self, x, enc_out, trg_mask, src_mask):
        """Decode target ids ``x`` against the encoder output ``enc_out``.

        Returns the normalised decoder states and a list holding the
        cross-attention weights of every layer, first layer first.
        """
        hidden = self.dropout(self.pos_encoding(self.embedding(x)))

        attn_maps = []   # cross-attention weights, one entry per layer
        for block in self.layers:
            hidden, cross_weights = block(hidden, enc_out, trg_mask, src_mask)
            attn_maps.append(cross_weights)

        return self.final_norm(hidden), attn_maps


# Complete Transformer Model

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer with a linear projection onto the target vocabulary."""

    def __init__(
        self,
        src_vocab_size,   # English vocab
        trg_vocab_size,   # Hindi vocab
        d_model=256,
        num_heads=4,
        num_layers=3,
        d_ff=512,
        dropout=0.1,
        max_len=200
    ):
        super().__init__()

        # Encoder and decoder share every hyperparameter except the vocabulary size.
        shared = dict(
            d_model=d_model,
            num_heads=num_heads,
            d_ff=d_ff,
            num_layers=num_layers,
            dropout=dropout,
            max_len=max_len,
        )
        self.encoder = Encoder(vocab_size=src_vocab_size, **shared)
        self.decoder = Decoder(vocab_size=trg_vocab_size, **shared)

        # Final projection to vocab size
        self.output_layer = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg, src_mask, trg_mask):
        """Return (logits, per_layer_cross_attention) for source ``src`` and target ``trg``."""
        memory = self.encoder(src, src_mask)
        decoded, attn_weights = self.decoder(trg, memory, trg_mask, src_mask)
        return self.output_layer(decoded), attn_weights


# Masking 

In [None]:
def create_padding_mask(seq, pad_id=0):
    """Return a boolean mask of shape (B, 1, 1, L): True for real tokens, False for padding."""
    is_token = seq != pad_id              # (B, L)
    return is_token[:, None, None, :]     # add two broadcast axes

def create_subsequent_mask(size):
    """Return a (size, size) boolean lower-triangular mask.

    Entry [i, j] is True when j <= i, which prevents attending to future tokens.
    """
    positions = torch.arange(size)
    return positions.unsqueeze(1) >= positions.unsqueeze(0)

def create_decoder_mask(trg, pad_id=0):
    """Combine the padding mask and the future mask for decoder self-attention.

    The result broadcasts to (B, 1, L, L): a position is True only when the
    key is a real token and does not lie in the query's future.
    """
    length = trg.size(1)

    # future mask, moved to the target's device: (L, L) -> (1, 1, L, L)
    causal = create_subsequent_mask(length).to(trg.device)[None, None, :, :]

    # padding mask: (B, 1, 1, L)
    not_padding = create_padding_mask(trg, pad_id)

    return not_padding & causal


# Loss Function

In [None]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy with label smoothing that ignores padding targets.

    For every non-ignored target the reference distribution puts
    ``1 - label_smoothing`` on the gold token and spreads
    ``label_smoothing`` uniformly over the other ``trg_vocab_size - 1``
    entries. The loss is the mean over non-ignored positions.

    If a batch contains no non-ignored target at all, a zero loss that is
    still attached to the graph is returned instead of NaN (the mean of an
    empty tensor).
    """

    def __init__(self, label_smoothing, trg_vocab_size, ignore_index=0):
        super().__init__()
        self.trg_vocab_size = trg_vocab_size
        # Legacy attribute name kept for backward compatibility; it has always
        # held the *target* vocabulary size.
        self.src_vocab = trg_vocab_size
        self.ignore_index = ignore_index
        self.smoothing = label_smoothing
        self.confidence = 1.0 - label_smoothing

    def forward(self, pred, target):
        """Return the scalar smoothed loss.

        pred: (B, L, vocab) logits; target: (B, L) token ids.
        """
        pred = pred.reshape(-1, pred.size(-1))
        target = target.reshape(-1)

        # ignore padding
        keep = target != self.ignore_index
        pred = pred[keep]
        target = target[keep]

        if target.numel() == 0:
            # Nothing to score: return 0 while keeping a grad_fn so that
            # loss.backward() in the training loop still works.
            return pred.sum() * 0.0

        log_preds = F.log_softmax(pred, dim=-1)

        # label smoothing
        with torch.no_grad():
            true_dist = torch.full_like(
                log_preds, self.smoothing / (self.trg_vocab_size - 1)
            )
            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)

        return torch.sum(-true_dist * log_preds, dim=1).mean()


In [None]:
# Prefer the GPU when one is available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Model sized to the two vocabularies built earlier.
model = Transformer(
    src_vocab_size=len(vocab_en),
    trg_vocab_size=len(vocab_hi),
    d_model=384,
    num_heads=6,
    num_layers=4,
    d_ff=768,
    dropout=0.1,
)
model = model.to(device)

# Adam with a step decay: the learning rate is multiplied by 0.8 every 2 scheduler steps.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8)

# Label-smoothed cross-entropy that skips the padding id (0).
loss_fn = LabelSmoothingLoss(
    label_smoothing=0.05,
    trg_vocab_size=len(vocab_hi),
    ignore_index=0,
)

print("Model initialized")


# Training loop

In [None]:
# NOTE: train_epoch and evaluate are defined in later cells of this notebook;
# those cells must be executed before this one.
NUM_EPOCHS = 5

train_losses = []
val_losses = []

for epoch in range(1, NUM_EPOCHS + 1):
    # Pass the selected device explicitly: the helpers default to "cuda",
    # which fails on a CPU-only machine.
    train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device=device)
    val_loss = evaluate(model, val_loader, loss_fn, device=device)

    # Step the learning-rate schedule once per epoch.
    scheduler.step()

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")


# Training loop via teacher forcing

In [None]:
# Sanity check: dataset/split sizes and the shape of one training batch.
print("Dataset size:", len(df))
print("Train size:", len(train_df))
print("Val size:", len(val_df))

print("Number of batches in train_loader:", len(train_loader))
# Only the first batch is inspected; the loop exits immediately afterwards.
for i, (src, trg) in enumerate(train_loader):
    print("First batch shapes:", src.shape, trg.shape)
    break


In [None]:
def train_epoch(model, train_loader, optimizer, loss_fn, pad_id=0, device=None):
    """Run one teacher-forced training epoch and return the mean batch loss.

    Args:
        model: called as ``model(src, trg_input, src_mask, trg_mask)`` and
            expected to return ``(logits, attention)``.
        train_loader: iterable of ``(src, trg)`` id tensors of shape (B, L).
        optimizer: optimizer stepped once per batch.
        loss_fn: called as ``loss_fn(logits, trg_output)``.
        pad_id: padding id used to build the masks.
        device: device the batches are moved to. ``None`` (default) uses the
            device of the model's parameters, so the function also works on
            CPU-only machines.

    Returns:
        Mean loss over the batches, or 0.0 when the loader yields no batch.
    """
    if device is None:
        device = next(model.parameters()).device

    model.train()
    total_loss = 0.0
    num_batches = 0

    for src, trg in train_loader:
        src, trg = src.to(device), trg.to(device)

        # Shift target for teacher forcing: the decoder sees tokens [0, L-1)
        # and is trained to predict tokens [1, L).
        trg_input = trg[:, :-1]
        trg_output = trg[:, 1:]

        # Masks are built from tensors already on `device`; building the
        # decoder mask from trg_input directly yields the (L-1, L-1) mask.
        src_mask = create_padding_mask(src, pad_id)
        trg_mask = create_decoder_mask(trg_input, pad_id)

        # Forward pass
        logits, _ = model(src, trg_input, src_mask, trg_mask)
        loss = loss_fn(logits, trg_output)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else 0.0


# Validation Loop

In [None]:
@torch.no_grad()
def evaluate(model, val_loader, loss_fn, pad_id=0, device=None):
    """Compute the mean teacher-forced loss over ``val_loader`` without gradients.

    Args:
        model: called as ``model(src, trg_input, src_mask, trg_mask)`` and
            expected to return ``(logits, attention)``.
        val_loader: iterable of ``(src, trg)`` id tensors of shape (B, L).
        loss_fn: called as ``loss_fn(logits, trg_output)``.
        pad_id: padding id used to build the masks.
        device: device the batches are moved to. ``None`` (default) uses the
            device of the model's parameters.

    Returns:
        Mean loss over the batches, or 0.0 when the loader yields no batch.
    """
    if device is None:
        device = next(model.parameters()).device

    model.eval()
    total_loss = 0.0
    num_batches = 0

    for src, trg in val_loader:
        src, trg = src.to(device), trg.to(device)

        # Same input/output shift as in training.
        trg_input = trg[:, :-1]
        trg_output = trg[:, 1:]

        # masks (already on `device` because their inputs are)
        src_mask = create_padding_mask(src, pad_id)
        trg_mask = create_decoder_mask(trg_input, pad_id)

        logits, _ = model(src, trg_input, src_mask, trg_mask)
        loss = loss_fn(logits, trg_output)

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else 0.0


# Greedy decoding and beam search

### Greedy decoding

In [None]:
@torch.no_grad()
def greedy_decode(model, src, src_mask, max_len, start_id=1, end_id=2, pad_id=0):
    """Greedily decode a batch of sources, one token per step.

    Every sequence starts with ``start_id``. At each step the arg-max token
    of the last position is appended. Once a sequence has produced
    ``end_id`` it is finished: it is padded with ``pad_id`` from then on
    instead of continuing to generate tokens. Decoding stops when all
    sequences are finished or after ``max_len`` steps.

    Returns:
        LongTensor of shape (batch, <= max_len + 1) on ``src.device``.
    """
    device = src.device
    batch_size = src.size(0)

    # Start with <sos>
    trg_seq = torch.full((batch_size, 1), start_id, dtype=torch.long, device=device)
    finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

    for _ in range(max_len):
        trg_mask = create_decoder_mask(trg_seq, pad_id)

        logits, _ = model(src, trg_seq, src_mask, trg_mask)
        next_token = logits[:, -1, :].argmax(dim=-1)

        # Rows that already emitted <eos> only receive padding.
        next_token = next_token.masked_fill(finished, pad_id)
        trg_seq = torch.cat([trg_seq, next_token.unsqueeze(1)], dim=1)

        finished |= (next_token == end_id)
        if finished.all():
            break

    return trg_seq


### Beam search

In [None]:
import heapq

@torch.no_grad()
def beam_search(model, src, src_mask, beam_width=3, max_len=40, start_id=1, end_id=2):
    """Beam-search decode a single source sentence.

    Hypotheses are scored by the sum of token log-probabilities. A
    hypothesis that has produced ``end_id`` is finished: it is carried over
    unchanged and competes with the live hypotheses, instead of being
    extended with tokens after ``<eos>``. The search stops when every kept
    hypothesis is finished or after ``max_len`` steps.

    Args:
        model: called as ``model(src, trg, src_mask, trg_mask)``.
        src: source ids of shape (1, L).
        src_mask: padding mask for ``src``.
        beam_width: number of hypotheses kept after each step.
        max_len: maximum number of generated tokens.
        start_id, end_id: ids of the start and end tokens.

    Returns:
        1-D LongTensor (on CPU) with the best sequence, including
        ``start_id`` and, if it was produced, ``end_id``.
    """
    device = src.device

    def is_finished(seq):
        # len > 1 so the initial [start_id] never counts as finished.
        return len(seq) > 1 and seq[-1] == end_id

    sequences = [([start_id], 0.0)]  # (sequence, score)

    for _ in range(max_len):
        all_candidates = []

        for seq, score in sequences:
            if is_finished(seq):
                all_candidates.append((seq, score))
                continue

            trg_tensor = torch.tensor(seq, dtype=torch.long, device=device).unsqueeze(0)
            trg_mask = create_decoder_mask(trg_tensor)

            logits, _ = model(src, trg_tensor, src_mask, trg_mask)
            log_probs = F.log_softmax(logits[:, -1, :], dim=-1)

            # topk cannot return more entries than the vocabulary holds.
            k = min(beam_width, log_probs.size(-1))
            top_scores, top_tokens = torch.topk(log_probs, k)
            for token, token_score in zip(top_tokens[0].tolist(), top_scores[0].tolist()):
                all_candidates.append((seq + [token], score + token_score))

        # Select best k sequences
        all_candidates.sort(key=lambda cand: cand[1], reverse=True)
        sequences = all_candidates[:beam_width]

        if all(is_finished(seq) for seq, _ in sequences):
            break

    best_seq = sequences[0][0]
    return torch.tensor(best_seq)


# BLEU and show sample translations

In [None]:
# Convert ids -> tokens -> sentence
def decode_ids(ids, vocab):
    """Turn a sequence of ids back into a space-joined sentence.

    Decoding stops at the first ``<eos>``; ``<pad>`` and ``<sos>`` are
    dropped, and ids that are not in *vocab* are skipped.
    """
    id_to_token = dict(zip(vocab.values(), vocab.keys()))
    silent = ("<pad>", "<sos>")

    words = []
    for token_id in ids:
        if token_id not in id_to_token:
            continue
        word = id_to_token[token_id]
        if word == "<eos>":
            break
        if word in silent:
            continue
        words.append(word)
    return " ".join(words)


In [None]:
# Intended cap on the number of validation samples scored for BLEU.
# NOTE(review): neither MAX_EVAL nor count is referenced by any other code
# visible in this notebook.
MAX_EVAL = 500   # evaluate at most 500 samples

count = 0


In [None]:
from tqdm import tqdm

@torch.no_grad()
def compute_bleu(model, val_loader, vocab_en, vocab_hi, pad_id=0, device=None, max_samples=None):
    """Translate validation sentences with beam search and return corpus BLEU.

    Args:
        model: trained translation model; switched to eval mode.
        val_loader: iterable of ``(src, trg)`` id tensors of shape (B, L).
        vocab_en: source vocabulary (unused, kept for interface compatibility).
        vocab_hi: target vocabulary used to turn ids back into text.
        pad_id: padding id used for the source mask.
        device: device the source batch is moved to. ``None`` (default) uses
            the device of the model's parameters.
        max_samples: optional cap on the number of sentences evaluated;
            ``None`` evaluates the whole loader.

    Returns:
        The sacreBLEU corpus score, or 0.0 if no sentence was evaluated.
    """
    if device is None:
        device = next(model.parameters()).device

    model.eval()

    references = []
    hypotheses = []

    def budget_spent():
        return max_samples is not None and len(hypotheses) >= max_samples

    for src, trg in tqdm(val_loader, desc="Computing BLEU"):
        if budget_spent():
            break
        src = src.to(device)

        # beam_search handles one sentence at a time
        for src_row, trg_row in zip(src, trg):
            if budget_spent():
                break

            src_b = src_row.unsqueeze(0)
            src_mask = create_padding_mask(src_b, pad_id)

            # Beam search with width=3
            pred_ids = beam_search(model, src_b, src_mask, beam_width=3)

            # decode predicted + target
            hypotheses.append(decode_ids(pred_ids.tolist(), vocab_hi))
            references.append(decode_ids(trg_row.tolist(), vocab_hi))

    if not hypotheses:
        return 0.0

    # sacreBLEU expects a list of reference streams; there is one reference
    # per hypothesis here, hence a single stream.
    bleu = sacrebleu.corpus_bleu(hypotheses, [references])
    return bleu.score


In [None]:
# Score the validation loader with beam search; the result is printed here
# and stored again in the metrics dictionary further down.
bleu_score = compute_bleu(model, val_loader, vocab_en, vocab_hi)
print("BLEU:", bleu_score)


# Visualizations

## Train vs validation loss curves

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch training and validation losses on one figure.
plt.figure(figsize=(8,5))
plt.plot(train_losses, label="Train Loss")
plt.plot(val_losses, label="Val Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Transformer Training Curve")
plt.legend()
plt.grid()
# Save before show(): loss_curve.png is one of the files zipped at the end of the notebook.
plt.savefig("loss_curve.png", dpi=200)
plt.show()


# Sample translations for GitHub

In [None]:
# Translate the first few validation pairs and store them for display.
# The prediction is produced inline with beam_search because no
# translate_sentence helper is defined anywhere in this notebook.
model.eval()
samples = []

for i in range(min(5, len(val_loader.dataset))):   # take up to 5 examples
    src_ids, trg_ids = val_loader.dataset[i]

    # English source text (special tokens removed)
    eng = decode_ids(src_ids, vocab_en)

    # Predict Hindi: a batch of one sentence on the model's device
    src_tensor = torch.tensor(src_ids).unsqueeze(0).to(device)
    src_mask = create_padding_mask(src_tensor, 0)
    pred_ids = beam_search(model, src_tensor, src_mask, beam_width=3)
    hin_pred = decode_ids(pred_ids.tolist(), vocab_hi)

    # True Hindi
    hin_true = decode_ids(trg_ids, vocab_hi)

    samples.append((eng, hin_pred, hin_true))

import pandas as pd
df_samples = pd.DataFrame(samples, columns=["English", "Predicted Hindi", "Actual Hindi"])
df_samples.to_csv("sample_translations.csv", index=False)

df_samples


# Attention Heatmaps

In [None]:
@torch.no_grad()
def get_attention(model, src, trg):
    """Return the last decoder layer's cross-attention for a teacher-forced pass.

    Uses the module-level ``device``; the decoder is fed ``trg`` without its
    final position.
    """
    model.eval()
    src = src.to(device)
    trg = trg.to(device)

    enc_mask = create_padding_mask(src, 0).to(device)
    full_dec_mask = create_decoder_mask(trg, 0).to(device)

    dec_input = trg[:, :-1]
    dec_mask = full_dec_mask[:, :, :-1, :-1]   # crop to the shortened input

    _logits, per_layer_attn = model(src, dec_input, enc_mask, dec_mask)
    return per_layer_attn[-1]  # last decoder layer


In [None]:
import seaborn as sns

def plot_cross_attention(attn, src_tokens, trg_tokens, head=0):
    """Draw one head's cross-attention for the first batch element as a heatmap.

    Rows are labelled with ``trg_tokens`` and columns with ``src_tokens``.
    The figure is saved to attention_heatmap.png and then shown.
    """
    weights = attn[0, head].cpu().numpy()

    plt.figure(figsize=(10,6))
    sns.heatmap(
        weights,
        cmap="viridis",
        xticklabels=src_tokens,
        yticklabels=trg_tokens,
    )
    plt.xlabel("Source Tokens")
    plt.ylabel("Target Tokens")
    plt.title(f"Cross Attention Heatmap (Head {head})")
    plt.tight_layout()
    plt.savefig("attention_heatmap.png", dpi=200)
    plt.show()

src, trg = next(iter(val_loader))
attn = get_attention(model, src, trg)   # (B, heads, trg_len - 1, src_len)

# Label every row/column of the attention matrix with the token that is
# actually at that position (special tokens included), and crop away the
# padded positions so labels and matrix dimensions match.
inv_vocab_en = {idx: tok for tok, idx in vocab_en.items()}
inv_vocab_hi = {idx: tok for tok, idx in vocab_hi.items()}

src_ids = [i for i in src[0].tolist() if i != 0]
trg_in_ids = [i for i in trg[0, :-1].tolist() if i != 0]   # decoder input = trg without last position

src_tokens = [inv_vocab_en[i] for i in src_ids]
trg_tokens = [inv_vocab_hi[i] for i in trg_in_ids]

# Padding is always trailing, so the first len(...) positions are the real tokens.
attn_cropped = attn[:, :, :len(trg_tokens), :len(src_tokens)]

plot_cross_attention(attn_cropped, src_tokens, trg_tokens)


# Saving everything

In [None]:
import json

# Persist both vocabularies as JSON, the model weights, and the metrics table.
for _vocab_file, _vocab_map in (("vocab_en.json", vocab_en), ("vocab_hi.json", vocab_hi)):
    with open(_vocab_file, "w") as f:
        json.dump(_vocab_map, f)

torch.save(model.state_dict(), "transformer_model.pth")

metrics = {
    "train_losses": train_losses,
    "val_losses": val_losses,
    "bleu_score": bleu_score,
}
metrics_frame = pd.DataFrame(metrics)
metrics_frame.to_csv("metrics.csv", index=False)


In [None]:
import zipfile
import os

# Artifacts produced by the earlier cells; files that are missing are skipped.
files_to_zip = [
    "transformer_model.pth",
    "vocab_en.json",
    "vocab_hi.json",
    "metrics.csv",
    "loss_curve.png",
    "attention_heatmap.png",
    "sample_translations.csv",
]

with zipfile.ZipFile("transformer_project_outputs.zip", "w") as zipf:
    for f in filter(os.path.exists, files_to_zip):
        zipf.write(f)

print("ZIP created: transformer_project_outputs.zip")
