<a href="https://colab.research.google.com/github/AlperYildirim1/Pay-Attention-Later/blob/main/ISMR_test_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q torchmetrics sacrebleu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import gc
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
from torchmetrics.text import BLEUScore
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from datasets import load_dataset
from torch.utils.data import DataLoader
from typing import List

# ==============================================================================
# --- 1. GLOBAL CONFIGURATIONS ---
# ==============================================================================
print("--- Loading Global Configurations ---")

# Where checkpoints live and which hub repositories provide data / tokenizer.
DRIVE_BASE_PATH = "/content/drive/MyDrive"
ORIGINAL_BUCKETED_REPO_ID = "Yujivus/multi30k-de-en-bucketed-w4"
MODEL_CHECKPOINT = "Helsinki-NLP/opus-mt-de-en"  # only the tokenizer is taken from this checkpoint

# Architecture hyperparameters. They have to mirror the values used when the
# checkpoints were trained, otherwise the saved weights will not fit the model.
MAX_LENGTH = 128
D_MODEL = 512
NUM_HEADS = 8
D_FF = 4 * D_MODEL  # 2048
DROPOUT = 0.1

# Evaluation-only settings.
EVAL_BATCH_SIZE = 64
SEED_FOR_DATALOADER = 115  # any fixed value works; it only has to stay the same between runs

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# ==============================================================================
# --- 2. DATA LOADER SETUP ---
# ==============================================================================

def seed_worker(worker_id):
    """Seed NumPy and Python's ``random`` from the current torch seed.

    Intended as a ``worker_init_fn`` for ``DataLoader``. ``worker_id`` is
    accepted to satisfy that callback signature but is not used; the seed is
    taken from ``torch.initial_seed()`` reduced to 32 bits.
    """
    # Keep only the low 32 bits of torch's seed before handing it on.
    seed32 = torch.initial_seed() & 0xFFFFFFFF
    for apply_seed in (np.random.seed, random.seed):
        apply_seed(seed32)

print("--- Initializing Tokenizer and DataLoaders ---")
# Tokenizer: defines the vocabulary size the models are built with.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
VOCAB_SIZE = len(tokenizer)
print(f"Vocab size: {VOCAB_SIZE}")

# Load the original datasets (only the 'test' split is consumed later).
# The name is bound to None up front so that a failed load can never leave it
# undefined, nor silently reuse a value left over from an earlier run of this
# notebook cell. The failure is reported here and surfaces again as soon as
# the 'test' split is requested.
original_datasets = None
try:
    original_datasets = load_dataset(ORIGINAL_BUCKETED_REPO_ID)
except Exception as e:
    print(f"CRITICAL: Could not load dataset. Check HF connection or path: {e}")

# Collator used to assemble dataset rows into batches.
standard_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Dedicated, explicitly seeded generator handed to the DataLoader.
g = torch.Generator()
g.manual_seed(SEED_FOR_DATALOADER)

def setup_test_dataloader():
    """Build and return the ``DataLoader`` over the 'test' split.

    Uses the module-level dataset, collator, batch size and seeded generator.
    Any failure is printed and then re-raised unchanged.
    """
    print(f"Loading 'test' split from: {ORIGINAL_BUCKETED_REPO_ID}")
    try:
        test_split = original_datasets["test"]
        loader_options = dict(
            batch_size=EVAL_BATCH_SIZE,
            collate_fn=standard_collator,
            num_workers=0,  # batches are produced in the main process
            pin_memory=True,
            worker_init_fn=seed_worker,
            generator=g,
        )
        loader = DataLoader(test_split, **loader_options)
        print("Test dataloader created successfully.")
    except Exception as e:
        print(f"!!! ERROR: Failed to create test dataloader. {e}")
        raise
    return loader

# ==============================================================================
# --- 3. MODEL ARCHITECTURE (Must match training) ---
# ==============================================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to input embeddings.

    A table of shape ``[1, max_len, d_model]`` is precomputed and stored as the
    buffer ``pe``: even feature columns hold sines and odd columns hold cosines
    of ``position * div_term``.

    Args:
        d_model: Size of the feature dimension. Both even and odd sizes are
            supported.
        max_len: Number of positions to precompute.
    """
    def __init__(self, d_model: int, max_len: int = 5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine
        # columns, so the surplus frequency is dropped on the cosine side.
        pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``[batch_size, seq_len, d_model]``.
        """
        return x + self.pe[:, :x.size(1)]

class FeedForward(nn.Module):
    """Position-wise MLP: Linear(d_model -> dff), ReLU, Linear(dff -> d_model), Dropout."""

    def __init__(self, d_model: int, dff: int, dropout_rate: float = 0.1):
        super().__init__()
        # Layers are created in application order so that the indices inside
        # ``self.ffn`` (and therefore the state_dict keys) stay 0..3.
        stages = [
            nn.Linear(d_model, dff),
            nn.ReLU(),
            nn.Linear(dff, d_model),
            nn.Dropout(dropout_rate),
        ]
        self.ffn = nn.Sequential(*stages)

    def forward(self, x: torch.Tensor):
        """Run ``x`` through the stacked layers and return the result."""
        projected = self.ffn(x)
        return projected

class StandardTransformer(nn.Module):
    """Encoder-decoder Transformer with a shared embedding / output projection.

    Source and target tokens use the same embedding table, and the output
    projection's weight is tied to that table. Encoder and decoder layers are
    the stock PyTorch layers configured with ``batch_first=True`` and
    ``norm_first=True``.

    The padding and end-of-sequence token ids are read from the module-level
    ``tokenizer`` in ``create_masks`` and ``generate``.

    Args:
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        num_heads: Attention heads per layer.
        d_model: Embedding / hidden size.
        dff: Hidden size of the per-layer feed-forward network.
        vocab_size: Size of the shared vocabulary.
        max_length: Number of positions precomputed by the positional encoding.
        dropout: Dropout probability used on the embeddings and inside the layers.
    """
    def __init__(self, num_encoder_layers, num_decoder_layers, num_heads, d_model, dff, vocab_size, max_length, dropout):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_length)
        self.dropout = nn.Dropout(dropout)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, num_heads, dff, dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model, num_heads, dff, dropout, batch_first=True, norm_first=True
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)

        self.final_linear = nn.Linear(d_model, vocab_size)
        # Weight tying: the output projection reuses the embedding matrix.
        self.final_linear.weight = self.embedding.weight

    def forward(self, src, tgt, src_padding_mask, tgt_padding_mask, memory_key_padding_mask, tgt_mask):
        """Return vocabulary logits for every target position.

        Args:
            src: Source token ids, ``[batch, src_len]``.
            tgt: Target-side input token ids, ``[batch, tgt_len]``.
            src_padding_mask: Key-padding mask passed to the encoder.
            tgt_padding_mask: Key-padding mask for decoder self-attention.
            memory_key_padding_mask: Key-padding mask for attention over the
                encoder output.
            tgt_mask: Attention mask for decoder self-attention.

        Returns:
            The decoder output projected by ``final_linear``.
        """
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        src_emb = self.embedding(src) * math.sqrt(self.d_model)
        tgt_emb = self.embedding(tgt) * math.sqrt(self.d_model)
        src_emb_pos = self.dropout(self.pos_encoder(src_emb))
        tgt_emb_pos = self.dropout(self.pos_encoder(tgt_emb))

        memory = self.encoder(src_emb_pos, src_key_padding_mask=src_padding_mask)
        decoder_output = self.decoder(
            tgt=tgt_emb_pos, memory=memory, tgt_mask=tgt_mask,
            tgt_key_padding_mask=tgt_padding_mask, memory_key_padding_mask=memory_key_padding_mask
        )
        return self.final_linear(decoder_output)

    def create_masks(self, src, tgt):
        """Build the masks expected by ``forward``.

        Returns:
            ``(src_padding_mask, tgt_padding_mask, memory_key_padding_mask,
            tgt_mask)``. The padding masks are True where the token equals the
            tokenizer's pad id; the memory mask is the source padding mask
            itself.
        """
        src_padding_mask = (src == tokenizer.pad_token_id)
        tgt_padding_mask = (tgt == tokenizer.pad_token_id)
        # Square causal mask for the decoder so that a position cannot attend
        # to later positions of the target sequence.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(
            sz=tgt.size(1),
            device=src.device,
            dtype=torch.bool
        )
        return src_padding_mask, tgt_padding_mask, src_padding_mask, tgt_mask

    @torch.no_grad()
    def generate(self, src: torch.Tensor, max_length: int, num_beams: int = 5) -> torch.Tensor:
        """Decode ``src`` with beam search and return the best beam per example.

        Decoding starts from the tokenizer's pad id. The module is switched to
        eval mode while decoding and its previous train/eval mode is restored
        before returning.

        Args:
            src: Source token ids, ``[batch, src_len]``.
            max_length: Upper bound on the returned sequence length, counting
                the initial start token.
            num_beams: Number of hypotheses kept per example.

        Returns:
            Token ids of shape ``[batch, generated_len]``. Each row begins with
            the start (pad) token; a hypothesis that finished before the
            others is filled with EOS tokens up to the common length.
        """
        was_training = self.training
        self.eval()

        pad_id = tokenizer.pad_token_id
        eos_id = tokenizer.eos_token_id
        device = src.device
        batch_size = src.shape[0]

        # Encode once, then replicate the memory for every beam.
        src_padding_mask = (src == pad_id)
        src_emb = self.embedding(src) * math.sqrt(self.d_model)
        src_emb_pos = self.pos_encoder(src_emb)
        memory = self.encoder(self.dropout(src_emb_pos), src_key_padding_mask=src_padding_mask)
        memory = memory.repeat_interleave(num_beams, dim=0)
        memory_key_padding_mask = src_padding_mask.repeat_interleave(num_beams, dim=0)

        beams = torch.full((batch_size * num_beams, 1), pad_id, dtype=torch.long, device=device)
        beam_scores = torch.zeros(batch_size * num_beams, device=device)
        finished_beams = torch.zeros(batch_size * num_beams, dtype=torch.bool, device=device)
        # Offset of each example's first beam inside the flattened beam dimension.
        batch_offsets = torch.arange(batch_size, device=device).unsqueeze(1) * num_beams

        for step in range(max_length - 1):
            if finished_beams.all():
                break
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(beams.size(1)).to(device)
            tgt_emb = self.embedding(beams) * math.sqrt(self.d_model)
            tgt_emb_pos = self.pos_encoder(tgt_emb)
            decoder_output = self.decoder(
                tgt=self.dropout(tgt_emb_pos), memory=memory, tgt_mask=tgt_mask,
                memory_key_padding_mask=memory_key_padding_mask
            )
            logits = self.final_linear(decoder_output[:, -1, :])
            log_probs = F.log_softmax(logits, dim=-1)
            vocab_size = log_probs.shape[-1]

            # The pad token doubles as the start token and must never be generated.
            log_probs[:, pad_id] = -torch.inf
            if finished_beams.any():
                # A finished hypothesis may only be continued with EOS, at no
                # cost. Without blocking every other token it could spawn
                # continuations that place real tokens after its EOS.
                log_probs[finished_beams] = -torch.inf
                log_probs[finished_beams, eos_id] = 0

            total_scores = beam_scores.unsqueeze(1) + log_probs
            if step == 0:
                # All beams are identical at the first step, so only the first
                # beam of each example is allowed to expand.
                total_scores = total_scores.view(batch_size, num_beams, -1)
                total_scores[:, 1:, :] = -torch.inf

            total_scores = total_scores.view(batch_size, -1)
            top_scores, top_indices = torch.topk(total_scores, k=num_beams, dim=1)
            beam_indices = top_indices // vocab_size
            token_indices = top_indices % vocab_size
            effective_indices = (batch_offsets + beam_indices).view(-1)

            beams = torch.cat([beams[effective_indices], token_indices.view(-1, 1)], dim=1)
            beam_scores = top_scores.view(-1)
            # The finished flags have to follow the same reordering as the
            # beams themselves; otherwise they describe the wrong hypotheses.
            finished_beams = finished_beams[effective_indices] | (beams[:, -1] == eos_id)

        final_beams = beams.view(batch_size, num_beams, -1)
        final_scores = beam_scores.view(batch_size, num_beams)
        # NOTE(review): the pad id is never generated, so every beam appears to
        # have the same non-pad count and this division would not change the
        # ranking. A per-hypothesis length (up to the first EOS) may have been
        # intended - confirm before changing, as it alters which beam wins.
        normalized_scores = final_scores / (final_beams != pad_id).sum(-1).float().clamp(min=1)
        best_beams = final_beams[torch.arange(batch_size), normalized_scores.argmax(1), :]

        # Put the module back into whichever mode the caller had selected.
        self.train(was_training)
        return best_beams


# ==============================================================================
# --- 4. EVALUATION FUNCTION (Must match training) ---
# ==============================================================================

def evaluate(model, dataloader, device):
    """Score beam-search translations of ``dataloader`` with BLEU.

    Args:
        model: Model exposing ``generate(input_ids, max_length=..., num_beams=...)``.
        dataloader: Iterable of batches providing ``'input_ids'`` and ``'labels'``.
        device: Device the input ids are moved to before generation.

    Returns:
        The value of the ``BLEUScore`` metric over all batches, multiplied by 100.
    """
    model.eval()  # make sure the model is in eval mode
    scorer = BLEUScore()
    pad_id = tokenizer.pad_token_id

    for batch in tqdm(dataloader, desc="Evaluating", leave=False):
        source_ids = batch['input_ids'].to(device)
        reference_ids = batch['labels']  # never moved to the device; only decoded

        hypothesis_ids = model.generate(source_ids, max_length=MAX_LENGTH, num_beams=5)
        hypotheses = tokenizer.batch_decode(hypothesis_ids, skip_special_tokens=True)

        # -100 entries in the labels are overwritten in place with the pad id
        # so that the tokenizer can decode the references.
        reference_ids.masked_fill_(reference_ids == -100, pad_id)
        references = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

        # The metric takes a list of reference lists: one single-item list per hypothesis.
        scorer.update(hypotheses, [[text] for text in references])

    return scorer.compute().item() * 100

# ==============================================================================
# --- 5. EXPERIMENT DEFINITIONS ---
# ==============================================================================

# --- Model Architecture Configs ---
# Constructor arguments common to both architectures.
_SHARED_MODEL_KWARGS = {
    "num_heads": NUM_HEADS,
    "d_model": D_MODEL,
    "dff": D_FF,
    "vocab_size": VOCAB_SIZE,
    "max_length": MAX_LENGTH,
    "dropout": DROPOUT,
}

# 20 encoder + 20 decoder layers.
STD_TRANSFORMER_CONFIG = {
    "num_encoder_layers": 20,
    "num_decoder_layers": 20,
    **_SHARED_MODEL_KWARGS,
}

# 1 encoder + 1 decoder layer.
WIDE_TRANSFORMER_CONFIG = {
    "num_encoder_layers": 1,
    "num_decoder_layers": 1,
    **_SHARED_MODEL_KWARGS,
}

# One row per experiment variant:
# (depth label, variant label, run-folder suffix after the seed, model config).
# The row order is the order in which experiments are registered for each seed.
_EXPERIMENT_VARIANTS = (
    ("20-Layer", "Baseline", "20-layered-transformer-1e-4/iter_1", STD_TRANSFORMER_CONFIG),
    ("20-Layer", "ISMR", "20-layered-transformer-1e-4/iter_2", STD_TRANSFORMER_CONFIG),
    ("20-Layer", "Ablation", "SHUFFLED-ABLATION-1e-4/iter_1_shuffled", STD_TRANSFORMER_CONFIG),
    ("1-Layer", "Baseline", "1-layered-transformer-1e-4/iter_1", WIDE_TRANSFORMER_CONFIG),
    ("1-Layer", "ISMR", "1-layered-transformer-1e-4/iter_2", WIDE_TRANSFORMER_CONFIG),
)

# Expand the variant table for every training seed.
SEEDS = [115, 116, 117, 118]
experiments = {}

for seed in SEEDS:
    for depth_label, variant_label, run_suffix, arch_config in _EXPERIMENT_VARIANTS:
        experiments[f"{depth_label} (Seed {seed}) - {variant_label}"] = {
            "path": f"iterative/iterative-30k-dataset-seed-{seed}-{run_suffix}",
            "config": arch_config,
            "group": f"{depth_label} {variant_label}",
        }

print(f"--- Defined {len(experiments)} total experiments to evaluate ---")

# ==============================================================================
# --- 6. MAIN EVALUATION LOOP ---
# ==============================================================================

print("\n--- Starting Final Evaluation on Test Set ---")

# The same test dataloader is reused for every checkpoint.
test_dataloader = setup_test_dataloader()
results_list = []


def _make_result_row(experiment, group, bleu, error):
    """Return one row of the results table."""
    return {
        "Experiment": experiment,
        "Group": group,
        "Test BLEU": bleu,
        "Error": error,
    }


for name, details in experiments.items():
    group = details['group']
    print("\n" + "="*70)
    print(f"--- Evaluating Experiment: {name} ---")
    print(f"    Group: {group}")

    # --- 1. Locate the best checkpoint of this run ---
    iter_folder = details["path"]

    # Ablation runs: when the 'iter_1_shuffled' folder does not exist under
    # the run directory, look in the sibling 'iter_1' folder instead.
    if group == "20-Layer Ablation":
        base_run_path = details["path"].rsplit('/', 1)[0]  # parent folder of the run
        shuffled_path = os.path.join(DRIVE_BASE_PATH, base_run_path, "iter_1_shuffled")
        if not os.path.exists(shuffled_path):
            print(f"    [INFO] 'iter_1_shuffled' not found. Trying 'iter_1' instead.")
            iter_folder = os.path.join(base_run_path, "iter_1")

    final_best_model_path = os.path.join(DRIVE_BASE_PATH, iter_folder, "models", "best.pt")
    print(f"    Loading model from: {final_best_model_path}")

    if not os.path.exists(final_best_model_path):
        print(f"!!! WARNING: Model file not found. Skipping this experiment.")
        results_list.append(_make_result_row(name, group, np.nan, "Model file not found"))
        continue

    # --- 2. Build the architecture for this experiment ---
    print("    Instantiating model...")
    try:
        final_model = StandardTransformer(**details["config"])
    except Exception as e:
        print(f"!!! ERROR: Failed to instantiate model. {e}")
        results_list.append(_make_result_row(name, group, np.nan, "Model instantiation error"))
        continue

    # --- 3. Load the checkpoint weights ---
    try:
        final_model.load_state_dict(torch.load(final_best_model_path, map_location=device))
        print("    Successfully loaded model weights.")
        final_model.to(device)
        final_model.eval()
    except Exception as e:
        print(f"!!! ERROR: Failed to load model weights. {e}")
        results_list.append(_make_result_row(name, group, np.nan, "Weight loading error"))
        del final_model
        gc.collect()
        continue

    # --- 4. Score the model on the test set ---
    print("    Running evaluation on the test set...")
    try:
        test_bleu_score = evaluate(final_model, test_dataloader, device)
        print(f"    --> Test Set BLEU Score: {test_bleu_score:.4f}")
        results_list.append(_make_result_row(name, group, test_bleu_score, None))
    except Exception as e:
        # A failing run is recorded with a NaN score so the remaining
        # experiments are still evaluated.
        print(f"!!! ERROR: Evaluation failed. {e}")
        results_list.append(
            _make_result_row(name, group, np.nan, f"Evaluation runtime error: {e}")
        )

    # --- 5. Free the model before the next experiment ---
    del final_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

# ==============================================================================
# --- 7. FINAL REPORT ---
# ==============================================================================

report_rule = "=" * 70

print("\n\n" + report_rule)
print("--- FINAL TEST SET RESULTS (ALL RUNS) ---")
print(report_rule)

# One row per evaluated (or skipped / failed) experiment.
df_results = pd.DataFrame(results_list)
# Lift pandas' display limits on row count and column width.
for display_option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)
print(df_results.to_string(index=False, columns=["Group", "Experiment", "Test BLEU", "Error"]))

print("\n\n" + report_rule)
print("--- FINAL AVERAGED RESULTS SUMMARY ---")
print(report_rule)

# Row order of the summary table.
group_order = [
    "20-Layer Baseline",
    "20-Layer ISMR",
    "20-Layer Ablation",
    "1-Layer Baseline",
    "1-Layer ISMR"
]

# Mean, standard deviation and count of the test BLEU for each group.
bleu_by_group = df_results.groupby('Group')['Test BLEU']
avg_results = bleu_by_group.agg(['mean', 'std', 'count']).reindex(group_order)
print(avg_results.to_string(float_format="%.4f"))

print("\n\nEvaluation complete.")

In [None]:
# Release the Colab runtime once everything above has finished.
try:
    from google.colab import runtime
except ImportError:
    # Not running inside Google Colab: there is no hosted runtime to release,
    # so skip this step instead of failing at the very end of the run.
    print("google.colab is not available; skipping runtime.unassign().")
else:
    runtime.unassign()