<a href="https://colab.research.google.com/github/AlperYildirim1/Pay-Attention-Later/blob/main/Shimmer_Gated_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install "numpy<2.0.0"

In [None]:
# @title 📊 Shimmer V5: Official Paper Evaluation (WMT14 Gold Standard)
# ==============================================================================
# 0. INSTALL & SETUP
# ==============================================================================
# Evaluation dependencies. (The NumPy <2.0 pin that works around the Colab issue is installed in the previous cell.)
!pip install -q unbabel-comet bert_score x-transformers sacremoses sacrebleu huggingface_hub

import torch
import pandas as pd
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset
from bert_score import score as bert_score_func
from comet import download_model, load_from_checkpoint
import sacrebleu
import sys
import os
from huggingface_hub import hf_hub_download

# --- CONFIGURATION ---
# Hugging Face repository that hosts the architecture file, tokenizer and weights.
REPO_ID = "Yujivus/PRISM-Molecule-100k"
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LENGTH = 128  # token cap used for source tokenization and for generation
BATCH_SIZE = 32   # number of sentences translated per generate() call
BEAM_SIZE = 5     # beam width handed to generate()

print(f"‚öôÔ∏è Hardware: {DEVICE}")

# ==============================================================================
# 1. LOAD SHIMMER FROM HUGGING FACE
# ==============================================================================
print(f"üì• Downloading Architecture Code from {REPO_ID}...")
os.makedirs("shimmer_code", exist_ok=True)
hf_hub_download(repo_id=REPO_ID, filename="modeling_prism_gated.py", local_dir="shimmer_code")
# Make the downloaded module importable. The membership test keeps sys.path
# free of duplicate entries when this cell is executed more than once.
if "shimmer_code" not in sys.path:
    sys.path.append("shimmer_code")

# NOTE(review): this import executes Python code fetched from the Hub; only run
# it against a repository you trust.
from modeling_prism_gated import PRISMHybrid_RoPE

print("üìö Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(REPO_ID)

print("üèóÔ∏è Constructing Shimmer V5...")
# Constructor arguments for PRISMHybrid_RoPE.
# NOTE(review): these presumably have to match the hyper-parameters the
# checkpoint was trained with; load_state_dict below runs in its default strict
# mode, so mismatched keys raise.
CONFIG = {
    "vocab_size": 58101,
    "d_model": 512,
    "num_heads": 8,
    "dff": 2048,
    "dropout": 0.1,
    "max_length": 128,
    "num_encoder_layers": 6,
    "num_refining_layers": 0,
    "num_decoder_layers": 6
}
model = PRISMHybrid_RoPE(**CONFIG)

print("üì• Downloading Weights...")
weights_path = hf_hub_download(repo_id=REPO_ID, filename="pytorch_model.bin")
# The checkpoint is a pickle downloaded from the network. weights_only=True
# restricts unpickling to tensors and plain containers, so a tampered file
# cannot run arbitrary code while being loaded.
state_dict = torch.load(weights_path, map_location=DEVICE, weights_only=True)
model.load_state_dict(state_dict)

model.to(DEVICE)
# Inference mode for the module (e.g. disables dropout).
model.eval()
print("‚úÖ Shimmer V5 Ready.")


In [None]:

# ==============================================================================
# 2. DATA LOADING (OFFICIAL WMT14)
# ==============================================================================
print("üìâ Loading OFFICIAL WMT14 Test Set (newstest2014)...")
# Every row of the Hugging Face 'wmt14' dataset stores its 'de' and 'en'
# strings under the 'translation' key.
dataset = load_dataset("wmt14", "de-en", split="test")

print(f"   Total Test Examples: {len(dataset)}")

# ==============================================================================
# 3. GENERATION LOOP (Raw Text -> Tokenize -> Generate)
# ==============================================================================
print(f"üöÄ Generating Translations (Beam={BEAM_SIZE})...")

# Parallel lists, one entry per test sentence; the scoring section reads them.
all_sources = []
all_preds = []
all_refs = []

# The dataset supports slicing (e.g. dataset[0:32]), so the test set is walked
# in BATCH_SIZE-sized windows.
total_samples = len(dataset)

with torch.no_grad():
    for start in tqdm(range(0, total_samples, BATCH_SIZE), desc="Translating"):
        # Slicing yields a dict of columns; 'translation' is a list of
        # {'de': ..., 'en': ...} records for this window.
        pairs = dataset[start : start + BATCH_SIZE]["translation"]

        # German is the source side, English the reference side.
        src_texts = [pair["de"] for pair in pairs]
        ref_texts = [pair["en"] for pair in pairs]

        # Keep the raw strings: COMET needs the sources, all metrics need the references.
        all_sources.extend(src_texts)
        all_refs.extend(ref_texts)

        # Tokenize on the fly, padding to the longest sentence in the batch
        # and truncating anything beyond MAX_LENGTH tokens.
        encoded = tokenizer(
            src_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(DEVICE)

        # NOTE(review): only the token ids are passed on; the tokenizer's
        # attention mask is not forwarded — confirm generate() deals with
        # padding on its own.
        output_ids = model.generate(
            encoded["input_ids"],
            max_length=MAX_LENGTH,
            num_beams=BEAM_SIZE,
        )

        # Turn the generated ids back into plain text, dropping special tokens.
        all_preds.extend(tokenizer.batch_decode(output_ids, skip_special_tokens=True))

# ==============================================================================
# 4. SCORING (GOLD STANDARD)
# ==============================================================================
# Scores the generated translations (all_preds) against the references
# (all_refs) with three metrics and leaves the results in shimmer_bleu,
# shimmer_comet and shimmer_bert for the report section.
print("\nüìä Calculating Metrics...")

# --- A. SacreBLEU (Official) ---
# corpus_bleu takes a list of reference sets, hence the extra outer list
# around the single reference set.
print("   Calculating SacreBLEU...")
bleu = sacrebleu.corpus_bleu(all_preds, [all_refs])
shimmer_bleu = bleu.score

# --- B. COMET (WMT22) ---
print("‚òÑÔ∏è Loading COMET (wmt22-comet-da)...")
comet_path = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(comet_path).to(DEVICE)
# COMET consumes one {"src", "mt", "ref"} record per sentence.
comet_data = [{"src": s, "mt": p, "ref": r} for s, p, r in zip(all_sources, all_preds, all_refs)]
# Follow the device chosen in the configuration instead of always requesting a
# GPU, so the notebook also runs on a CPU-only runtime.
comet_gpus = 1 if DEVICE.type == "cuda" else 0
# The batch size can be raised on larger GPUs (e.g. 64-128 on an A100).
comet_out = comet_model.predict(comet_data, batch_size=32, gpus=comet_gpus, progress_bar=True)
shimmer_comet = comet_out.system_score

# --- C. BERTScore ---
print("ü§ñ Calculating BERTScore...")
# Only the F1 component is reported; precision and recall are computed but unused.
P, R, F1 = bert_score_func(all_preds, all_refs, lang="en", verbose=False, device=DEVICE, batch_size=32)
shimmer_bert = F1.mean().item()

# ==============================================================================
# 5. FINAL REPORT
# ==============================================================================
# Baseline scores are entered by hand; replace them with the numbers from your
# own baseline run before comparing.
BASELINE_BLEU = 29.30
BASELINE_COMET = 0.8114
BASELINE_BERT = 0.9427

baseline_col = "Transformer (Baseline)"
shimmer_col = "Shimmer (PRISM v5)"

# One row per metric, baseline and Shimmer side by side.
results_df = pd.DataFrame({
    "Metric": ["SacreBLEU", "COMET (wmt22)", "BERTScore (F1)"],
    baseline_col: [BASELINE_BLEU, BASELINE_COMET, BASELINE_BERT],
    shimmer_col: [shimmer_bleu, shimmer_comet, shimmer_bert],
})
# Positive delta means Shimmer scored higher than the baseline.
results_df["Delta"] = results_df[shimmer_col].sub(results_df[baseline_col])

banner = "=" * 70
print("\n" + banner)
print("‚öñÔ∏è  FINAL OFFICIAL RESULTS (WMT14 Test Set)  ‚öñÔ∏è")
print(banner)
print(results_df.to_string(index=False))
print(banner)

In [None]:
# @title 🧮 RoSE Parameter Autopsy
import torch
import torch.nn as nn
from modeling_prism_gated import PRISMHybrid_RoPE
from transformers import AutoTokenizer
import pandas as pd

# --- CONFIG (Must match your checkpoint) ---
# Same hyper-parameters as the evaluation cell, spelled as keyword arguments.
CONFIG = dict(
    vocab_size=58101,
    d_model=512,
    num_heads=8,
    dff=2048,
    dropout=0.1,
    max_length=128,
    num_encoder_layers=6,
    num_refining_layers=0,
    num_decoder_layers=6,
)

print("üèóÔ∏è Initializing Model...")
# No weights are loaded here: only the parameter shapes matter for counting.
# NOTE(review): this rebinds the notebook-global `model`, replacing the
# weight-loaded model created by the evaluation cell; re-run that cell before
# translating again.
model = PRISMHybrid_RoPE(**CONFIG)

# Parameter counter that treats every complex entry as two real numbers.
def get_params(module):
    """Return the number of real-valued scalars stored in ``module``'s parameters.

    Every tensor yielded by ``module.parameters()`` contributes its element
    count; a complex tensor contributes twice that, because each element
    holds a real and an imaginary part. Parameters are counted whether or
    not they require gradients.
    """
    return sum(
        tensor.numel() * (2 if tensor.is_complex() else 1)
        for tensor in module.parameters()
    )

# --- BREAKDOWN ---
# Count each top-level component of the model once, in a fixed order.
counts = {
    attr: get_params(getattr(model, attr))
    for attr in (
        "harmonic_embedding",  # source-side embedding
        "tgt_embedding",       # target-side embedding
        "prism_encoder",
        "bridge",
        "decoder",
    )
}

# 1. Embeddings (Memory): source and target tables together.
p_emb_src = counts["harmonic_embedding"]
p_emb_tgt = counts["tgt_embedding"]
p_emb_total = p_emb_src + p_emb_tgt

# 2. Encoder (Spectral Logic)
p_encoder = counts["prism_encoder"]

# 3. Bridge (Interface)
p_bridge = counts["bridge"]

# 4. Decoder (Standard Transformer Logic)
p_decoder = counts["decoder"]

# 5. Output head: kept at zero so shared weights are not counted twice.
# NOTE(review): per the original author's note the head is tied to the target
# embedding; that cannot be confirmed from this notebook alone.
p_head = 0

total_params = sum((p_emb_total, p_encoder, p_bridge, p_decoder))

# --- TRANSFORMER BASELINE ESTIMATE (For Comparison) ---
# Reference point: 6 encoder + 6 decoder layers, d_model=512, one embedding
# table shared between source and target (58101 * 512, about 29.7M).
# Per-layer sizes are approximations: ~3.15M per encoder layer, ~4.2M per
# decoder layer.
baseline_emb = 58101 * 512
baseline_enc = 6 * (3.15 * 10**6)
baseline_dec = 6 * (4.2 * 10**6)
baseline_total = baseline_emb + baseline_enc + baseline_dec

# --- DATAFRAME GENERATION ---
def _in_millions(n):
    """Format a parameter count as millions with one decimal, e.g. '29.7M'."""
    return f"{n/1e6:.1f}M"


# (component, RoSE count, baseline count or None when the baseline has no
# such component, note)
breakdown = (
    ("Embeddings (Memory)", p_emb_total, baseline_emb, "High (Complex x2 + Separate)"),
    ("Encoder (Logic)", p_encoder, baseline_enc, "Check Result below"),
    ("Bridge (Interface)", p_bridge, None, "PRISM specific"),
    ("Decoder (Refine)", p_decoder, baseline_dec, "Similar"),
    ("TOTAL", total_params, baseline_total, "Total Capacity"),
)

data = [
    [label, _in_millions(ours), "0.0M" if theirs is None else _in_millions(theirs), note]
    for label, ours, theirs, note in breakdown
]

df = pd.DataFrame(data, columns=["Component", "RoSE (Yours)", "Standard Transformer", "Note"])

rule = "=" * 60
print("\n" + rule)
print("üìä ARCHITECTURAL BREAKDOWN")
print(rule)
print(df.to_string(index=False))
print(rule)

# Share of each model's parameters that sits in the encoder and decoder
# stacks ("logic") rather than in embeddings or the bridge.
rose_logic = p_encoder + p_decoder
trans_logic = baseline_enc + baseline_dec
rose_ratio = rose_logic / total_params
trans_ratio = trans_logic / baseline_total

print("\nüß† Logic/Reasoning Ratio:")
# Labels are padded to a common width so the percentages line up.
for label, ratio in (("RoSE:", rose_ratio), ("Transformer:", trans_ratio)):
    print(f"   {label:<12} {ratio:.1%} of params are Logic")