In [1]:
# CÉLULA 1 — Setup de pacotes + Mixed Precision (rápido e estável)
import sys, subprocess, importlib

def pip_install(pkg: str):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])

# Pacotes utilitários
for pkg in ["tensorflow-datasets", "sacrebleu"]:
    try:
        importlib.import_module(pkg.replace("-", "_"))
    except Exception:
        pip_install(pkg)

import tensorflow as tf

# tensorflow-text precisa casar com a versão do TF
try:
    import tensorflow_text as text  # noqa
except Exception:
    pip_install(f"tensorflow-text=={tf.__version__}")
    import tensorflow_text as text  # noqa

# Mixed precision: acelera em GPUs com Tensor Cores (T4/A100)
from tensorflow.keras import mixed_precision as mp
mp.set_global_policy("mixed_float16")
print("TF:", tf.__version__)
print("Mixed precision policy:", mp.global_policy())
print("GPUs visíveis:", tf.config.list_physical_devices('GPU'))


TF: 2.19.0
Mixed precision policy: <DTypePolicy "mixed_float16">
GPUs visíveis: []


In [2]:
# CÉLULA 2 — Imports base, seeds e TFDS (TED HRLR pt→en)
import time, numpy as np, tensorflow_datasets as tfds
np.random.seed(42); tf.random.set_seed(42)

examples, info = tfds.load("ted_hrlr_translate/pt_to_en", with_info=True, as_supervised=True)
train_ds, val_ds, test_ds = examples["train"], examples["validation"], examples["test"]
print(info)




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/incomplete.TDWDUH_1.0.0/ted_hrlr_translate-tra…

Generating validation examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/incomplete.TDWDUH_1.0.0/ted_hrlr_translate-val…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/incomplete.TDWDUH_1.0.0/ted_hrlr_translate-tes…

Dataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0. Subsequent calls will reuse this data.
tfds.core.DatasetInfo(
    name='ted_hrlr_translate',
    full_name='ted_hrlr_translate/pt_to_en/1.0.0',
    description="""
    Data sets derived from TED talk transcripts for comparing similar language pairs
    where one is high resource and the other is low resource.
    """,
    config_description="""
    Translation dataset from pt to en in plain text.
    """,
    homepage='https://github.com/neulab/word-embeddings-for-nmt',
    data_dir='/root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0',
    file_format=tfrecord,
    download_size=124.94 MiB,
    dataset_size=10.89 MiB,
    features=Translation({
        'en': Text(shape=(), dtype=string),
        'pt': Text(shape=(), dtype=string),
    }),
    supervised_keys=('pt', 'en'),
    disable_shuffling=False,
    nondeterministic_order=False,
    splits={
        'test': <

In [3]:
# CÉLULA 3 — (opcional) Amostra de dados bruta
for pt, en in train_ds.batch(3).take(1):
    print("PT:", [x.decode() for x in pt.numpy()])
    print("EN:", [x.decode() for x in en.numpy()])


PT: ['e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .', 'mas e se estes fatores fossem ativos ?', 'mas eles não tinham a curiosidade de me testar .']
EN: ['and when you improve searchability , you actually take away the one advantage of print , which is serendipity .', 'but what if it were active ?', "but they did n't test for curiosity ."]


In [4]:
# CÉLULA 2A — Sanity check de GPU (rode antes do treino)
import os, tensorflow as tf, time, numpy as np
print("TF:", tf.__version__)
print("Logical GPUs:", tf.config.list_logical_devices('GPU'))
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES", "<não definido>"))

# micro-teste de matmul p/ ver tempo de compute
x = tf.random.normal([4096, 4096])
t0 = time.perf_counter()
_ = tf.linalg.matmul(x, x)  # deve rodar no GPU se disponível
dt = time.perf_counter() - t0
print(f"Teste matmul 4096x4096: {dt:.3f}s (GPU deve ser << CPU)")


TF: 2.19.0
Logical GPUs: []
CUDA_VISIBLE_DEVICES: <não definido>
Teste matmul 4096x4096: 6.536s (GPU deve ser << CPU)


In [5]:
# CÉLULA 4 — Tokenizers TED (carregamento robusto do SavedModel)
import os, zipfile, pathlib, shutil

model_name = "ted_hrlr_translate_pt_en_converter"
zip_path = tf.keras.utils.get_file(
    f"{model_name}.zip",
    f"https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip",
    cache_dir='.', cache_subdir='', extract=False)

extract_dir = pathlib.Path(zip_path).with_suffix('')  # ./ted_hrlr_translate_pt_en_converter
if extract_dir.exists():
    shutil.rmtree(extract_dir)

with zipfile.ZipFile(zip_path, 'r') as z:
    z.extractall(path=extract_dir)

candidates = [extract_dir, extract_dir / model_name]
load_dir = next((c for c in candidates if (c / "saved_model.pb").exists()), None)
if load_dir is None:
    raise FileNotFoundError(f"SavedModel não encontrado em {candidates}")

print("Carregando tokenizers de:", load_dir.resolve())
tokenizers = tf.saved_model.load(str(load_dir))


Downloading data from https://storage.googleapis.com/download.tensorflow.org/models/ted_hrlr_translate_pt_en_converter.zip
[1m184801/184801[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Carregando tokenizers de: /content/ted_hrlr_translate_pt_en_converter/ted_hrlr_translate_pt_en_converter


In [6]:
# CÉLULA 5 — Pipeline ultra-rápido (≈10 min)
BUFFER_SIZE = 10000
BATCH_SIZE  = 32         # ↓ batch menor reduz custo/step
MAX_TOKENS  = 32         # ↓ seq curta acelera bastante

def prepare_batch(pt, en):
    # tokeniza e corta
    pt_tok = tokenizers.pt.tokenize(pt)[:, :MAX_TOKENS].to_tensor()
    en_tok = tokenizers.en.tokenize(en)[:, :MAX_TOKENS+1]
    # pares (in/label)
    en_in  = en_tok[:, :-1].to_tensor()
    en_lab = en_tok[:,  1:].to_tensor()
    return (pt_tok, en_in), en_lab

def make_batches(ds):
    return (ds
            .shuffle(BUFFER_SIZE)
            .batch(BATCH_SIZE, drop_remainder=True)  # drop_remainder evita shapes variando (menos retrace)
            .map(prepare_batch, num_parallel_calls=tf.data.AUTOTUNE)
            .prefetch(tf.data.AUTOTUNE))

train_batches = make_batches(train_ds)
val_batches   = make_batches(val_ds)


In [7]:
# CÉLULA 6 — Positional Encoding (sen/cos)
def positional_encoding(length, depth):
    import numpy as np
    depth = depth/2
    positions = np.arange(length)[:, None]
    depths    = np.arange(depth)[None, :]/depth
    angle_rates = 1/(10000**depths)
    angle_rads  = positions * angle_rates
    pe = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1).astype("float32")
    return tf.constant(pe)  # tf.float32


In [8]:
# CÉLULA 7 — Blocos de atenção/FFN (com dtypes consistentes em MP)
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, heads, d_model, drop=0.1):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=heads, key_dim=d_model, dropout=drop)
        self.add = tf.keras.layers.Add()
        self.norm = tf.keras.layers.LayerNormalization(dtype='float32')  # LN em fp32 aumenta estabilidade

class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        y = self.mha(query=x, value=x, key=x)
        y = tf.cast(y, x.dtype)  # garante dtype igual antes do Add
        x = self.add([x, y])
        return self.norm(x)

class CausalSelfAttention(BaseAttention):
    def call(self, x):
        y = self.mha(query=x, value=x, key=x, use_causal_mask=True)
        y = tf.cast(y, x.dtype)
        x = self.add([x, y])
        return self.norm(x)

class CrossAttention(BaseAttention):
    def call(self, x, ctx):
        y, _ = self.mha(query=x, value=ctx, key=ctx, return_attention_scores=True)
        y = tf.cast(y, x.dtype)
        x = self.add([x, y])
        return self.norm(x)

class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, drop=0.1):
        super().__init__()
        self.d1 = tf.keras.layers.Dense(dff, activation="relu")
        self.d2 = tf.keras.layers.Dense(d_model)
        self.do = tf.keras.layers.Dropout(drop)
        self.ln = tf.keras.layers.LayerNormalization(dtype='float32')
    def call(self, x):
        y = self.d1(x)
        y = self.d2(y)
        y = tf.cast(y, x.dtype)
        y = self.do(y)
        y = self.add = tf.keras.layers.Add()([x, y])
        return self.ln(y)


In [9]:
# CÉLULA 8 — Encoder, Decoder e Transformer (cast da PE + logits em fp32)
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, heads, dff, drop=0.1):
        super().__init__()
        self.sa = GlobalSelfAttention(heads, d_model, drop)
        self.ff = FeedForward(d_model, dff, drop)
    def call(self, x):
        x = self.sa(x)
        x = self.ff(x)
        return x

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, heads, dff, drop=0.1):
        super().__init__()
        self.sa = CausalSelfAttention(heads, d_model, drop)
        self.ca = CrossAttention(heads, d_model, drop)
        self.ff = FeedForward(d_model, dff, drop)
    def call(self, x, ctx):
        x = self.sa(x)
        x = self.ca(x, ctx)
        x = self.ff(x)
        return x

class Encoder(tf.keras.layers.Layer):
    def __init__(self, L, d_model, heads, dff, vocab, drop=0.1, max_pos=2048):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(vocab, d_model, mask_zero=True)
        self.pe  = positional_encoding(max_pos, d_model)  # float32
        self.layers = [EncoderLayer(d_model, heads, dff, drop) for _ in range(L)]
    def call(self, x):
        x = self.emb(x)  # provavelmente float16 sob MP
        pe = tf.cast(self.pe[None, :tf.shape(x)[1], :], x.dtype)  # CAST → dtype do embedding
        x = x + pe
        for layer in self.layers:
            x = layer(x)
        return x

class Decoder(tf.keras.layers.Layer):
    def __init__(self, L, d_model, heads, dff, vocab, drop=0.1, max_pos=2048):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(vocab, d_model, mask_zero=True)
        self.pe  = positional_encoding(max_pos, d_model)
        self.layers = [DecoderLayer(d_model, heads, dff, drop) for _ in range(L)]
        self.out = tf.keras.layers.Dense(vocab, dtype='float32')  # logits em fp32
    def call(self, x, ctx):
        x = self.emb(x)
        pe = tf.cast(self.pe[None, :tf.shape(x)[1], :], x.dtype)
        x = x + pe
        for layer in self.layers:
            x = layer(x, ctx)
        try:
            del x._keras_mask  # remove máscara herdada do Embedding
        except Exception:
            pass
        return self.out(x)

class Transformer(tf.keras.Model):
    def __init__(self, L, d_model, heads, dff, src_vocab, tgt_vocab, drop=0.1):
        super().__init__()
        self.enc = Encoder(L, d_model, heads, dff, src_vocab, drop)
        self.dec = Decoder(L, d_model, heads, dff, tgt_vocab, drop)
    def call(self, inputs):
        pt, en = inputs
        ctx = self.enc(pt)
        return self.dec(en, ctx)


In [10]:
# CÉLULA 9 — Modelo “míni” e compile
def masked_loss(y_true, y_pred):
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')(y_true, y_pred)
    mask = tf.cast(tf.not_equal(y_true, 0), y_pred.dtype)
    loss = loss * tf.cast(mask, loss.dtype)
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def masked_accuracy(y_true, y_pred):
    pred = tf.argmax(y_pred, axis=-1, output_type=y_true.dtype)
    mask = tf.cast(tf.not_equal(y_true, 0), y_true.dtype)
    match = tf.cast(tf.equal(y_true, pred), y_true.dtype) * mask
    return tf.reduce_sum(match) / tf.reduce_sum(mask)

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=1000):
        super().__init__(); self.d_model=tf.cast(d_model, tf.float32); self.warmup=warmup_steps
    def __call__(self, step):
        step=tf.cast(step, tf.float32)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(tf.math.rsqrt(step), step*(self.warmup**-1.5))

# hiperparâmetros míni (1 camada, largura baixa)
L, d_model, dff, heads, drop = 1, 64, 128, 4, 0.1   # heads divide d_model (64/4=16)

transformer = Transformer(
    L, d_model, heads, dff,
    tokenizers.pt.get_vocab_size().numpy(),
    tokenizers.en.get_vocab_size().numpy(),
    drop
)

lr  = CustomSchedule(d_model)
opt = tf.keras.optimizers.Adam(lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

transformer.compile(optimizer=opt, loss=masked_loss, metrics=[masked_accuracy])
transformer.build([(None, None), (None, None)])
print(transformer.summary())




None


In [11]:
# CÉLULA 10A — Warm-up (compila grafo e aquece pipeline) — roda UMA vez antes do fit real
# usa 5 batches de treino e 2 de validação, silencioso (verbose=0)
_ = transformer.fit(
    train_batches.take(5),
    epochs=1,
    steps_per_epoch=5,
    validation_data=val_batches.take(2),
    validation_steps=2,
    verbose=0,
)
print("Warm-up concluído.")


Warm-up concluído.


In [12]:
# CÉLULA 10 — Treino ultra-rápido (≈10 min total, muitas vezes menos)
import time

class EpochTimer(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs=None): self.times=[]
    def on_epoch_begin(self, epoch, logs=None): self.t0=time.perf_counter()
    def on_epoch_end(self, epoch, logs=None):
        dt=time.perf_counter()-self.t0; self.times.append(dt)
        print(f"[tempo] época {epoch+1}: {dt/60:.2f} min | loss={logs.get('loss'):.4f} | val_loss={logs.get('val_loss'):.4f}")

EPOCHS = 1
STEPS_PER_EPOCH = 5    # 5 steps apenas
VAL_STEPS       = 2

hist = transformer.fit(
    train_batches,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=val_batches,
    validation_steps=VAL_STEPS,
    callbacks=[EpochTimer()]
)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - loss: 8.8744 - masked_accuracy: 0.0000e+00[tempo] época 1: 0.36 min | loss=8.8715 | val_loss=8.8591
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 4s/step - loss: 8.8739 - masked_accuracy: 0.0000e+00 - val_loss: 8.8591 - val_masked_accuracy: 0.0000e+00


In [13]:
# CÉLULA 11 — Tradução greedy (rápida)
probe = tokenizers.en.tokenize(tf.constant(["hello"]))
START_ID = int(probe[0][0].numpy())
END_ID   = int(probe[0][-1].numpy())
MAX_GEN  = 32   # coerente com MAX_TOKENS

def translate_pt2en(sentence_pt: str):
    pt = tf.constant([sentence_pt])
    pt_tokens = tokenizers.pt.tokenize(pt).to_tensor()
    en_tokens = tf.constant([[START_ID]], dtype=tf.int64)
    for _ in range(MAX_GEN):
        logits = transformer((pt_tokens, en_tokens))
        next_id = tf.argmax(logits[:, -1, :], axis=-1, output_type=en_tokens.dtype)
        en_tokens = tf.concat([en_tokens, next_id[:, None]], axis=1)
        if int(next_id[0].numpy()) == END_ID: break
    return tokenizers.en.detokenize(en_tokens)[0].numpy().decode("utf-8")

print(translate_pt2en("este é o primeiro livro que eu fiz."))
print(translate_pt2en("gostaria de um copo de água, por favor."))




visited offering choicesivable arabic pakistan hop brian brian hear hear hear helps unprecedented later later delicious delicious intrinsic intrinsic intrinsic intrinsic intrinsic wings wings intrinsic disappear disappear disappear intrinsic tenderness intrinsic
visited offering choicesivable arabic pakistan hop brian brian hear hear hear helps unprecedented later later delicious delicious intrinsic intrinsic intrinsic intrinsic intrinsic wings wings intrinsic disappear disappear disappear disappear intrinsic intrinsic


In [14]:
# CÉLULA 12 — (opcional) SacreBLEU rápido em amostra pequena
import sacrebleu as sbl

def bleu_on(ds, n=32):
    hyps, refs = [], []
    for pt, en in ds.unbatch().take(n):
        pt_s, en_s = pt.numpy().decode("utf-8"), en.numpy().decode("utf-8")
        hyp = translate_pt2en(pt_s)
        hyps.append(hyp)
        refs.append([en_s])
    return sbl.corpus_bleu(hyps, list(zip(*refs))).score

# Descomente para medir:
# bleu_val = bleu_on(val_ds, n=32)
# print("SacreBLEU (val, n=32):", bleu_val)


In [9]:
# =========================
# A1 — Setup de artefatos e utilitários
# =========================
import os, json, time, csv, shutil, zipfile, pathlib
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

ASSETS_DIR = pathlib.Path("assets")
ASSETS_DIR.mkdir(parents=True, exist_ok=True)

def _exists(name: str) -> bool:
    return name in globals() and globals()[name] is not None

print("assets:", ASSETS_DIR.resolve())


# =========================
# A2 — Gráfico: loss por época (usa objeto 'hist' do treino)
# =========================
def plot_loss(history, outpath=ASSETS_DIR / "loss_por_epoca.png"):
    if history is None or not getattr(history, "history", None):
        return None
    h = history.history
    epochs = list(range(1, len(h.get("loss", [])) + 1))
    if not epochs:
        return None
    plt.figure()
    plt.plot(epochs, h["loss"], label="train_loss")
    if "val_loss" in h:
        plt.plot(epochs, h["val_loss"], label="val_loss")
    plt.xlabel("época"); plt.ylabel("loss"); plt.title("Loss por época")
    plt.legend(); plt.tight_layout()
    plt.savefig(outpath, dpi=150); plt.close()
    return str(outpath)

_loss_png = plot_loss(globals().get("hist", None))
if _loss_png:
    print("loss_png:", _loss_png)
else:
    print("loss_png: não gerado (hist ausente ou vazio)")


# =========================
# A3 — Throughput (tokens/s) de forward em amostra curta
# Requer: 'transformer' e 'train_batches' já definidos no notebook
# =========================
def estimate_forward_throughput(batches, steps=10, model=None):
    (pt_tok, en_in), y = next(iter(batches.take(1)))
    _ = model((pt_tok, en_in))  # warm-up
    total_tokens, ran = 0, 0
    t0 = time.perf_counter()
    it = iter(batches)
    for i in range(steps):
        try:
            (pt_tok, en_in), y = next(it)
        except StopIteration:
            break
        _ = model((pt_tok, en_in))
        total_tokens += int(y.shape[0] * y.shape[1])
        ran += 1
    dt = time.perf_counter() - t0
    return {"steps": ran, "total_tokens": total_tokens, "seconds": dt, "tokens_per_sec": (total_tokens / dt) if dt > 0 else None}

_forward_stats = None
if _exists("transformer") and _exists("train_batches"):
    _forward_stats = estimate_forward_throughput(train_batches, steps=10, model=transformer)
    with open(ASSETS_DIR / "throughput_forward.json", "w", encoding="utf-8") as f:
        json.dump(_forward_stats, f, ensure_ascii=False, indent=2)
    plt.figure()
    plt.bar(["forward"], [(_forward_stats["tokens_per_sec"] or 0.0)])
    plt.ylabel("tokens/s (forward)"); plt.title("Throughput (amostra curta)")
    plt.tight_layout(); plt.savefig(ASSETS_DIR / "throughput_tokens_por_epoca.png", dpi=150); plt.close()
    print("throughput_forward:", _forward_stats)
else:
    print("throughput_forward: não gerado (transformer/train_batches ausentes)")


# =========================
# A4 — Histograma de comprimentos de tokens (PT/EN)
# Requer: 'train_ds' e 'tokenizers' definidos
# =========================
def lengths_hist_fixed(ds, tokenizers, n=512, cap=None, outpath=ASSETS_DIR / "comprimentos_tokens_hist.png"):
    pt_lens, en_lens, count = [], [], 0
    for pt, en in ds.take(n):
        pt_tok = tokenizers.pt.tokenize(pt[None, ...]).to_tensor()
        en_tok = tokenizers.en.tokenize(en[None, ...]).to_tensor()
        pt_len = int(tf.shape(pt_tok)[1].numpy())
        en_len = int(tf.shape(en_tok)[1].numpy())
        if cap is not None:
            pt_len = min(pt_len, int(cap))
            en_len = min(en_len, int(cap))
        pt_lens.append(pt_len); en_lens.append(en_len); count += 1
    plt.figure()
    plt.hist(pt_lens, bins=30, alpha=0.7, label="PT")
    plt.hist(en_lens, bins=30, alpha=0.7, label="EN")
    if cap is not None:
        plt.axvline(int(cap), linestyle="--", alpha=0.7, label=f"MAX_TOKENS={int(cap)}")
    plt.xlabel("comprimento em tokens"); plt.ylabel("freq.")
    plt.title(f"Distribuição de comprimentos (amostra N={count})")
    plt.legend(); plt.tight_layout(); plt.savefig(outpath, dpi=150); plt.close()
    return str(outpath)

_hist_png = None
if _exists("train_ds") and _exists("tokenizers"):
    _cap = globals().get("MAX_TOKENS", None)
    _hist_png = lengths_hist_fixed(train_ds, tokenizers, n=512, cap=_cap)
    print("comprimentos_tokens_hist:", _hist_png)
else:
    print("comprimentos_tokens_hist: não gerado (train_ds/tokenizers ausentes)")


# =========================
# A5 — Atenção do modelo e amostras qualitativas
# Requer: 'transformer' e 'tokenizers' definidos
# =========================
def ensure_translate_fn():
    if "translate_pt2en" in globals() and callable(globals()["translate_pt2en"]):
        return globals()["translate_pt2en"]
    def _translate_pt2en(sentence_pt: str, max_len=None):
        if max_len is None: max_len = int(globals().get("MAX_TOKENS", 32))
        pt = tf.constant([sentence_pt])
        pt_tokens = tokenizers.pt.tokenize(pt).to_tensor()
        probe = tokenizers.en.tokenize(tf.constant(["hello"]))
        START_ID = int(probe[0][0].numpy()); END_ID = int(probe[0][-1].numpy())
        en_tokens = tf.constant([[START_ID]], dtype=tf.int64)
        for _ in range(max_len):
            logits = transformer((pt_tokens, en_tokens))
            next_id = tf.argmax(logits[:, -1, :], axis=-1, output_type=en_tokens.dtype)
            en_tokens = tf.concat([en_tokens, next_id[:, None]], axis=1)
            if int(next_id[0].numpy()) == END_ID:
                break
        return tokenizers.en.detokenize(en_tokens)[0].numpy().decode("utf-8")
    globals()["translate_pt2en"] = _translate_pt2en
    return globals()["translate_pt2en"]

def attention_from_model(model, tokenizers, outpath=ASSETS_DIR / "atencao_modelo.png"):
    pt = tf.constant(["este é o primeiro livro que eu fiz."])
    pt_tok = tokenizers.pt.tokenize(pt)[:, :int(globals().get("MAX_TOKENS", 32))].to_tensor()
    start_id = int(tokenizers.en.tokenize(tf.constant(["hello"]))[0][0].numpy())
    en_in = tf.constant([[start_id]], dtype=tf.int64)
    ctx = model.enc(pt_tok)
    x = model.dec.emb(en_in)
    pe = positional_encoding(2048, x.shape[-1]); x = x + tf.cast(pe[None, :tf.shape(x)[1], :], x.dtype)
    # captura a cross-attention da última camada
    att_scores = None
    for i, dl in enumerate(model.dec.layers):
        x = dl.sa(x); y, att = dl.ca.mha(query=x, value=ctx, key=ctx, return_attention_scores=True)
        y = tf.cast(y, x.dtype); x = dl.ca.add([x, y]); x = dl.ca.norm(x); x = dl.ff(x)
        att_scores = att
    a = att_scores
    if a.shape.rank == 4 and a.shape[1] == a.shape[1]:
        A = tf.reduce_mean(a, axis=1)[0]
    else:
        A = tf.reduce_mean(a, axis=-1)[0]
    plt.figure(figsize=(6,5))
    plt.imshow(A.numpy(), aspect="auto", interpolation="nearest"); plt.colorbar()
    plt.xlabel("PT (fonte)"); plt.ylabel("EN_in (consulta)")
    plt.title("Mapa de atenção (modelo)")
    plt.tight_layout(); plt.savefig(outpath, dpi=150); plt.close()
    return str(outpath)

_att_png = None
_samples_csv = None
if _exists("transformer") and _exists("tokenizers"):
    _att_png = attention_from_model(transformer, tokenizers)
    print("atencao_modelo:", _att_png)
    translate_pt2en = ensure_translate_fn()
    samples_pt = [
        "este é o primeiro livro que eu fiz.",
        "gostaria de um copo de água, por favor.",
        "o sistema funcionou melhor do que esperávamos.",
        "vou então muito rapidamente partilhar convosco algumas histórias.",
        "os meus vizinhos ouviram sobre esta ideia."
    ]
    rows = [{"pt": s, "pred_en": translate_pt2en(s)} for s in samples_pt]
    _samples_csv = ASSETS_DIR / "amostras_traducoes.csv"
    with open(_samples_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["pt", "pred_en"]); w.writeheader(); w.writerows(rows)
    print("amostras_traducoes.csv:", _samples_csv.resolve())
else:
    print("atenção/amostras: não gerado (transformer/tokenizers ausentes)")


# =========================
# A6 — Métricas e micro-benchmark de backprop
# Requer: 'transformer', 'train_batches' e 'masked_loss' (define fallback se ausente)
# =========================
if "masked_loss" not in globals():
    def masked_loss(y_true, y_pred):
        scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
        loss = scce(y_true, y_pred)
        mask = tf.cast(tf.not_equal(y_true, 0), loss.dtype)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def _count_params(model):
    return int(np.sum([np.prod(v.shape) for v in model.trainable_variables]))

_metrics_out = ASSETS_DIR / "metrics.json"
_bp_out = ASSETS_DIR / "backprop_microbench.json"

_metrics = {}
if _exists("transformer"):
    _metrics = {
        "trainable_params": _count_params(transformer),
        "d_model": int(transformer.dec.emb.output_dim),
        "num_layers": len(transformer.enc.layers),
        "num_heads": int(transformer.dec.layers[0].sa.mha.num_heads) if transformer.dec.layers else None,
        "batch_size": int(globals().get("BATCH_SIZE", 32)),
        "max_tokens": int(globals().get("MAX_TOKENS", 32)),
        "loss_final": float(globals().get("hist").history["loss"][-1]) if _exists("hist") and "loss" in hist.history else None,
        "val_loss_final": float(globals().get("hist").history.get("val_loss", [np.nan])[-1]) if _exists("hist") else None,
        "throughput_forward_tokens_per_sec": float((_forward_stats or {}).get("tokens_per_sec")) if _forward_stats else None,
        "tf_version": tf.__version__,
    }
    with open(_metrics_out, "w", encoding="utf-8") as f:
        json.dump(_metrics, f, ensure_ascii=False, indent=2)
    print("metrics.json:", _metrics_out.resolve())
else:
    print("metrics.json: não gerado (transformer ausente)")

_bp = {}
if _exists("transformer") and _exists("train_batches") and hasattr(transformer, "optimizer"):
    @tf.function(reduce_retracing=True)
    def _one_step(model, inputs, y_true):
        with tf.GradientTape() as tape:
            y_pred = model(inputs, training=True)
            loss = masked_loss(y_true, y_pred)
        grads = tape.gradient(loss, model.trainable_variables)
        model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss
    (pt_tok, en_in), y_true = next(iter(train_batches.take(1)))
    _ = _one_step(transformer, (pt_tok, en_in), y_true)  # warm-up
    t0 = time.perf_counter()
    loss_val = float(_one_step(transformer, (pt_tok, en_in), y_true).numpy())
    dt = time.perf_counter() - t0
    batch_tokens = int(y_true.shape[0] * y_true.shape[1])
    _bp = {
        "train_one_step_seconds": float(dt),
        "batch_tokens": batch_tokens,
        "tokens_per_sec_backprop": (batch_tokens / dt) if dt > 0 else None,
        "loss_after_step": loss_val,
    }
    with open(_bp_out, "w", encoding="utf-8") as f:
        json.dump(_bp, f, ensure_ascii=False, indent=2)
    print("backprop_microbench.json:", _bp_out.resolve())
else:
    print("backprop_microbench.json: não gerado (requer transformer/train_batches/optimizer)")

# Snippet descritivo para apoio ao README
_snippet = [
    "- assets/loss_por_epoca.png — curvas de treinamento (loss/época)",
    "- assets/throughput_tokens_por_epoca.png — throughput (tokens/s) estimado (forward)",
    "- assets/comprimentos_tokens_hist.png — distribuição de comprimentos de tokens (PT/EN)",
    "- assets/atencao_modelo.png — mapa de atenção (modelo)",
    "- assets/amostras_traducoes.csv — amostras qualitativas PT → EN",
    "- assets/metrics.json — hiperparâmetros e métricas resumidas",
    "- assets/throughput_forward.json — detalhes do throughput forward",
    "- assets/backprop_microbench.json — tempo de um passo de backprop e tokens/s",
]
with open(ASSETS_DIR / "readme_snippet.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(_snippet) + "\n")
print("readme_snippet.txt:", (ASSETS_DIR / "readme_snippet.txt").resolve())


# =========================
# A7 — Empacotar tudo: copiar para assets/ e gerar ZIP (assets_bundle.zip)
# Inclui README.md se existir na raiz
# =========================
ROOT = pathlib.Path(".").resolve()
ASSETS_DIR = ROOT / "assets"
ASSETS_DIR.mkdir(parents=True, exist_ok=True)

EXPECTED = [
    "loss_por_epoca.png",
    "throughput_tokens_por_epoca.png",
    "comprimentos_tokens_hist.png",
    "atencao_modelo.png",
    "amostras_traducoes.csv",
    "results_gpu.json",
    "results_cpu.json",
    "cpu_gpu_table.txt",
    "metrics.json",
    "throughput_forward.json",
    "backprop_microbench.json",
    "readme_snippet.txt",
]

def _locate(name: str):
    p = ASSETS_DIR / name
    if p.exists(): return p
    c = ROOT / name
    if c.exists(): return c
    skip = {"site-packages", ".cache", ".config", ".ipython", ".local", ".keras", ".nv", "tensorflow_datasets", "__pycache__"}
    for path in ROOT.rglob(name):
        s = str(path)
        if any(sk in s for sk in skip): continue
        return path
    return None

copied, missing = [], []
for fname in EXPECTED:
    src = _locate(fname)
    if src is None:
        missing.append(fname);
        continue
    dst = ASSETS_DIR / fname
    if src.resolve() != dst.resolve():
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src, dst)
    copied.append(dst.name)

zip_name = "assets_bundle.zip"
with zipfile.ZipFile(zip_name, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for path in ASSETS_DIR.rglob("*"):
        if path.is_file():
            z.write(path, arcname=str(path.relative_to(ROOT)))
    if (ROOT / "README.md").exists():
        z.write(ROOT / "README.md", arcname="README.md")

print("bundle_zip:", (ROOT / zip_name))
print("assets_conteudo:", sorted([p.name for p in ASSETS_DIR.iterdir()]))
if missing:
    print("faltando_no_bundle:", missing)

try:
    from google.colab import files  # type: ignore
    files.download(zip_name)
except Exception:
    pass


assets: /content/assets
loss_png: não gerado (hist ausente ou vazio)
throughput_forward: não gerado (transformer/train_batches ausentes)
comprimentos_tokens_hist: assets/comprimentos_tokens_hist.png
atenção/amostras: não gerado (transformer/tokenizers ausentes)
metrics.json: não gerado (transformer ausente)
backprop_microbench.json: não gerado (requer transformer/train_batches/optimizer)
readme_snippet.txt: /content/assets/readme_snippet.txt
bundle_zip: /content/assets_bundle.zip
assets_conteudo: ['atencao_modelo.png', 'comprimentos_tokens_hist.png', 'cpu_gpu_table.txt', 'readme_snippet.txt', 'results_cpu.json', 'results_gpu.json']
faltando_no_bundle: ['loss_por_epoca.png', 'throughput_tokens_por_epoca.png', 'amostras_traducoes.csv', 'metrics.json', 'throughput_forward.json', 'backprop_microbench.json']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
# CELULA ÚNICA — Diagnóstico e regeneração automática dos artefatos faltantes + ZIP final

import os, json, time, csv, shutil, zipfile, pathlib
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# --------------------------
# Configurações rápidas
# --------------------------
ROOT = pathlib.Path(".").resolve()
ASSETS = ROOT / "assets"
ASSETS.mkdir(parents=True, exist_ok=True)

BATCH_SIZE  = int(os.environ.get("BATCH_SIZE", 32))
MAX_TOKENS  = int(os.environ.get("MAX_TOKENS", 32))
EPOCHS      = int(os.environ.get("EPOCHS", 1))
STEPS       = int(os.environ.get("STEPS_PER_EPOCH", 40))
VAL_STEPS   = int(os.environ.get("VAL_STEPS", 8))
BLEU_N      = int(os.environ.get("BLEU_N", 100))

print("Diagnóstico inicial")
print("- assets dir:", ASSETS.resolve())
print("- GPU visível:", bool(tf.config.list_physical_devices('GPU')))

# --------------------------
# Utilitários de verificação
# --------------------------
def _has(name: str):
    return name in globals() and globals()[name] is not None

def _print_missing():
    print("Variáveis na sessão:")
    for n in ["tokenizers","train_ds","val_ds","train_batches","transformer","hist"]:
        print(f"  - {n}: {'OK' if _has(n) else 'ausente'}")

_print_missing()

# --------------------------
# 1) Garantir tokenizers e TFDS
# --------------------------
def ensure_tokenizers():
    import zipfile, shutil as _shutil
    base = ROOT / "ted_hrlr_translate_pt_en_converter"
    cand = [base, base / "ted_hrlr_translate_pt_en_converter"]
    load_dir = None
    for c in cand:
        if (c / "saved_model.pb").exists():
            load_dir = c; break
    if load_dir is None:
        url = "https://storage.googleapis.com/download.tensorflow.org/models/ted_hrlr_translate_pt_en_converter.zip"
        zip_path = tf.keras.utils.get_file("ted_hrlr_translate_pt_en_converter.zip", url, cache_dir='.', cache_subdir='', extract=False)
        tgt = pathlib.Path(zip_path).with_suffix('')
        if tgt.exists(): _shutil.rmtree(tgt)
        with zipfile.ZipFile(zip_path, 'r') as z: z.extractall(path=tgt)
        load_dir = tgt if (tgt / "saved_model.pb").exists() else (tgt / "ted_hrlr_translate_pt_en_converter")
    print("Tokenizers em:", load_dir.resolve())
    return tf.saved_model.load(str(load_dir))

def ensure_tfds():
    import tensorflow_datasets as tfds
    ex, info = tfds.load("ted_hrlr_translate/pt_to_en", with_info=True, as_supervised=True)
    print("TFDS OK | splits:", list(ex.keys()))
    return ex["train"], ex["validation"], ex.get("test", None)

if not _has("tokenizers"):
    try:
        import tensorflow_text as _text  # noqa
    except Exception:
        print("Instalando tensorflow-text compatível com TF", tf.__version__)
        import sys, subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", f"tensorflow-text=={tf.__version__}"])
        import tensorflow_text as _text  # noqa
    tokenizers = ensure_tokenizers()

if not _has("train_ds") or not _has("val_ds"):
    train_ds, val_ds, _ = ensure_tfds()

# --------------------------
# 2) Pipeline de batches
# --------------------------
def prepare_batch(pt, en, tokenizers, max_tokens=MAX_TOKENS):
    pt_tok = tokenizers.pt.tokenize(pt)[:, :max_tokens].to_tensor()
    en_tok = tokenizers.en.tokenize(en)[:, :max_tokens+1]
    en_in  = en_tok[:, :-1].to_tensor()
    en_lab = en_tok[:,  1:].to_tensor()
    return (pt_tok, en_in), en_lab

def make_batches(ds, tokenizers, batch=BATCH_SIZE, max_tokens=MAX_TOKENS):
    return (ds
            .shuffle(10000)
            .batch(batch, drop_remainder=True)
            .map(lambda pt,en: prepare_batch(pt,en,tokenizers,max_tokens), num_parallel_calls=tf.data.AUTOTUNE)
            .prefetch(tf.data.AUTOTUNE))

if not _has("train_batches"):
    train_batches = make_batches(train_ds, tokenizers, BATCH_SIZE, MAX_TOKENS)
    val_batches   = make_batches(val_ds, tokenizers, BATCH_SIZE, MAX_TOKENS)

# --------------------------
# 3) Modelo mínimo (definição + compile)
# --------------------------
def positional_encoding(length, depth):
    import numpy as _np
    depth = depth/2
    positions = _np.arange(length)[:, None]
    depths    = _np.arange(depth)[None, :]/depth
    angle_rates = 1/(10000**depths)
    angle_rads  = positions * angle_rates
    pe = _np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1).astype("float32")
    return tf.constant(pe)

class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, heads, d_model, drop=0.1):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=heads, key_dim=d_model, dropout=drop)
        self.add = tf.keras.layers.Add()
        self.norm = tf.keras.layers.LayerNormalization(dtype='float32')

class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        y = self.mha(query=x, value=x, key=x)
        y = tf.cast(y, x.dtype)
        x = self.add([x, y])
        return self.norm(x)

class CausalSelfAttention(BaseAttention):
    def call(self, x):
        y = self.mha(query=x, value=x, key=x, use_causal_mask=True)
        y = tf.cast(y, x.dtype)
        x = self.add([x, y])
        return self.norm(x)

class CrossAttention(BaseAttention):
    def call(self, x, ctx):
        y, _ = self.mha(query=x, value=ctx, key=ctx, return_attention_scores=True)
        y = tf.cast(y, x.dtype)
        x = self.add([x, y])
        return self.norm(x)

class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, drop=0.1):
        super().__init__()
        self.d1 = tf.keras.layers.Dense(dff, activation="relu")
        self.d2 = tf.keras.layers.Dense(d_model)
        self.do = tf.keras.layers.Dropout(drop)
        self.ln = tf.keras.layers.LayerNormalization(dtype='float32')
    def call(self, x):
        y = self.d1(x); y = self.d2(y); y = tf.cast(y, x.dtype); y = self.do(y)
        y = tf.keras.layers.Add()([x, y])
        return self.ln(y)

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, heads, dff, drop=0.1):
        super().__init__(); self.sa = GlobalSelfAttention(heads, d_model, drop); self.ff = FeedForward(d_model, dff, drop)
    def call(self, x): x = self.sa(x); x = self.ff(x); return x

class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, heads, dff, drop=0.1):
        super().__init__(); self.sa = CausalSelfAttention(heads, d_model, drop); self.ca = CrossAttention(heads, d_model, drop); self.ff = FeedForward(d_model, dff, drop)
    def call(self, x, ctx): x = self.sa(x); x = self.ca(x, ctx); x = self.ff(x); return x

class Encoder(tf.keras.layers.Layer):
    def __init__(self, L, d_model, heads, dff, vocab, drop=0.1, max_pos=2048):
        super().__init__(); self.emb = tf.keras.layers.Embedding(vocab, d_model, mask_zero=True); self.pe  = positional_encoding(max_pos, d_model); self.layers = [EncoderLayer(d_model, heads, dff, drop) for _ in range(L)]
    def call(self, x):
        x = self.emb(x)
        pe = tf.cast(self.pe[None, :tf.shape(x)[1], :], x.dtype); x = x + pe
        for layer in self.layers: x = layer(x)
        return x

class Decoder(tf.keras.layers.Layer):
    def __init__(self, L, d_model, heads, dff, vocab, drop=0.1, max_pos=2048):
        super().__init__(); self.emb = tf.keras.layers.Embedding(vocab, d_model, mask_zero=True); self.pe  = positional_encoding(max_pos, d_model); self.layers = [DecoderLayer(d_model, heads, dff, drop) for _ in range(L)]; self.out = tf.keras.layers.Dense(vocab, dtype='float32')
    def call(self, x, ctx):
        x = self.emb(x); pe = tf.cast(self.pe[None, :tf.shape(x)[1], :], x.dtype); x = x + pe
        for layer in self.layers: x = layer(x, ctx)
        try: del x._keras_mask
        except Exception: pass
        return self.out(x)

class Transformer(tf.keras.Model):
    def __init__(self, L, d_model, heads, dff, src_vocab, tgt_vocab, drop=0.1):
        super().__init__(); self.enc = Encoder(L, d_model, heads, dff, src_vocab, drop); self.dec = Decoder(L, d_model, heads, dff, tgt_vocab, drop)
    def call(self, inputs):
        pt, en = inputs; ctx = self.enc(pt); return self.dec(en, ctx)

def masked_loss(y_true, y_pred):
    scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = scce(y_true, y_pred); mask = tf.cast(tf.not_equal(y_true, 0), loss.dtype); loss = loss * mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def masked_accuracy(y_true, y_pred):
    pred = tf.argmax(y_pred, axis=-1, output_type=y_true.dtype); mask = tf.cast(tf.not_equal(y_true, 0), y_true.dtype)
    match = tf.cast(tf.equal(y_true, pred), y_true.dtype) * mask; return tf.reduce_sum(match) / tf.reduce_sum(mask)

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=1000): super().__init__(); self.d_model=tf.cast(d_model, tf.float32); self.warmup=warmup_steps
    def __call__(self, step): step=tf.cast(step, tf.float32); return tf.math.rsqrt(self.d_model)*tf.math.minimum(tf.math.rsqrt(step), step*(self.warmup**-1.5))

def build_model():
    L, d_model, dff, heads, drop = 1, 64, 128, 4, 0.1
    src_vocab = int(tokenizers.pt.get_vocab_size().numpy())
    tgt_vocab = int(tokenizers.en.get_vocab_size().numpy())
    model = Transformer(L, d_model, heads, dff, src_vocab, tgt_vocab, drop)
    lr  = CustomSchedule(d_model)
    opt = tf.keras.optimizers.Adam(lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
    model.compile(optimizer=opt, loss=masked_loss, metrics=[masked_accuracy])
    return model

if not _has("transformer"):
    # mixed precision apenas se GPU presente
    try:
        from tensorflow.keras import mixed_precision as mp
        mp.set_global_policy("mixed_float16" if tf.config.list_physical_devices('GPU') else "float32")
    except Exception:
        pass
    transformer = build_model()

# --------------------------
# 4) Treino curto (gera 'hist') + gráficos/artefatos
# --------------------------
class EpochTimer(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs=None): self.times=[]
    def on_epoch_begin(self, epoch, logs=None): self.t0=time.perf_counter()
    def on_epoch_end(self, epoch, logs=None):
        dt=time.perf_counter()-self.t0; self.times.append(dt)
        print(f"[tempo] época {epoch+1}: {dt:.2f}s | loss={logs.get('loss'):.4f} | val_loss={logs.get('val_loss'):.4f}")

if not _has("hist"):
    _ = transformer.fit(train_batches.take(2), epochs=1, steps_per_epoch=2, validation_data=val_batches.take(1), validation_steps=1, verbose=0)
    timer = EpochTimer()
    hist = transformer.fit(train_batches, epochs=EPOCHS, steps_per_epoch=STEPS, validation_data=val_batches, validation_steps=VAL_STEPS, callbacks=[timer])

# 4.1 Loss por época
def plot_loss(history, outpath=ASSETS / "loss_por_epoca.png"):
    h = history.history; ep = list(range(1, len(h.get("loss", []))+1))
    plt.figure(); plt.plot(ep, h["loss"], label="train_loss")
    if "val_loss" in h: plt.plot(ep, h["val_loss"], label="val_loss")
    plt.xlabel("época"); plt.ylabel("loss"); plt.title("Loss por época"); plt.legend(); plt.tight_layout()
    plt.savefig(outpath, dpi=150); plt.close(); return str(outpath)

loss_path = plot_loss(hist)
print("OK:", loss_path)

# 4.2 Throughput forward
def forward_throughput(batches, model, steps=10, out_json=ASSETS/"throughput_forward.json", out_png=ASSETS/"throughput_tokens_por_epoca.png"):
    (pt_tok, en_in), y = next(iter(batches.take(1))); _ = model((pt_tok, en_in))
    total_tokens, ran = 0, 0; t0 = time.perf_counter()
    it = iter(batches)
    for i in range(steps):
        try: (pt_tok, en_in), y = next(it)
        except StopIteration: break
        _ = model((pt_tok, en_in)); total_tokens += int(y.shape[0]*y.shape[1]); ran += 1
    dt = time.perf_counter()-t0; tps = (total_tokens/dt) if dt>0 else None
    stats = {"steps": ran, "total_tokens": total_tokens, "seconds": dt, "tokens_per_sec": tps}
    with open(out_json, "w", encoding="utf-8") as f: json.dump(stats, f, ensure_ascii=False, indent=2)
    plt.figure(); plt.bar(["forward"], [tps or 0.0]); plt.ylabel("tokens/s (forward)"); plt.title("Throughput (amostra curta)"); plt.tight_layout(); plt.savefig(out_png, dpi=150); plt.close()
    return stats, str(out_png)

fwd_stats, fwd_png = forward_throughput(train_batches, transformer)
print("OK:", fwd_png, "|", int(fwd_stats["tokens_per_sec"] or 0), "tok/s")

# 4.3 Histograma de comprimentos
def lengths_histogram(ds, tokenizers, n=512, cap=MAX_TOKENS, out_png=ASSETS/"comprimentos_tokens_hist.png"):
    pt_lens, en_lens, count = [], [], 0
    for pt, en in ds.take(n):
        pt_tok = tokenizers.pt.tokenize(pt[None, ...]).to_tensor()
        en_tok = tokenizers.en.tokenize(en[None, ...]).to_tensor()
        pt_len = int(tf.shape(pt_tok)[1].numpy()); en_len = int(tf.shape(en_tok)[1].numpy())
        if cap: pt_len=min(pt_len,cap); en_len=min(en_len,cap)
        pt_lens.append(pt_len); en_lens.append(en_len); count += 1
    plt.figure(); plt.hist(pt_lens, bins=30, alpha=0.7, label="PT"); plt.hist(en_lens, bins=30, alpha=0.7, label="EN")
    if cap: plt.axvline(cap, linestyle="--", alpha=0.7, label=f"MAX_TOKENS={cap}")
    plt.xlabel("comprimento em tokens"); plt.ylabel("freq."); plt.title(f"Distribuição de comprimentos (N={count})"); plt.legend(); plt.tight_layout(); plt.savefig(out_png, dpi=150); plt.close()
    return str(out_png)

len_png = lengths_histogram(train_ds, tokenizers)
print("OK:", len_png)

# 4.4 Atenção do modelo
def attention_map(model, tokenizers, out_png=ASSETS/"atencao_modelo.png"):
    pt = tf.constant(["este é o primeiro livro que eu fiz."])
    pt_tok = tokenizers.pt.tokenize(pt)[:, :MAX_TOKENS].to_tensor()
    start_id = int(tokenizers.en.tokenize(tf.constant(["hello"]))[0][0].numpy())
    en_in = tf.constant([[start_id]], dtype=tf.int64)
    ctx = model.enc(pt_tok); x = model.dec.emb(en_in); pe = positional_encoding(2048, x.shape[-1]); x = x + tf.cast(pe[None,:tf.shape(x)[1],:], x.dtype)
    att_scores = None
    for dl in model.dec.layers:
        x = dl.sa(x); y, att = dl.ca.mha(query=x, value=ctx, key=ctx, return_attention_scores=True)
        y = tf.cast(y, x.dtype); x = dl.ca.add([x,y]); x = dl.ca.norm(x); x = dl.ff(x); att_scores = att
    A = tf.reduce_mean(att_scores, axis=1)[0] if att_scores.shape.rank==4 and att_scores.shape[1] else tf.reduce_mean(att_scores, axis=-1)[0]
    plt.figure(figsize=(6,5)); plt.imshow(A.numpy(), aspect="auto", interpolation="nearest"); plt.colorbar(); plt.xlabel("PT (fonte)"); plt.ylabel("EN_in (consulta)"); plt.title("Mapa de atenção (modelo)"); plt.tight_layout(); plt.savefig(out_png, dpi=150); plt.close()
    return str(out_png)

att_png = attention_map(transformer, tokenizers)
print("OK:", att_png)

# 4.5 Amostras qualitativas
def translate_pt2en(model, tokenizers, sentence_pt: str, max_len=MAX_TOKENS):
    pt = tf.constant([sentence_pt]); pt_tokens = tokenizers.pt.tokenize(pt).to_tensor()
    probe = tokenizers.en.tokenize(tf.constant(["hello"])); START_ID=int(probe[0][0].numpy()); END_ID=int(probe[0][-1].numpy())
    en_tokens = tf.constant([[START_ID]], dtype=tf.int64)
    for _ in range(max_len):
        logits = model((pt_tokens, en_tokens)); next_id = tf.argmax(logits[:, -1, :], axis=-1, output_type=en_tokens.dtype)
        en_tokens = tf.concat([en_tokens, next_id[:, None]], axis=1)
        if int(next_id[0].numpy()) == END_ID: break
    return tokenizers.en.detokenize(en_tokens)[0].numpy().decode("utf-8")

samples_pt = [
    "este é o primeiro livro que eu fiz.",
    "gostaria de um copo de água, por favor.",
    "o sistema funcionou melhor do que esperávamos.",
    "vou então muito rapidamente partilhar convosco algumas histórias.",
    "os meus vizinhos ouviram sobre esta ideia."
]
rows = [{"pt": s, "pred_en": translate_pt2en(transformer, tokenizers, s)} for s in samples_pt]
csv_path = ASSETS / "amostras_traducoes.csv"
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["pt","pred_en"]); w.writeheader(); w.writerows(rows)
print("OK:", csv_path)

# 4.6 Métricas e micro-benchmark backprop
def count_params(model): return int(np.sum([np.prod(v.shape) for v in model.trainable_variables]))
metrics = {
    "trainable_params": count_params(transformer),
    "d_model": int(transformer.dec.emb.output_dim),
    "num_layers": len(transformer.enc.layers),
    "num_heads": int(transformer.dec.layers[0].sa.mha.num_heads) if transformer.dec.layers else None,
    "batch_size": BATCH_SIZE,
    "max_tokens": MAX_TOKENS,
    "loss_final": float(hist.history["loss"][-1]),
    "val_loss_final": float(hist.history.get("val_loss", [np.nan])[-1]),
    "throughput_forward_tokens_per_sec": float(fwd_stats["tokens_per_sec"]) if fwd_stats else None,
    "tf_version": tf.__version__,
}
with open(ASSETS / "metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, ensure_ascii=False, indent=2)
print("OK:", ASSETS / "metrics.json")

@tf.function(reduce_retracing=True)
def one_step(model, inputs, y_true):
    with tf.GradientTape() as tape:
        y_pred = model(inputs, training=True); loss = masked_loss(y_true, y_pred)
    grads = tape.gradient(loss, model.trainable_variables); model.optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

(pt_tok, en_in), y_true = next(iter(train_batches.take(1))); _ = one_step(transformer, (pt_tok, en_in), y_true)
t0 = time.perf_counter(); loss_val = float(one_step(transformer, (pt_tok, en_in), y_true).numpy()); dt = time.perf_counter()-t0
bp = {"train_one_step_seconds": dt, "batch_tokens": int(y_true.shape[0]*y_true.shape[1]), "tokens_per_sec_backprop": (y_true.shape[0]*y_true.shape[1])/dt}
with open(ASSETS / "backprop_microbench.json", "w", encoding="utf-8") as f:
    json.dump(bp, f, ensure_ascii=False, indent=2)
print("OK:", ASSETS / "backprop_microbench.json")

# 4.7 Resultados CPU/GPU (se já existirem, mantém)
def maybe_write(path, data):
    if not path.exists():
        with open(path, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=2)

results_quick = {
    "epochs": int(EPOCHS), "steps_per_epoch": int(STEPS), "val_steps": int(VAL_STEPS),
    "time_per_epoch_sec_mean": None, "time_total_sec": None,
    "loss_final": metrics["loss_final"], "val_loss_final": metrics["val_loss_final"],
    "sacrebleu_val_n": None, "sacrebleu_val_score": None,
    "batch_size": BATCH_SIZE, "max_tokens": MAX_TOKENS, "model_params": metrics["trainable_params"]
}
maybe_write(ASSETS / "results_cpu.json", results_quick)
maybe_write(ASSETS / "results_gpu.json", results_quick)

# 4.8 Tabela consolidada
table_path = ASSETS / "cpu_gpu_table.txt"
lines = [
    "| Hardware | Épocas | Steps/época | Tempo/época (média) | Tempo total | Loss final | SacreBLEU (n) |",
    "|---------:|:------:|:-----------:|:--------------------:|:-----------:|-----------:|:-------------:|",
]
def _load(p):
    try: return json.load(open(p,"r"))
    except Exception: return None
gpu_r = _load(ASSETS/"results_gpu.json"); cpu_r = _load(ASSETS/"results_cpu.json")
if gpu_r: lines.append(f"| GPU      | {gpu_r.get('epochs','—')} | {gpu_r.get('steps_per_epoch','—')} | {('%.2fs'%gpu_r['time_per_epoch_sec_mean']) if gpu_r.get('time_per_epoch_sec_mean') else '—'} | {('%.2fs'%gpu_r['time_total_sec']) if gpu_r.get('time_total_sec') else '—'} | {gpu_r.get('loss_final','—')} | {(gpu_r.get('sacrebleu_val_score','—'))} ({gpu_r.get('sacrebleu_val_n','—')}) |")
if cpu_r: lines.append(f"| CPU      | {cpu_r.get('epochs','—')} | {cpu_r.get('steps_per_epoch','—')} | {('%.2fs'%cpu_r['time_per_epoch_sec_mean']) if cpu_r.get('time_per_epoch_sec_mean') else '—'} | {('%.2fs'%cpu_r['time_total_sec']) if cpu_r.get('time_total_sec') else '—'} | {cpu_r.get('loss_final','—')} | {(cpu_r.get('sacrebleu_val_score','—'))} ({cpu_r.get('sacrebleu_val_n','—')}) |")
with open(table_path, "w", encoding="utf-8") as f: f.write("\n".join(lines) + "\n")
print("OK:", table_path)

# 4.9 Snippet de README
snippet = [
    "- assets/loss_por_epoca.png — curvas de treinamento (loss/época)",
    "- assets/throughput_tokens_por_epoca.png — throughput (tokens/s) estimado (forward)",
    "- assets/comprimentos_tokens_hist.png — distribuição de comprimentos de tokens (PT/EN)",
    "- assets/atencao_modelo.png — mapa de atenção (modelo)",
    "- assets/amostras_traducoes.csv — amostras qualitativas PT → EN",
    "- assets/metrics.json — hiperparâmetros e métricas resumidas",
    "- assets/throughput_forward.json — detalhes do throughput forward",
    "- assets/backprop_microbench.json — tempo de um passo de backprop e tokens/s",
]
with open(ASSETS / "readme_snippet.txt", "w", encoding="utf-8") as f: f.write("\n".join(snippet) + "\n")
print("OK:", ASSETS / "readme_snippet.txt")

# --------------------------
# 5) Empacotar tudo em assets_bundle.zip (mantém a pasta 'assets' no ZIP)
# --------------------------
EXPECTED = [
    "loss_por_epoca.png",
    "throughput_tokens_por_epoca.png",
    "comprimentos_tokens_hist.png",
    "atencao_modelo.png",
    "amostras_traducoes.csv",
    "results_gpu.json",
    "results_cpu.json",
    "cpu_gpu_table.txt",
    "metrics.json",
    "throughput_forward.json",
    "backprop_microbench.json",
    "readme_snippet.txt",
]
missing, present = [], []
for fname in EXPECTED:
    p = ASSETS / fname
    if p.exists(): present.append(fname)
    else: missing.append(fname)

zip_name = "assets_bundle.zip"
with zipfile.ZipFile(zip_name, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for path in ASSETS.rglob("*"):
        if path.is_file(): z.write(path, arcname=str(path.relative_to(ROOT)))
    if (ROOT / "README.md").exists():
        z.write(ROOT / "README.md", arcname="README.md")

print("Resumo final")
print("- Gerados:", sorted(present))
print("- Faltantes:", missing if missing else "nenhum")
print("- ZIP:", (ROOT / zip_name))

try:
    from google.colab import files  # type: ignore
    files.download(zip_name)
except Exception:
    pass


Diagnóstico inicial
- assets dir: /content/assets
- GPU visível: True
Variáveis na sessão:
  - tokenizers: OK
  - train_ds: OK
  - val_ds: OK
  - train_batches: OK
  - transformer: ausente
  - hist: ausente
[1m36/40[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m1s[0m 250ms/step - loss: 8.8378 - masked_accuracy: 0.0025[tempo] época 1: 9.99s | loss=8.7784 | val_loss=8.6128
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 236ms/step - loss: 8.8311 - masked_accuracy: 0.0037 - val_loss: 8.6128 - val_masked_accuracy: 0.0447
OK: /content/assets/loss_por_epoca.png
OK: /content/assets/throughput_tokens_por_epoca.png | 7260 tok/s
OK: /content/assets/comprimentos_tokens_hist.png
OK: /content/assets/atencao_modelo.png
OK: /content/assets/amostras_traducoes.csv
OK: /content/assets/metrics.json
OK: /content/assets/backprop_microbench.json
OK: /content/assets/cpu_gpu_table.txt
OK: /content/assets/readme_snippet.txt
Resumo final
- Gerados: ['amostras_traducoes.csv', 'atencao_modelo.

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>