In [None]:
# CÉLULA 1 — Setup de pacotes + Mixed Precision (rápido e estável)
import sys, subprocess, importlib

def pip_install(pkg: str):
    """Quietly install ``pkg`` with pip, using the interpreter running this kernel.

    Args:
        pkg: requirement string handed to ``pip install`` (a name, optionally
            with a version specifier).

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install", "-q", pkg]
    subprocess.check_call(command)

# Utility packages: install only the ones that cannot be imported yet.
for pkg in ["tensorflow-datasets", "sacrebleu"]:
    module_name = pkg.replace("-", "_")  # pip name -> importable module name
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Only a missing module triggers an install. Any other import-time
        # failure (e.g. a broken installation) would not be repaired by
        # `pip install` and is allowed to surface here instead of later.
        pip_install(pkg)

import tensorflow as tf

# tensorflow-text must match the installed TensorFlow version.
try:
    import tensorflow_text as text  # noqa
except Exception:
    # NOTE(review): the broad `except` looks deliberate — presumably a
    # tensorflow-text build that does not match TF can fail with errors other
    # than ImportError; confirm before narrowing it.
    # Install the release pinned to the running TF version, then retry.
    pip_install(f"tensorflow-text=={tf.__version__}")
    import tensorflow_text as text  # noqa

# Mixed precision: speeds things up on GPUs with Tensor Cores (T4/A100).
# float16 compute brings no speed-up on a CPU-only runtime, so the policy is
# chosen from what is actually visible. Setting it explicitly in both branches
# keeps the cell idempotent when it is re-run after a runtime change.
from tensorflow.keras import mixed_precision as mp
gpus = tf.config.list_physical_devices('GPU')
mp.set_global_policy("mixed_float16" if gpus else "float32")
print("TF:", tf.__version__)
print("Mixed precision policy:", mp.global_policy())
print("GPUs visíveis:", gpus)


TF: 2.19.0
Mixed precision policy: <DTypePolicy "mixed_float16">
GPUs visíveis: []


In [None]:
# CÉLULA 2 — Imports base, seeds e TFDS (TED HRLR pt→en)
import time, numpy as np, tensorflow_datasets as tfds
# Seed the NumPy and TensorFlow random generators.
np.random.seed(42)
tf.random.set_seed(42)

# Load the TED pt->en corpus as supervised (pt, en) pairs plus its metadata.
examples, info = tfds.load(
    "ted_hrlr_translate/pt_to_en",
    with_info=True,
    as_supervised=True,
)
train_ds = examples["train"]
val_ds = examples["validation"]
test_ds = examples["test"]
print(info)


tfds.core.DatasetInfo(
    name='ted_hrlr_translate',
    full_name='ted_hrlr_translate/pt_to_en/1.0.0',
    description="""
    Data sets derived from TED talk transcripts for comparing similar language pairs
    where one is high resource and the other is low resource.
    """,
    config_description="""
    Translation dataset from pt to en in plain text.
    """,
    homepage='https://github.com/neulab/word-embeddings-for-nmt',
    data_dir='/root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0',
    file_format=tfrecord,
    download_size=124.94 MiB,
    dataset_size=10.89 MiB,
    features=Translation({
        'en': Text(shape=(), dtype=string),
        'pt': Text(shape=(), dtype=string),
    }),
    supervised_keys=('pt', 'en'),
    disable_shuffling=False,
    nondeterministic_order=False,
    splits={
        'test': <SplitInfo num_examples=1803, num_shards=1>,
        'train': <SplitInfo num_examples=51785, num_shards=1>,
        'validation': <SplitInfo num_examples=1

In [None]:
# CÉLULA 3 — (opcional) Amostra de dados bruta
# Peek at one batch of three raw sentence pairs.
for pt, en in train_ds.batch(3).take(1):
    pt_sentences = [raw.decode() for raw in pt.numpy()]
    print("PT:", pt_sentences)
    en_sentences = [raw.decode() for raw in en.numpy()]
    print("EN:", en_sentences)


PT: ['e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .', 'mas e se estes fatores fossem ativos ?', 'mas eles não tinham a curiosidade de me testar .']
EN: ['and when you improve searchability , you actually take away the one advantage of print , which is serendipity .', 'but what if it were active ?', "but they did n't test for curiosity ."]


In [None]:
# CÉLULA 2A — Sanity check de GPU (rode antes do treino)
import os, tensorflow as tf, time, numpy as np
print("TF:", tf.__version__)
print("Logical GPUs:", tf.config.list_logical_devices('GPU'))
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES", "<não definido>"))

# Micro matmul benchmark to gauge compute time.
x = tf.random.normal([4096, 4096])

# Untimed warm-up: the first call pays one-off initialisation costs that would
# otherwise be attributed to the matmul itself.
_ = tf.linalg.matmul(x, x)[0, 0].numpy()

t0 = time.perf_counter()
_ = tf.linalg.matmul(x, x)  # should run on the GPU when one is available
# Reading a single element back forces the computation to finish before the
# clock stops; without it, work still in flight on the device is not measured.
_ = _[0, 0].numpy()
dt = time.perf_counter() - t0
print(f"Teste matmul 4096x4096: {dt:.3f}s (GPU deve ser << CPU)")


TF: 2.19.0
Logical GPUs: []
CUDA_VISIBLE_DEVICES: <não definido>
Teste matmul 4096x4096: 3.595s (GPU deve ser << CPU)


In [None]:
# CÉLULA 4 — Tokenizers TED (carregamento robusto do SavedModel)
import os, zipfile, pathlib, shutil

model_name = "ted_hrlr_translate_pt_en_converter"

# Fetch the zipped SavedModel into the working directory without extracting it.
archive_name = f"{model_name}.zip"
archive_url = f"https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip"
zip_path = tf.keras.utils.get_file(
    archive_name,
    archive_url,
    cache_dir='.',
    cache_subdir='',
    extract=False,
)

# Extract into a folder named after the archive (suffix removed), starting
# from a clean directory each time.
extract_dir = pathlib.Path(zip_path).with_suffix('')
if extract_dir.exists():
    shutil.rmtree(extract_dir)

with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(path=extract_dir)

# The SavedModel may sit at the extraction root or one level down; take the
# first location that contains saved_model.pb.
candidates = [extract_dir, extract_dir / model_name]
load_dir = None
for candidate in candidates:
    if (candidate / "saved_model.pb").exists():
        load_dir = candidate
        break
if load_dir is None:
    raise FileNotFoundError(f"SavedModel não encontrado em {candidates}")

print("Carregando tokenizers de:", load_dir.resolve())
tokenizers = tf.saved_model.load(str(load_dir))


Carregando tokenizers de: /content/ted_hrlr_translate_pt_en_converter/ted_hrlr_translate_pt_en_converter


In [None]:
# CÉLULA 5 — Pipeline ultra-rápido (≈10 min)
BUFFER_SIZE = 10000      # shuffle buffer size used by make_batches
BATCH_SIZE  = 32         # smaller batch lowers the cost per step
MAX_TOKENS  = 32         # short sequences speed training up considerably

def prepare_batch(pt, en):
    """Turn a batch of (pt, en) sentences into model inputs and labels.

    Args:
        pt: batch of Portuguese sentences.
        en: batch of English sentences.

    Returns:
        ``((pt_tokens, en_inputs), en_labels)`` where ``en_inputs`` and
        ``en_labels`` are the same token sequence shifted by one position.
    """
    # Source side: tokenize, clip to MAX_TOKENS, convert to a dense tensor.
    source = tokenizers.pt.tokenize(pt)
    source = source[:, :MAX_TOKENS].to_tensor()

    # Target side: keep one extra token so that, after the shift below, both
    # halves hold at most MAX_TOKENS tokens.
    target = tokenizers.en.tokenize(en)
    target = target[:, :MAX_TOKENS + 1]

    decoder_input = target[:, :-1].to_tensor()  # everything but the last token
    labels = target[:, 1:].to_tensor()          # everything but the first token
    return (source, decoder_input), labels

def make_batches(ds, shuffle=True):
    """Build the batched, tokenized input pipeline for ``ds``.

    Args:
        ds: dataset of raw (pt, en) sentence pairs.
        shuffle: whether to shuffle examples before batching. Keep the default
            for training; pass ``False`` for evaluation so that every run
            scores the same batches and the numbers stay comparable.

    Returns:
        A dataset yielding ``((pt_tokens, en_inputs), en_labels)`` batches.
    """
    if shuffle:
        ds = ds.shuffle(BUFFER_SIZE)
    return (ds
            # drop_remainder keeps the batch dimension fixed (fewer retraces)
            .batch(BATCH_SIZE, drop_remainder=True)
            .map(prepare_batch, num_parallel_calls=tf.data.AUTOTUNE)
            .prefetch(tf.data.AUTOTUNE))

train_batches = make_batches(train_ds)
# Validation is not shuffled: with only a few validation steps per epoch, a
# reshuffled set would score a different random sample each time.
val_batches   = make_batches(val_ds, shuffle=False)


In [None]:
# CÉLULA 6 — Positional Encoding (sen/cos)
def positional_encoding(length, depth):
    """Build a sinusoidal positional-encoding table.

    The first half of the feature axis holds sines and the second half holds
    cosines of ``position * rate``, with rates decaying geometrically from 1
    towards 1/10000.

    Args:
        length: number of positions (rows).
        depth: number of features per position (columns). Odd values are
            supported: the table is built one column wider and the last
            cosine column is dropped, so the result always has exactly
            ``depth`` columns.

    Returns:
        A float32 constant tensor of shape ``(length, depth)``.
    """
    import numpy as np
    depth = int(depth)
    # Number of sine (and cosine) columns; equals depth/2 for even depth, so
    # the values are unchanged in that case.
    half = (depth + 1) // 2
    positions = np.arange(length)[:, None]
    depths = np.arange(half)[None, :] / half
    angle_rates = 1 / (10000 ** depths)
    angle_rads = positions * angle_rates
    pe = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    # Trim the surplus column that appears when depth is odd.
    pe = pe[:, :depth].astype("float32")
    return tf.constant(pe)  # tf.float32


In [None]:
# CÉLULA 7 — Blocos de atenção/FFN (com dtypes consistentes em MP)
class BaseAttention(tf.keras.layers.Layer):
    """Shared parts of an attention sublayer: attention, residual add, layer norm.

    This class only creates the sublayers; subclasses provide ``call``.

    Args:
        heads: number of attention heads.
        d_model: value passed to ``MultiHeadAttention`` as ``key_dim``.
        drop: dropout rate passed to the attention layer.
    """
    def __init__(self, heads, d_model, drop=0.1):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=heads,
            key_dim=d_model,
            dropout=drop,
        )
        self.add = tf.keras.layers.Add()
        # Layer norm is kept in float32 for numerical stability.
        self.norm = tf.keras.layers.LayerNormalization(dtype='float32')

class GlobalSelfAttention(BaseAttention):
    """Self-attention sublayer without a causal mask."""
    def call(self, x):
        attended = self.mha(query=x, key=x, value=x)
        # Bring the attention output to the residual's dtype before adding.
        attended = tf.cast(attended, x.dtype)
        summed = self.add([x, attended])
        return self.norm(summed)

class CausalSelfAttention(BaseAttention):
    """Self-attention sublayer that applies a causal mask."""
    def call(self, x):
        attended = self.mha(query=x, key=x, value=x, use_causal_mask=True)
        # Bring the attention output to the residual's dtype before adding.
        attended = tf.cast(attended, x.dtype)
        summed = self.add([x, attended])
        return self.norm(summed)

class CrossAttention(BaseAttention):
    """Attention sublayer where ``x`` queries the context sequence ``ctx``."""
    def call(self, x, ctx):
        # Attention scores were requested and immediately discarded; only the
        # attended output is needed, so the scores are no longer asked for.
        y = self.mha(query=x, value=ctx, key=ctx)
        y = tf.cast(y, x.dtype)  # same dtype as the residual before Add
        x = self.add([x, y])
        return self.norm(x)

class FeedForward(tf.keras.layers.Layer):
    """Position-wise feed-forward sublayer with residual add and layer norm.

    Args:
        d_model: output width of the second dense layer.
        dff: width of the hidden (ReLU) dense layer.
        drop: dropout rate applied to the feed-forward output.
    """
    def __init__(self, d_model, dff, drop=0.1):
        super().__init__()
        self.d1 = tf.keras.layers.Dense(dff, activation="relu")
        self.d2 = tf.keras.layers.Dense(d_model)
        self.do = tf.keras.layers.Dropout(drop)
        # Created once here; previously a fresh Add layer was built on every
        # call and `self.add` was overwritten with the resulting tensor.
        self.add = tf.keras.layers.Add()
        self.ln = tf.keras.layers.LayerNormalization(dtype='float32')

    def call(self, x):
        y = self.d1(x)
        y = self.d2(y)
        y = tf.cast(y, x.dtype)  # same dtype as the residual before Add
        y = self.do(y)
        y = self.add([x, y])
        return self.ln(y)


In [None]:
# CÉLULA 8 — Encoder, Decoder e Transformer (cast da PE + logits em fp32)
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: global self-attention followed by feed-forward.

    Args:
        d_model: model width.
        heads: number of attention heads.
        dff: hidden width of the feed-forward sublayer.
        drop: dropout rate passed to both sublayers.
    """
    def __init__(self, d_model, heads, dff, drop=0.1):
        super().__init__()
        self.sa = GlobalSelfAttention(heads, d_model, drop)
        self.ff = FeedForward(d_model, dff, drop)

    def call(self, x):
        attended = self.sa(x)
        return self.ff(attended)

class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, cross-attention, feed-forward.

    Args:
        d_model: model width.
        heads: number of attention heads.
        dff: hidden width of the feed-forward sublayer.
        drop: dropout rate passed to all sublayers.
    """
    def __init__(self, d_model, heads, dff, drop=0.1):
        super().__init__()
        self.sa = CausalSelfAttention(heads, d_model, drop)
        self.ca = CrossAttention(heads, d_model, drop)
        self.ff = FeedForward(d_model, dff, drop)

    def call(self, x, ctx):
        self_attended = self.sa(x)
        cross_attended = self.ca(self_attended, ctx)
        return self.ff(cross_attended)

class Encoder(tf.keras.layers.Layer):
    """Token embedding + positional encoding followed by ``L`` encoder layers.

    Args:
        L: number of encoder layers.
        d_model: embedding / model width.
        heads: number of attention heads per layer.
        dff: hidden width of each feed-forward sublayer.
        vocab: vocabulary size of the embedding.
        drop: dropout rate passed to every layer.
        max_pos: number of positions in the precomputed encoding table.
    """
    def __init__(self, L, d_model, heads, dff, vocab, drop=0.1, max_pos=2048):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(vocab, d_model, mask_zero=True)
        # The positional table is float32.
        self.pe  = positional_encoding(max_pos, d_model)
        self.layers = [EncoderLayer(d_model, heads, dff, drop) for _ in range(L)]

    def call(self, x):
        embedded = self.emb(x)  # likely float16 under mixed precision
        seq_len = tf.shape(embedded)[1]
        # Slice the table to the sequence length and cast it to the
        # embedding's dtype so the sum does not mix dtypes.
        positional = tf.cast(self.pe[None, :seq_len, :], embedded.dtype)
        hidden = embedded + positional
        # NOTE(review): the sum above is a plain tensor op; verify that the
        # Embedding's padding mask still reaches the attention layers.
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden)
        return hidden

class Decoder(tf.keras.layers.Layer):
    """Token embedding + positional encoding, ``L`` decoder layers, and a
    final projection to vocabulary logits.

    Args:
        L: number of decoder layers.
        d_model: embedding / model width.
        heads: number of attention heads per layer.
        dff: hidden width of each feed-forward sublayer.
        vocab: vocabulary size (embedding input size and logits width).
        drop: dropout rate passed to every layer.
        max_pos: number of positions in the precomputed encoding table.
    """
    def __init__(self, L, d_model, heads, dff, vocab, drop=0.1, max_pos=2048):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(vocab, d_model, mask_zero=True)
        self.pe  = positional_encoding(max_pos, d_model)
        self.layers = [DecoderLayer(d_model, heads, dff, drop) for _ in range(L)]
        self.out = tf.keras.layers.Dense(vocab, dtype='float32')  # logits in fp32

    def call(self, x, ctx):
        x = self.emb(x)
        # Slice the table to the sequence length and match the embedding dtype.
        pe = tf.cast(self.pe[None, :tf.shape(x)[1], :], x.dtype)
        x = x + pe
        for layer in self.layers:
            x = layer(x, ctx)
        try:
            # Drop any mask still attached to the tensor so that it is not
            # applied to the logits.
            del x._keras_mask
        except AttributeError:
            # No mask attached: nothing to remove. Other errors are not
            # expected here and are no longer silently swallowed.
            pass
        return self.out(x)

class Transformer(tf.keras.Model):
    """Encoder–decoder model mapping (source tokens, target tokens) to logits.

    Args:
        L: number of layers in both the encoder and the decoder.
        d_model: model width.
        heads: number of attention heads.
        dff: hidden width of the feed-forward sublayers.
        src_vocab: source vocabulary size (encoder).
        tgt_vocab: target vocabulary size (decoder).
        drop: dropout rate.
    """
    def __init__(self, L, d_model, heads, dff, src_vocab, tgt_vocab, drop=0.1):
        super().__init__()
        self.enc = Encoder(L, d_model, heads, dff, src_vocab, drop)
        self.dec = Decoder(L, d_model, heads, dff, tgt_vocab, drop)

    def call(self, inputs):
        source_tokens, target_tokens = inputs
        context = self.enc(source_tokens)
        return self.dec(target_tokens, context)


In [None]:
# CÉLULA 9 — Modelo “míni” e compile
def masked_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Args:
        y_true: target token ids; id 0 is treated as padding and ignored.
        y_pred: logits over the vocabulary for every position.

    Returns:
        Scalar mean loss over positions whose label is not 0, or 0 when every
        position is padding (instead of NaN from a 0/0 division).
    """
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_fn(y_true, y_pred)
    # Build the mask directly in the loss dtype so the product and the final
    # division never mix dtypes, whatever dtype the logits have.
    mask = tf.cast(tf.not_equal(y_true, 0), loss.dtype)
    loss = loss * mask
    return tf.math.divide_no_nan(tf.reduce_sum(loss), tf.reduce_sum(mask))

def masked_accuracy(y_true, y_pred):
    """Token-level accuracy computed over non-padding positions only.

    Args:
        y_true: target token ids; id 0 is treated as padding and ignored.
        y_pred: logits over the vocabulary for every position.

    Returns:
        Scalar float32 fraction of non-padding positions whose arg-max
        prediction equals the label, or 0 when every position is padding.
    """
    pred = tf.argmax(y_pred, axis=-1)
    # Cast labels to the prediction dtype rather than asking argmax for the
    # label dtype: argmax only emits integer types, so float labels would fail.
    labels = tf.cast(y_true, pred.dtype)
    valid = tf.not_equal(labels, 0)
    hits = tf.logical_and(tf.equal(labels, pred), valid)
    hits = tf.cast(hits, tf.float32)
    valid = tf.cast(valid, tf.float32)
    return tf.math.divide_no_nan(tf.reduce_sum(hits), tf.reduce_sum(valid))

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with linear warm-up then inverse-sqrt decay.

    ``lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)``

    Args:
        d_model: model width used to scale the rate.
        warmup_steps: number of steps over which the rate ramps up.
    """
    def __init__(self, d_model, warmup_steps=1000):
        super().__init__()
        self._d_model_arg = d_model  # original value, kept for get_config
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)
        decay = tf.math.rsqrt(step)               # inverse-sqrt decay branch
        ramp = step * (self.warmup ** -1.5)       # linear warm-up branch
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay, ramp)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialised."""
        return {"d_model": self._d_model_arg, "warmup_steps": self.warmup}

# Mini hyper-parameters (a single layer, narrow width).
L, d_model, dff, heads, drop = 1, 64, 128, 4, 0.1   # heads divides d_model (64/4=16)

transformer = Transformer(
    L, d_model, heads, dff,
    tokenizers.pt.get_vocab_size().numpy(),
    tokenizers.en.get_vocab_size().numpy(),
    drop
)

lr  = CustomSchedule(d_model)
opt = tf.keras.optimizers.Adam(lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

transformer.compile(optimizer=opt, loss=masked_loss, metrics=[masked_accuracy])
transformer.build([(None, None), (None, None)])
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
transformer.summary()




None


In [None]:
# CÉLULA 10A — Warm-up (compila grafo e aquece pipeline) — roda UMA vez antes do fit real
# usa 5 batches de treino e 2 de validação, silencioso (verbose=0)
# Tiny slices of both pipelines: 5 training batches and 2 validation batches.
warmup_train = train_batches.take(5)
warmup_val = val_batches.take(2)

# A single silent epoch over those slices.
_ = transformer.fit(
    warmup_train,
    epochs=1,
    steps_per_epoch=5,
    validation_data=warmup_val,
    validation_steps=2,
    verbose=0,
)
print("Warm-up concluído.")


Warm-up concluído.


In [None]:
# CÉLULA 10 — Treino ultra-rápido (≈10 min total, muitas vezes menos)
import time

class EpochTimer(tf.keras.callbacks.Callback):
    """Callback that records and prints the wall-clock duration of each epoch.

    Durations (in seconds) are collected in ``self.times``.
    """
    def on_train_begin(self, logs=None):
        self.times = []

    def on_epoch_begin(self, epoch, logs=None):
        self.t0 = time.perf_counter()

    def on_epoch_end(self, epoch, logs=None):
        dt = time.perf_counter() - self.t0
        self.times.append(dt)
        logs = logs or {}
        print(f"[tempo] época {epoch+1}: {dt/60:.2f} min | "
              f"loss={self._fmt(logs.get('loss'))} | "
              f"val_loss={self._fmt(logs.get('val_loss'))}")

    @staticmethod
    def _fmt(value):
        """Format a metric with 4 decimals, or 'n/a' when it was not logged."""
        # Formatting None with ':.4f' raises TypeError, which used to abort
        # training whenever no validation data was supplied.
        return "n/a" if value is None else f"{value:.4f}"

EPOCHS = 1
STEPS_PER_EPOCH = 5    # only 5 steps
VAL_STEPS       = 2

# Collect the fit settings in one place, then launch training.
fit_options = dict(
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_data=val_batches,
    validation_steps=VAL_STEPS,
    callbacks=[EpochTimer()],
)
hist = transformer.fit(train_batches, **fit_options)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - loss: 8.8748 - masked_accuracy: 0.0000e+00[tempo] época 1: 0.23 min | loss=8.8749 | val_loss=8.8708
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3s/step - loss: 8.8748 - masked_accuracy: 0.0000e+00 - val_loss: 8.8708 - val_masked_accuracy: 0.0000e+00


In [None]:
# CÉLULA 11 — Tradução greedy (rápida)
# Tokenize a throwaway sentence to read the ids the English tokenizer places
# at the first and last positions; they are used below as start/end markers.
probe = tokenizers.en.tokenize(tf.constant(["hello"]))
START_ID = int(probe[0][0].numpy())   # id at the first position of the probe
END_ID   = int(probe[0][-1].numpy())  # id at the last position of the probe
MAX_GEN  = 32   # cap on generated tokens; kept consistent with MAX_TOKENS

def translate_pt2en(sentence_pt: str):
    """Greedily decode an English translation of one Portuguese sentence.

    Starting from START_ID, the model is called repeatedly and the arg-max
    token of the last position is appended, until END_ID is produced or
    MAX_GEN tokens have been generated.

    Args:
        sentence_pt: the Portuguese sentence to translate.

    Returns:
        The detokenized English text as a ``str``.
    """
    source_tokens = tokenizers.pt.tokenize(tf.constant([sentence_pt])).to_tensor()
    generated = tf.constant([[START_ID]], dtype=tf.int64)

    steps_left = MAX_GEN
    while steps_left > 0:
        steps_left -= 1
        logits = transformer((source_tokens, generated))
        last_step_logits = logits[:, -1, :]
        next_id = tf.argmax(last_step_logits, axis=-1, output_type=generated.dtype)
        generated = tf.concat([generated, next_id[:, None]], axis=1)
        if int(next_id[0].numpy()) == END_ID:
            break

    decoded = tokenizers.en.detokenize(generated)
    return decoded[0].numpy().decode("utf-8")

# Translate two sample sentences.
for sample_sentence in (
    "este é o primeiro livro que eu fiz.",
    "gostaria de um copo de água, por favor.",
):
    print(translate_pt2en(sample_sentence))




theme rape sake sake sakeitchitchitchitchitchitchitchitchitchitch estimate psychologist psychologist psychologist equivalent lighting directly directly hardware device device johnnyitchitchitchitch bees
theme hundred sake sake sakeitchitchitchitchitchitchitchitchitchitch estimate psychologist psychologist psychologist equivalent lighting directly directly hardware device device johnnyitch inchesitchitch bees


In [None]:
# CÉLULA 12 — (opcional) SacreBLEU rápido em amostra pequena
import sacrebleu as sbl

def bleu_on(ds, n=32):
    """Compute corpus-level SacreBLEU on the first ``n`` sentence pairs of ``ds``.

    Args:
        ds: dataset of (pt, en) string pairs, either one pair per element or
            batched.
        n: number of sentence pairs to score.

    Returns:
        The SacreBLEU corpus score as a float.
    """
    # unbatch() rejects datasets whose elements are scalars, so it is applied
    # only when the elements actually carry a batch dimension.
    if ds.element_spec[0].shape.rank:
        ds = ds.unbatch()

    hyps, refs = [], []
    for pt, en in ds.take(n):
        pt_s, en_s = pt.numpy().decode("utf-8"), en.numpy().decode("utf-8")
        hyps.append(translate_pt2en(pt_s))
        refs.append(en_s)
    # corpus_bleu expects a list of reference streams; there is exactly one.
    return sbl.corpus_bleu(hyps, [refs]).score

# Descomente para medir:
# bleu_val = bleu_on(val_ds, n=32)
# print("SacreBLEU (val, n=32):", bleu_val)
