In [1]:
import json
import tensorflow as tf
from tokenizers import Tokenizer


class AgroDatasetTF(tf.data.Dataset):
    """Factory for a next-token-prediction dataset built from a JSONL file.

    Calling ``AgroDatasetTF(...)`` does not create an instance of this class:
    ``__new__`` returns the dataset produced by
    ``tf.data.Dataset.from_generator``.
    """

    def __new__(cls, path, tokenizer_path, max_len=256):
        """Read, tokenize and wrap the samples stored at ``path``.

        Args:
            path: UTF-8 text file with one JSON object per line. Each object
                must provide the keys ``"prompt"`` and ``"response"``.
                Whitespace-only lines are ignored.
            tokenizer_path: File handed to ``Tokenizer.from_file``.
            max_len: Number of token ids kept per sample before the
                one-position shift, so ``input_ids`` and ``labels`` each hold
                at most ``max_len - 1`` ids.

        Returns:
            A dataset whose elements are dicts with variable-length int32
            vectors under ``"input_ids"`` and ``"labels"``; ``labels`` is
            ``input_ids`` shifted left by one token.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks ``"prompt"`` or ``"response"``.
        """
        tokenizer = Tokenizer.from_file(tokenizer_path)
        samples = []

        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Blank lines (for example stray newlines at the end of the
                # file) are not valid JSON and would make json.loads raise.
                if not line.strip():
                    continue

                data = json.loads(line)
                prompt = data["prompt"]
                response = data["response"]
                combined = f"{prompt} {response}"

                # Tokenize and truncate
                ids = tokenizer.encode(combined).ids[:max_len]

                if len(ids) >= 2:  # minimal length to create input/label
                    input_ids = ids[:-1]
                    labels = ids[1:]
                    samples.append((input_ids, labels))

        # All samples are tokenized eagerly above; the generator only replays
        # them as TensorFlow tensors.
        def gen():
            for input_ids, labels in samples:
                yield {
                    "input_ids": tf.constant(input_ids, dtype=tf.int32),
                    "labels": tf.constant(labels, dtype=tf.int32),
                }

        return tf.data.Dataset.from_generator(
            gen,
            output_signature={
                "input_ids": tf.TensorSpec(shape=(None,), dtype=tf.int32),
                "labels": tf.TensorSpec(shape=(None,), dtype=tf.int32),
            },
        )

2025-06-05 20:39:10.407184: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-05 20:39:10.415996: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749127150.428770   19110 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749127150.431862   19110 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749127150.439346   19110 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
import tensorflow as tf
from tensorflow.keras import layers


class DecoderBlock(layers.Layer):
    """Causal self-attention followed by a feed-forward network.

    Each of the two sub-layers is wrapped as: sub-layer -> dropout ->
    residual add -> layer normalization.
    """

    def __init__(self, d_model, n_heads, dropout):
        """Create the sub-layers.

        Args:
            d_model: Width of the block's input and output features.
            n_heads: Number of attention heads.
            dropout: Dropout rate applied to each sub-layer output.
        """
        super().__init__()
        # key_dim is the per-head size in Keras, so every head here works at
        # width d_model.
        self.mha = layers.MultiHeadAttention(num_heads=n_heads, key_dim=d_model)
        # Feed-forward network: expand to 4 * d_model with ReLU, project back.
        hidden_units = d_model * 4
        self.ffn = tf.keras.Sequential(
            [
                layers.Dense(hidden_units, activation="relu"),
                layers.Dense(d_model),
            ]
        )
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, x, training=False, mask=None):
        """Run the block on ``x``.

        Args:
            x: Input features; used as both query and value of the attention.
            training: Forwarded to the two dropout layers.
            mask: Optional attention mask passed to the attention layer as
                ``attention_mask``, alongside ``use_causal_mask=True``.

        Returns:
            The output of the second layer normalization.
        """
        # Attention sub-layer.
        attended = self.dropout1(
            self.mha(x, x, attention_mask=mask, use_causal_mask=True),
            training=training,
        )
        normed = self.norm1(x + attended)

        # Feed-forward sub-layer.
        projected = self.dropout2(self.ffn(normed), training=training)
        return self.norm2(normed + projected)


class AgroLensGPT(tf.keras.Model):
    """Decoder-only language model with learned position embeddings.

    Token and position embeddings are summed, passed through ``n_layers``
    ``DecoderBlock`` layers and a final layer normalization, then projected
    to vocabulary logits.
    """

    def __init__(
        self,
        vocab_size,
        max_length=512,
        d_model=256,
        n_heads=4,
        n_layers=4,
        dropout=0.1,
    ):
        """Create the model's layers.

        Args:
            vocab_size: Number of token ids; size of the embedding table and
                of the output projection.
            max_length: Number of positions covered by the position embedding
                and by the precomputed causal mask.
            d_model: Embedding and hidden width.
            n_heads: Attention heads per decoder block.
            n_layers: Number of decoder blocks.
            dropout: Dropout rate used inside every decoder block.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.token_embed = layers.Embedding(vocab_size, d_model)
        self.pos_embed = layers.Embedding(max_length, d_model)
        self.blocks = [DecoderBlock(d_model, n_heads, dropout) for _ in range(n_layers)]
        self.final_norm = layers.LayerNormalization(epsilon=1e-6)
        self.output_head = layers.Dense(vocab_size)

        # Precomputed causal mask (for max_length): lower-triangular ones.
        self.causal_mask = tf.linalg.band_part(tf.ones((max_length, max_length)), -1, 0)

    def call(self, x, training=False):
        """Compute next-token logits.

        Args:
            x: Token ids of shape (B, T). T must not exceed ``max_length``.
            training: Forwarded to every decoder block.

        Returns:
            Logits of shape (B, T, vocab_size).
        """
        T = tf.shape(x)[1]
        token_emb = self.token_embed(x)  # (B, T, d_model)
        pos_indices = tf.range(start=0, limit=T)
        pos_emb = self.pos_embed(pos_indices)[tf.newaxis, :, :]  # (1, T, d_model)

        h = token_emb + pos_emb  # (B, T, d_model)
        mask = self.causal_mask[:T, :T][tf.newaxis, tf.newaxis, :, :]  # (1, 1, T, T)

        for block in self.blocks:
            h = block(h, training=training, mask=mask)

        h = self.final_norm(h)
        return self.output_head(h)  # (B, T, vocab_size)

    def generate(self, tokenizer, prompt, max_new_tokens=50):
        """Greedily extend ``prompt`` and return the decoded text.

        Generation stops after ``max_new_tokens`` tokens, once the ``"</s>"``
        token has been produced, or once the sequence holds ``max_length``
        tokens, whichever happens first.

        Args:
            tokenizer: Object providing ``encode(text).ids``,
                ``token_to_id(token)`` and ``decode(ids)``.
            prompt: Text to continue. If it encodes to more than
                ``max_length`` tokens, only the last ``max_length`` are kept.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded string of the (possibly truncated) prompt followed by
            the generated tokens.

        Raises:
            ValueError: If the prompt encodes to no tokens while
                ``max_new_tokens`` is positive.
        """
        input_ids = tokenizer.encode(prompt).ids

        # With no tokens there is no last position to predict from.
        if not input_ids and max_new_tokens > 0:
            raise ValueError("prompt must encode to at least one token")

        # The position embedding and the precomputed causal mask only cover
        # max_length positions, so a longer prompt cannot be fed to the model.
        # Keep the most recent tokens, which matter most for the continuation.
        if len(input_ids) > self.max_length:
            input_ids = input_ids[-self.max_length:]

        input_tensor = tf.constant([input_ids], dtype=tf.int32)

        # Loop-invariant lookup; None when "</s>" is not in the vocabulary,
        # in which case the comparison below is never true.
        eos_id = tokenizer.token_to_id("</s>")

        for _ in range(max_new_tokens):
            logits = self(input_tensor, training=False)
            # Greedy decoding: pick the most likely token at the last position.
            next_token = tf.argmax(logits[:, -1, :], axis=-1, output_type=tf.int32)
            input_tensor = tf.concat(
                [input_tensor, tf.expand_dims(next_token, axis=1)], axis=1
            )

            if next_token.numpy()[0] == eos_id:
                break
            if input_tensor.shape[1] >= self.max_length:
                break

        return tokenizer.decode(input_tensor[0].numpy().tolist())

In [3]:
import tensorflow as tf
from tensorflow.keras import layers


class AgroTransformerEncoder(tf.keras.Model):
    """Single-layer bidirectional encoder that mean-pools to one vector."""

    def __init__(
        self,
        vocab_size=8000,
        max_length=128,
        d_model=256,
        n_heads=4,
        dropout=0.1,
    ):
        """Create the embedding, attention, feed-forward and norm layers.

        Args:
            vocab_size: Size of the token embedding table.
            max_length: Number of positions in the position embedding table.
            d_model: Embedding and output width.
            n_heads: Number of attention heads.
            dropout: Dropout rate, used inside the feed-forward network and
                on both residual branches.
        """
        super().__init__()
        self.token_embed = layers.Embedding(vocab_size, d_model)
        self.pos_embed = layers.Embedding(max_length, d_model)

        self.attn = layers.MultiHeadAttention(num_heads=n_heads, key_dim=d_model)

        # Feed-forward network: expand to 4 * d_model, dropout, project back.
        ffn_layers = [
            layers.Dense(d_model * 4, activation="relu"),
            layers.Dropout(dropout),
            layers.Dense(d_model),
        ]
        self.ffn = tf.keras.Sequential(ffn_layers)

        # One Dropout layer shared by both residual branches.
        self.dropout = layers.Dropout(dropout)
        self.ln1 = layers.LayerNormalization()
        self.ln2 = layers.LayerNormalization()

    def call(self, x, training=False):
        """Encode token ids into one vector per sequence.

        Args:
            x: Token ids of shape (batch, seq_len).
            training: Forwarded to the dropout layers and the feed-forward
                network.

        Returns:
            Tensor of shape (batch, d_model): the mean over the sequence axis
            of the encoded tokens.
        """
        positions = tf.range(tf.shape(x)[1])[tf.newaxis, :]  # (1, seq_len)
        hidden = self.token_embed(x) + self.pos_embed(positions)

        # Self-attention over the full sequence (no causal mask).
        attended = self.attn(
            hidden, hidden, attention_mask=None, use_causal_mask=False
        )
        attended = self.dropout(attended, training=training)
        hidden = self.ln1(hidden + attended)

        # Position-wise feed-forward network.
        transformed = self.ffn(hidden, training=training)
        transformed = self.dropout(transformed, training=training)
        hidden = self.ln2(hidden + transformed)

        # Global average pooling over the sequence axis.
        return tf.reduce_mean(hidden, axis=1)