# Tokenizer

In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
from tokenizers.processors import TemplateProcessing
import json

# === Load data ===
# Every JSONL record contributes its "prompt" and "response" strings to the
# tokenizer training corpus. Blank lines are skipped so that a stray empty
# line in the file does not crash json.loads().
texts = []
with open("baseline-dataset.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        data = json.loads(line)
        texts.append(data["prompt"])
        texts.append(data["response"])

# === Write to plain text (the trainer below is given a file path) ===
with open("tokenizer_corpus.txt", "w", encoding="utf-8") as f:
    for text in texts:
        f.write(text.strip() + "\n")

# === Init tokenizer ===
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=["<pad>", "<s>", "</s>", "<unk>"],
    # Seed the vocabulary with the full byte-level alphabet. Without this,
    # bytes that never occur in the training corpus have no token, and text
    # containing them cannot be represented at encode time.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# === Train ===
tokenizer.train(["tokenizer_corpus.txt"], trainer)

# === Post-processing: automatically add <s> and </s> when encoding ===
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> <s> $B </s>",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

# Byte-level decoder so decode() turns byte-level tokens back into text.
tokenizer.decoder = decoders.ByteLevel()

# === Save ===
tokenizer.save("tokenizer-agrolens.json")
print("✅ Tokenizer saved to tokenizer-agrolens.json")

: 

In [None]:
from tokenizers import Tokenizer

# Reload the saved tokenizer and show how a sample question is split:
# first the token strings, then the matching integer ids.
tokenizer = Tokenizer.from_file("tokenizer-agrolens.json")
enc = tokenizer.encode("Apa itu penyakit blast?")
print(enc.tokens, enc.ids, sep="\n")

# GPTModel

In [None]:
import tensorflow as tf
from tensorflow.keras import layers


class AgroLensGPT(tf.keras.Model):
    """Small decoder-only (GPT-style) Transformer language model.

    The model sums token and learned absolute position embeddings, applies
    ``n_layers`` blocks of causal self-attention followed by a two-layer
    feed-forward network (each sub-layer followed by residual add and
    LayerNormalization), and finally projects to vocabulary logits.

    Args:
        vocab_size: Number of token ids; also the size of the output logits.
        max_length: Size of the position-embedding table, i.e. the longest
            sequence the model can process.
        d_model: Width of the embeddings and of the residual stream.
        n_heads: Number of attention heads per block.
        n_layers: Number of decoder blocks.
        dropout: Dropout rate applied after attention and after the
            feed-forward network.
    """

    def __init__(
        self,
        vocab_size,
        max_length=512,
        d_model=256,
        n_heads=4,
        n_layers=4,
        dropout=0.1,
    ):
        super().__init__()
        # Kept so call() can reject sequences that would index past the
        # position-embedding table.
        self.max_length = max_length

        self.token_embed = layers.Embedding(input_dim=vocab_size, output_dim=d_model)
        self.pos_embed = layers.Embedding(input_dim=max_length, output_dim=d_model)

        # One entry per decoder block, unpacked in call() as
        # (mha, drop1, ln1, ff1, ff2, drop2, ln2).
        # NOTE(review): the sub-layers live in nested Python lists; verify that
        # save_weights()/load_weights() round-trips them on the Keras version
        # in use.
        self.decoder_layers = []
        for _ in range(n_layers):
            self.decoder_layers.append(
                [
                    # key_dim is the size of *each* head, so every head is
                    # d_model wide here (d_model // n_heads is the more common
                    # choice). Left unchanged so existing weight files still load.
                    layers.MultiHeadAttention(num_heads=n_heads, key_dim=d_model),
                    layers.Dropout(dropout),
                    layers.LayerNormalization(epsilon=1e-6),
                    layers.Dense(d_model * 4, activation="relu"),
                    layers.Dense(d_model),
                    layers.Dropout(dropout),
                    layers.LayerNormalization(epsilon=1e-6),
                ]
            )

        self.final_ln = layers.LayerNormalization(epsilon=1e-6)
        self.output_head = layers.Dense(vocab_size)

    def call(self, x, training=False):
        """Compute next-token logits.

        Args:
            x: Integer token ids of shape (B, T).
            training: Whether dropout is active.

        Returns:
            Float tensor of logits with shape (B, T, vocab_size).

        Raises:
            ValueError: If the statically known sequence length exceeds
                ``max_length``.
        """
        # Fail early with a clear message instead of an out-of-range
        # position-embedding lookup. Only possible when T is known statically.
        static_len = x.shape[1]
        if static_len is not None and static_len > self.max_length:
            raise ValueError(
                f"Sequence length {static_len} exceeds max_length={self.max_length}."
            )

        seq_len = tf.shape(x)[1]
        token_emb = self.token_embed(x)  # (B, T, d_model)
        positions = tf.range(start=0, limit=seq_len, delta=1)
        pos_emb = tf.expand_dims(self.pos_embed(positions), axis=0)  # (1, T, d_model)
        h = token_emb + pos_emb  # (B, T, d_model)

        for mha, drop1, ln1, ff1, ff2, drop2, ln2 in self.decoder_layers:
            # use_causal_mask=True already restricts each position to itself
            # and earlier positions, so no hand-built mask is needed.
            attn_output = mha(h, h, h, use_causal_mask=True)
            attn_output = drop1(attn_output, training=training)
            h = ln1(h + attn_output)

            ffn_output = ff2(ff1(h))
            ffn_output = drop2(ffn_output, training=training)
            h = ln2(h + ffn_output)

        h = self.final_ln(h)
        return self.output_head(h)  # (B, T, vocab_size)

In [None]:
# Sanity check: run one forward pass on random token ids, then summarize.
model = AgroLensGPT(vocab_size=8000)
sample_input = tf.random.uniform((2, 64), minval=0, maxval=8000, dtype=tf.int32)

# A subclassed model creates its weights on the first call, so run the forward
# pass before asking for a summary.
logits = model(sample_input)

# summary() prints the table itself and returns None, so it is not wrapped in
# display().
model.summary()
print(logits.shape)  # Expected: (2, 64, 8000)

# Data Loader and Tokenizer

In [None]:
import json
import tensorflow as tf
from tokenizers import Tokenizer


class AgroDatasetTF(tf.data.Dataset):
    """Factory that builds a next-token-prediction dataset from a JSONL file.

    ``__new__`` does not return an instance of this class: it returns the
    ``tf.data.Dataset`` produced by ``from_generator``. Each element is a dict
    with variable-length int32 vectors ``"input_ids"`` and ``"labels"``, where
    ``labels`` is ``input_ids`` shifted one position to the left.

    Args:
        path: JSONL file whose records have "prompt" and "response" keys.
        tokenizer_path: Path of a saved ``tokenizers`` tokenizer JSON file.
        max_len: Maximum number of token ids kept per example before the
            input/label shift (so ``input_ids`` has at most ``max_len - 1``
            entries).
    """

    def __new__(cls, path, tokenizer_path, max_len=256):
        tokenizer = Tokenizer.from_file(tokenizer_path)
        samples = []

        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines instead of crashing in json.loads().
                if not line.strip():
                    continue

                data = json.loads(line)
                prompt = data["prompt"]
                response = data["response"]
                combined = f"{prompt} {response}"

                # Tokenize and truncate
                ids = tokenizer.encode(combined).ids[:max_len]

                # At least two ids are needed to form one (input, label) pair.
                if len(ids) >= 2:
                    samples.append((ids[:-1], ids[1:]))

        def gen():
            # Plain Python lists are enough; from_generator converts them
            # according to output_signature.
            for input_ids, labels in samples:
                yield {"input_ids": input_ids, "labels": labels}

        return tf.data.Dataset.from_generator(
            gen,
            output_signature={
                "input_ids": tf.TensorSpec(shape=(None,), dtype=tf.int32),
                "labels": tf.TensorSpec(shape=(None,), dtype=tf.int32),
            },
        )

In [None]:
# Smoke test: build the dataset, pad it into batches of 8, and print the
# shapes of the first batch's inputs and labels.
dataset = AgroDatasetTF(
    path="baseline-dataset.jsonl",
    tokenizer_path="tokenizer-agrolens.json",
    max_len=512,
)
dataset = dataset.padded_batch(
    8, padded_shapes={key: [None] for key in ("input_ids", "labels")}
)
for batch in dataset.take(1):
    print(*(batch[key].shape for key in ("input_ids", "labels")))

In [None]:
import tensorflow as tf

# --- Hyperparameters ---
BATCH_SIZE = 16
EPOCHS = 100
LR = 3e-4
MAX_LEN = 512
SHUFFLE_BUFFER = 1000
PAD_TOKEN_ID = 0  # "<pad>" is the first special token given to the trainer
IGNORE_INDEX = -100  # label value marking positions excluded from the loss

# --- Dataset ---
dataset = AgroDatasetTF(
    "baseline-dataset.jsonl", "tokenizer-agrolens.json", max_len=MAX_LEN
)
# Shuffle individual examples *before* batching so batch composition changes
# every epoch, and pad labels with IGNORE_INDEX so padded positions can be
# masked out of the loss.
dataset = (
    dataset.shuffle(SHUFFLE_BUFFER)
    .padded_batch(
        BATCH_SIZE,
        padded_shapes={"input_ids": [None], "labels": [None]},
        padding_values={
            "input_ids": tf.constant(PAD_TOKEN_ID, dtype=tf.int32),
            "labels": tf.constant(IGNORE_INDEX, dtype=tf.int32),
        },
    )
    .prefetch(tf.data.AUTOTUNE)
)

# --- Model ---
model = AgroLensGPT(vocab_size=8000)
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
# reduction="none" keeps the per-token losses so padding can be masked below.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none"
)


# --- Custom Training Loop ---
# reduce_retracing avoids compiling a new graph for every distinct padded
# sequence length.
@tf.function(reduce_retracing=True)
def train_step(input_ids, labels):
    """Run one optimization step and return the mean loss over real tokens.

    Args:
        input_ids: int32 tensor (B, T) of input token ids.
        labels: int32 tensor (B, T) of target ids; padded positions hold
            IGNORE_INDEX.

    Returns:
        Scalar float tensor: cross-entropy averaged over non-padded positions.
    """
    with tf.GradientTape() as tape:
        logits = model(input_ids, training=True)

        valid = tf.not_equal(labels, IGNORE_INDEX)
        mask = tf.cast(valid, tf.float32)
        # The loss function needs valid class ids everywhere, so swap the
        # ignore marker for 0; those positions are zeroed by the mask anyway.
        safe_labels = tf.where(valid, labels, tf.zeros_like(labels))

        loss_values = loss_fn(safe_labels, logits)
        # Guard the denominator so a fully padded batch cannot produce NaN.
        loss = tf.reduce_sum(loss_values * mask) / tf.maximum(
            tf.reduce_sum(mask), 1.0
        )

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


# --- Training Loop ---
for epoch in range(EPOCHS):
    total_loss = 0.0
    steps = 0

    for batch in dataset:
        loss = train_step(batch["input_ids"], batch["labels"])
        total_loss += float(loss.numpy())
        steps += 1

    if steps == 0:
        raise ValueError("Training dataset is empty; nothing to train on.")

    avg_loss = total_loss / steps
    print(f"📘 Epoch {epoch+1}: Loss = {avg_loss:.4f}")

# --- Save weights ---
model.save_weights("agrolens_model_tf.weights.h5")

In [None]:
import tensorflow as tf
from tokenizers import Tokenizer

# --- Constants and model loading ---
MODEL_PATH = "agrolens_model_tf.weights.h5"
TOKENIZER_PATH = "tokenizer-agrolens.json"
VOCAB_SIZE = 8000
MAX_LEN = 128

model = AgroLensGPT(vocab_size=VOCAB_SIZE)

# Run one forward pass on a (1, 64) batch of token id 1 so the model's
# variables exist before the saved weights are loaded into them.
dummy_input = tf.ones((1, 64), dtype=tf.int32)
_ = model(dummy_input)
model.load_weights(MODEL_PATH)

tokenizer = Tokenizer.from_file(TOKENIZER_PATH)

In [None]:
def generate(prompt: str, max_new_tokens=50):
    """Greedily continue ``prompt`` with the module-level model and tokenizer.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded text of the prompt followed by the generated tokens.
        Generation stops early when "</s>" is produced or when the sequence
        reaches ``MAX_LEN`` tokens. Returns "" if the prompt yields no tokens.
    """
    eos_id = tokenizer.token_to_id("</s>")

    # Tokenize the prompt.
    token_ids = list(tokenizer.encode(prompt).ids)

    # If the tokenizer's post-processor closed the prompt with "</s>", drop it:
    # otherwise the model is asked to continue *after* an end-of-sequence
    # marker rather than continuing the prompt itself.
    if token_ids and token_ids[-1] == eos_id:
        token_ids.pop()

    if not token_ids:
        return ""

    for _ in range(max_new_tokens):
        # Enforce the length limit before the forward pass so an over-long
        # sequence is never fed to the model.
        if len(token_ids) >= MAX_LEN:
            break

        input_tensor = tf.constant([token_ids], dtype=tf.int32)
        logits = model(input_tensor, training=False)

        # Greedy decoding: pick the highest-scoring token at the last position.
        next_id = int(tf.argmax(logits[0, -1, :], axis=-1, output_type=tf.int32).numpy())
        token_ids.append(next_id)

        # Stop once the end-of-sequence token has been generated.
        if next_id == eos_id:
            break

    # Decode prompt + continuation back to text.
    return tokenizer.decode(token_ids)

In [None]:
import ipywidgets as widgets
from IPython.display import display, Markdown

# Text field holding the user's question (pre-filled with an example).
input_box = widgets.Text(
    value="Apa itu penyakit blast?",
    placeholder="Tulis pertanyaan di sini...",
    description="❓ Pertanyaan:",
    layout=widgets.Layout(width="100%"),
)

# Area where the question and the model's answer are rendered.
output_box = widgets.Output()
generate_button = widgets.Button(
    description="Jawab 🚀", button_style="success", layout=widgets.Layout(width="15%")
)


def on_generate_clicked(b):
    """Button callback: generate an answer for the current question and show it.

    Args:
        b: The clicked button instance passed by ipywidgets (unused).
    """
    prompt = input_box.value.strip()
    output_box.clear_output()
    if not prompt:
        return  # nothing to answer

    # Disable the button while generating so repeated clicks do not queue up.
    generate_button.disabled = True
    try:
        # Run generation inside the Output context so that any error raised by
        # generate() is shown in the output area instead of disappearing.
        with output_box:
            response = generate(prompt)
            display(
                Markdown(
                    f"### 🧑 Kamu: \n{prompt}\n---\n### 🌾 AgroLens Menjawab:\n{response}"
                )
            )
    finally:
        generate_button.disabled = False


generate_button.on_click(on_generate_clicked)

# Show the UI
display(widgets.VBox([input_box, generate_button, output_box]))