# Tokenizer

In [3]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
from tokenizers.processors import TemplateProcessing
import json

dataset_path = "../../datasets/dataset.jsonl"

# === Load data ===
with open(dataset_path, "r", encoding="utf-8") as f:
    texts = []
    for line in f:
        data = json.loads(line)
        texts.append(data["prompt"])
        texts.append(data["response"])

# === Write to plain text (required by tokenizer trainer) ===
with open("tokenizer_corpus.txt", "w", encoding="utf-8") as f:
    for text in texts:
        f.write(text.strip() + "\n")

# === Init tokenizer ===
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=["<pad>", "<s>", "</s>", "<unk>"],
)

# === Train ===
tokenizer.train(["tokenizer_corpus.txt"], trainer)

# === Post-processing untuk auto menambahkan <s> dan </s> saat encoding
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> <s> $B </s>",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

tokenizer.decoder = decoders.ByteLevel()

# === Save ===
tokenizer.save("tokenizer-agrolens.json")   
print("✅ Tokenizer saved to tokenizer-agrolens.json")




✅ Tokenizer saved to tokenizer-agrolens.json


In [4]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer-agrolens.json")
enc = tokenizer.encode("Apa itu penyakit blast?")
print(enc.tokens)
print(enc.ids)

['<s>', 'ĠApa', 'Ġitu', 'Ġpenyakit', 'Ġblast', '?', '</s>']
[1, 176, 289, 128, 260, 21, 2]


# GPTModel

In [5]:
import tensorflow as tf
from tensorflow.keras import layers


class DecoderBlock(layers.Layer):
    def __init__(self, d_model, n_heads, dropout):
        super().__init__()
        self.mha = layers.MultiHeadAttention(num_heads=n_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential([
            layers.Dense(d_model * 4, activation='relu'),
            layers.Dense(d_model),
        ])
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, x, training=False, mask=None):
        attn = self.mha(x, x, attention_mask=mask, use_causal_mask=True)
        attn = self.dropout1(attn, training=training)
        x = self.norm1(x + attn)

        ffn_out = self.ffn(x)
        ffn_out = self.dropout2(ffn_out, training=training)
        return self.norm2(x + ffn_out)


class AgroLensGPT(tf.keras.Model):
    def __init__(
        self,
        vocab_size,
        max_length=512,
        d_model=256,
        n_heads=4,
        n_layers=4,
        dropout=0.1,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.token_embed = layers.Embedding(vocab_size, d_model)
        self.pos_embed = layers.Embedding(max_length, d_model)
        self.blocks = [DecoderBlock(d_model, n_heads, dropout) for _ in range(n_layers)]
        self.final_norm = layers.LayerNormalization(epsilon=1e-6)
        self.output_head = layers.Dense(vocab_size)

        # Precomputed causal mask (for max_length)
        self.causal_mask = tf.linalg.band_part(tf.ones((max_length, max_length)), -1, 0)

    def call(self, x, training=False):
        B, T = tf.shape(x)[0], tf.shape(x)[1]
        token_emb = self.token_embed(x)  # (B, T, d_model)
        pos_indices = tf.range(start=0, limit=T)
        pos_emb = self.pos_embed(pos_indices)[tf.newaxis, :, :]  # (1, T, d_model)

        h = token_emb + pos_emb  # (B, T, d_model)
        mask = self.causal_mask[:T, :T][tf.newaxis, tf.newaxis, :, :]  # (1, 1, T, T)

        for block in self.blocks:
            h = block(h, training=training, mask=mask)

        h = self.final_norm(h)
        return self.output_head(h)  # (B, T, vocab_size)

    def generate(self, tokenizer, prompt, max_new_tokens=50):
        input_ids = tokenizer.encode(prompt).ids
        input_tensor = tf.constant([input_ids], dtype=tf.int32)

        for _ in range(max_new_tokens):
            logits = self(input_tensor, training=False)
            next_token = tf.argmax(logits[:, -1, :], axis=-1, output_type=tf.int32)
            input_tensor = tf.concat(
                [input_tensor, tf.expand_dims(next_token, axis=1)], axis=1
            )

            if next_token.numpy()[0] == tokenizer.token_to_id("</s>"):
                break
            if input_tensor.shape[1] >= self.max_length:
                break

        return tokenizer.decode(input_tensor[0].numpy().tolist())

2025-06-05 01:23:42.994916: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-05 01:23:43.148185: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749057823.202029    1032 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749057823.218871    1032 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749057823.335008    1032 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [6]:
model = AgroLensGPT(vocab_size=8000)
display(model.summary())
sample_input = tf.random.uniform((2, 64), minval=0, maxval=8000, dtype=tf.int32)
logits = model(sample_input)
print(logits.shape)  # Expected: (2, 64, 8000)

I0000 00:00:1749057827.148919    1032 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


None

(2, 64, 8000)


# Loader dan Tokenizer

In [7]:
import json
import tensorflow as tf
from tokenizers import Tokenizer


class AgroDatasetTF(tf.data.Dataset):
    def __new__(cls, path, tokenizer_path, max_len=256):
        tokenizer = Tokenizer.from_file(tokenizer_path)
        samples = []

        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                prompt = data["prompt"]
                response = data["response"]
                combined = f"{prompt} {response}"

                # Tokenize and truncate
                ids = tokenizer.encode(combined).ids[:max_len]

                if len(ids) >= 2:  # minimal length to create input/label
                    input_ids = ids[:-1]
                    labels = ids[1:]
                    samples.append((input_ids, labels))

        # Convert to TensorFlow tensors
        def gen():
            for input_ids, labels in samples:
                yield {
                    "input_ids": tf.constant(input_ids, dtype=tf.int32),
                    "labels": tf.constant(labels, dtype=tf.int32),
                }

        return tf.data.Dataset.from_generator(
            gen,
            output_signature={
                "input_ids": tf.TensorSpec(shape=(None,), dtype=tf.int32),
                "labels": tf.TensorSpec(shape=(None,), dtype=tf.int32),
            },
        )

In [8]:
dataset = AgroDatasetTF(
    dataset_path, "tokenizer-agrolens.json", max_len=512
)
dataset = dataset.padded_batch(8, padded_shapes={"input_ids": [None], "labels": [None]})
for batch in dataset.take(1):
    print(batch["input_ids"].shape, batch["labels"].shape)

(8, 40) (8, 40)


2025-06-05 01:23:52.159261: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [9]:
import tensorflow as tf

# --- Hyperparameters ---
BATCH_SIZE = 16
EPOCHS = 100
LR = 3e-4
MAX_LEN = 512

# --- Dataset ---
dataset = AgroDatasetTF(
    dataset_path, "tokenizer-agrolens.json", max_len=MAX_LEN
)
dataset = dataset.padded_batch(
    BATCH_SIZE, padded_shapes={"input_ids": [None], "labels": [None]}
)
dataset = dataset.shuffle(1000).prefetch(tf.data.AUTOTUNE)

# --- Model ---
model = AgroLensGPT(vocab_size=8000)
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none"
)

display(model.summary())

# --- Custom Training Loop ---
@tf.function
def train_step(input_ids, labels):
    with tf.GradientTape() as tape:
        logits = model(input_ids, training=True)
        # Shifted label loss, ignoring padding (-100 equivalent in PyTorch)
        mask = tf.cast(labels != -100, tf.float32)
        loss_values = loss_fn(labels, logits)
        loss = tf.reduce_sum(loss_values * mask) / tf.reduce_sum(mask)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


# --- Training Loop ---
for epoch in range(EPOCHS):
    total_loss = 0.0
    steps = 0

    for batch in dataset:
        input_ids = batch["input_ids"]
        labels = batch["labels"]
        loss = train_step(input_ids, labels)
        total_loss += loss.numpy()
        steps += 1

    avg_loss = total_loss / steps
    print(f"📘 Epoch {epoch+1}: Loss = {avg_loss:.4f}")

# --- Save weights ---
model.save_weights("agrolens_model_tf.weights.h5")

None

📘 Epoch 1: Loss = 8.0781


2025-06-05 01:24:05.702623: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 2: Loss = 7.2472


2025-06-05 01:24:06.237593: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 3: Loss = 6.6469
📘 Epoch 4: Loss = 6.0936
📘 Epoch 5: Loss = 5.5957
📘 Epoch 6: Loss = 5.1616


2025-06-05 01:24:07.212692: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 7: Loss = 4.8093
📘 Epoch 8: Loss = 4.5174
📘 Epoch 9: Loss = 4.2744
📘 Epoch 10: Loss = 4.0653
📘 Epoch 11: Loss = 3.8832
📘 Epoch 12: Loss = 3.7086
📘 Epoch 13: Loss = 3.5439
📘 Epoch 14: Loss = 3.3860


2025-06-05 01:24:09.190210: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 15: Loss = 3.2355
📘 Epoch 16: Loss = 3.0907
📘 Epoch 17: Loss = 2.9482
📘 Epoch 18: Loss = 2.8116
📘 Epoch 19: Loss = 2.6923
📘 Epoch 20: Loss = 2.5577
📘 Epoch 21: Loss = 2.4333
📘 Epoch 22: Loss = 2.3048
📘 Epoch 23: Loss = 2.1950
📘 Epoch 24: Loss = 2.0791
📘 Epoch 25: Loss = 1.9714
📘 Epoch 26: Loss = 1.8686
📘 Epoch 27: Loss = 1.7505
📘 Epoch 28: Loss = 1.6489
📘 Epoch 29: Loss = 1.5559
📘 Epoch 30: Loss = 1.4685


2025-06-05 01:24:12.980852: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 31: Loss = 1.3930
📘 Epoch 32: Loss = 1.3079
📘 Epoch 33: Loss = 1.2233
📘 Epoch 34: Loss = 1.1397
📘 Epoch 35: Loss = 1.0598
📘 Epoch 36: Loss = 0.9861
📘 Epoch 37: Loss = 0.9172
📘 Epoch 38: Loss = 0.8484
📘 Epoch 39: Loss = 0.7850
📘 Epoch 40: Loss = 0.7252
📘 Epoch 41: Loss = 0.6735
📘 Epoch 42: Loss = 0.6303
📘 Epoch 43: Loss = 0.5823
📘 Epoch 44: Loss = 0.5425
📘 Epoch 45: Loss = 0.5059
📘 Epoch 46: Loss = 0.4744
📘 Epoch 47: Loss = 0.4384
📘 Epoch 48: Loss = 0.4124
📘 Epoch 49: Loss = 0.3846
📘 Epoch 50: Loss = 0.3601
📘 Epoch 51: Loss = 0.3395
📘 Epoch 52: Loss = 0.3203
📘 Epoch 53: Loss = 0.3013
📘 Epoch 54: Loss = 0.2841
📘 Epoch 55: Loss = 0.2693
📘 Epoch 56: Loss = 0.2583
📘 Epoch 57: Loss = 0.2476
📘 Epoch 58: Loss = 0.2366
📘 Epoch 59: Loss = 0.2266
📘 Epoch 60: Loss = 0.2163
📘 Epoch 61: Loss = 0.2101
📘 Epoch 62: Loss = 0.2005


2025-06-05 01:24:20.822369: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 63: Loss = 0.1941
📘 Epoch 64: Loss = 0.1889
📘 Epoch 65: Loss = 0.1835
📘 Epoch 66: Loss = 0.1768
📘 Epoch 67: Loss = 0.1708
📘 Epoch 68: Loss = 0.1674
📘 Epoch 69: Loss = 0.1630
📘 Epoch 70: Loss = 0.1607
📘 Epoch 71: Loss = 0.1571
📘 Epoch 72: Loss = 0.1530
📘 Epoch 73: Loss = 0.1496
📘 Epoch 74: Loss = 0.1468
📘 Epoch 75: Loss = 0.1437
📘 Epoch 76: Loss = 0.1427
📘 Epoch 77: Loss = 0.1397
📘 Epoch 78: Loss = 0.1378
📘 Epoch 79: Loss = 0.1352
📘 Epoch 80: Loss = 0.1328
📘 Epoch 81: Loss = 0.1324
📘 Epoch 82: Loss = 0.1303
📘 Epoch 83: Loss = 0.1288
📘 Epoch 84: Loss = 0.1269
📘 Epoch 85: Loss = 0.1247
📘 Epoch 86: Loss = 0.1244
📘 Epoch 87: Loss = 0.1240
📘 Epoch 88: Loss = 0.1232
📘 Epoch 89: Loss = 0.1191
📘 Epoch 90: Loss = 0.1177
📘 Epoch 91: Loss = 0.1171
📘 Epoch 92: Loss = 0.1166
📘 Epoch 93: Loss = 0.1154
📘 Epoch 94: Loss = 0.1143
📘 Epoch 95: Loss = 0.1141
📘 Epoch 96: Loss = 0.1149
📘 Epoch 97: Loss = 0.1138
📘 Epoch 98: Loss = 0.1114
📘 Epoch 99: Loss = 0.1114
📘 Epoch 100: Loss = 0.1100


In [10]:
import tensorflow as tf
from tokenizers import Tokenizer

# --- Konstanta dan Load Model ---
MODEL_PATH = "agrolens_model_tf.weights.h5"
TOKENIZER_PATH = "tokenizer-agrolens.json"
VOCAB_SIZE = 8000
MAX_LEN = 128

model = AgroLensGPT(vocab_size=VOCAB_SIZE)
dummy_input = tf.constant([[1] * 64], dtype=tf.int32)  # bentuk (1, 64)
_ = model(dummy_input)  # memanggil forward p
model.load_weights(MODEL_PATH)
tokenizer = Tokenizer.from_file(TOKENIZER_PATH)

In [11]:
def generate(prompt: str, max_new_tokens=50):
    # Tokenisasi prompt
    input_ids = tokenizer.encode(prompt).ids
    input_tensor = tf.constant([input_ids], dtype=tf.int32)

    for _ in range(max_new_tokens):
        # Loloskan input ke model
        logits = model(input_tensor, training=False)

        # Ambil token berikutnya dari distribusi logit terakhir
        next_token = tf.argmax(logits[:, -1, :], axis=-1, output_type=tf.int32)

        # Tambahkan ke input_tensor
        input_tensor = tf.concat(
            [input_tensor, tf.expand_dims(next_token, axis=1)], axis=1
        )

        # Jika token akhir ditemukan
        if tokenizer.token_to_id("</s>") in next_token.numpy():
            break

        # Batasi panjang maksimum
        if input_tensor.shape[1] >= MAX_LEN:
            break

    # Decode output
    output_ids = input_tensor[0].numpy().tolist()
    return tokenizer.decode(output_ids)

In [12]:
import ipywidgets as widgets
from IPython.display import display, Markdown

input_box = widgets.Text(
    value="Apa itu penyakit blast?",
    placeholder="Tulis pertanyaan di sini...",
    description="❓ Pertanyaan:",
    layout=widgets.Layout(width="100%"),
)

output_box = widgets.Output()
generate_button = widgets.Button(
    description="Jawab 🚀", button_style="success", layout=widgets.Layout(width="15%")
)


def on_generate_clicked(b):
    prompt = input_box.value
    response = generate(prompt)
    output_box.clear_output()
    with output_box:
        display(
            Markdown(
                f"### 🧑 Kamu: \n{prompt}\n---\n### 🌾 AgroLens Menjawab:\n{response}"
            )
        )


generate_button.on_click(on_generate_clicked)

# Tampilkan
display(widgets.VBox([input_box, generate_button, output_box]))

VBox(children=(Text(value='Apa itu penyakit blast?', description='❓ Pertanyaan:', layout=Layout(width='100%'),…