# Tokenizer

In [2]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
from tokenizers.processors import TemplateProcessing
import pandas as pd
import json
import tensorflow as tf

# === Load data dari Excel ===
excel_path = "../../datasets/xlsx/Dataset Labeling Chatbot.xlsx"
sheet_names = ["Blast", "Tungro", "Bacterial Blight", "BrownSpot", "LeafScald"]

texts = []
for sheet in sheet_names:
    df = pd.read_excel(excel_path, sheet_name=sheet, engine="openpyxl")
    prompts = df["prompt"].dropna().astype(str).tolist()
    responses = df["response"].dropna().astype(str).tolist()
    texts.extend(prompts + responses)

# === Simpan ke file teks sebagai korpus tokenizer ===
with open("tokenizer_corpus.txt", "w", encoding="utf-8") as f:
    for line in texts:
        f.write(line.strip() + "\n")

# === Inisialisasi tokenizer BPE ===
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=["<pad>", "<s>", "</s>", "<unk>"],
)

# === Latih tokenizer ===
tokenizer.train(["tokenizer_corpus.txt"], trainer)

# === Tambahkan pemrosesan <s> dan </s> secara otomatis
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> <s> $B </s>",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

# === Tambahkan decoder agar hasil decode benar
tokenizer.decoder = decoders.ByteLevel()

# === Simpan tokenizer ===
tokenizer.save("tokenizer-agrolens.json")
print("✅ Tokenizer saved to tokenizer-agrolens.json")




✅ Tokenizer saved to tokenizer-agrolens.json


In [3]:
excel_path = "../../datasets/xlsx/Dataset Labeling Chatbot.xlsx"
sheet_names = ["Blast", "Tungro", "Bacterial Blight", "BrownSpot", "LeafScald"]

all_data = []
for sheet in sheet_names:
    df = pd.read_excel(excel_path, sheet_name=sheet, engine="openpyxl")
    for _, row in df.iterrows():
        prompt = str(row["prompt"]).strip()
        response = str(row["response"]).strip()
        if prompt and response:
            all_data.append({"prompt": prompt, "response": response})

# Simpan ke JSONL
with open("dataset.jsonl", "w", encoding="utf-8") as f:
    for item in all_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("✅ dataset.jsonl berhasil dibuat")

✅ dataset.jsonl berhasil dibuat


In [4]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer-agrolens.json")
enc = tokenizer.encode("Apa itu penyakit blast?")
print(enc.tokens)
print(enc.ids)

['<s>', 'ĠApa', 'Ġitu', 'Ġpenyakit', 'Ġblast', '?', '</s>']
[1, 247, 644, 140, 229, 26, 2]


# GPTModel

In [5]:
import tensorflow as tf
from tensorflow.keras import layers


class DecoderBlock(layers.Layer):
    def __init__(self, d_model, n_heads, dropout):
        super().__init__()
        self.mha = layers.MultiHeadAttention(num_heads=n_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential([
            layers.Dense(d_model * 4, activation='relu'),
            layers.Dense(d_model),
        ])
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, x, training=False, mask=None):
        attn = self.mha(x, x, attention_mask=mask, use_causal_mask=True)
        attn = self.dropout1(attn, training=training)
        x = self.norm1(x + attn)

        ffn_out = self.ffn(x)
        ffn_out = self.dropout2(ffn_out, training=training)
        return self.norm2(x + ffn_out)


class AgroLensGPT(tf.keras.Model):
    def __init__(
        self,
        vocab_size,
        max_length=512,
        d_model=256,
        n_heads=4,
        n_layers=4,
        dropout=0.1,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.token_embed = layers.Embedding(vocab_size, d_model)
        self.pos_embed = layers.Embedding(max_length, d_model)
        self.blocks = [DecoderBlock(d_model, n_heads, dropout) for _ in range(n_layers)]
        self.final_norm = layers.LayerNormalization(epsilon=1e-6)
        self.output_head = layers.Dense(vocab_size)

        # Precomputed causal mask (for max_length)
        self.causal_mask = tf.linalg.band_part(tf.ones((max_length, max_length)), -1, 0)

    def call(self, x, training=False):
        B, T = tf.shape(x)[0], tf.shape(x)[1]
        token_emb = self.token_embed(x)  # (B, T, d_model)
        pos_indices = tf.range(start=0, limit=T)
        pos_emb = self.pos_embed(pos_indices)[tf.newaxis, :, :]  # (1, T, d_model)

        h = token_emb + pos_emb  # (B, T, d_model)
        mask = self.causal_mask[:T, :T][tf.newaxis, tf.newaxis, :, :]  # (1, 1, T, T)

        for block in self.blocks:
            h = block(h, training=training, mask=mask)

        h = self.final_norm(h)
        return self.output_head(h)  # (B, T, vocab_size)

    def generate(self, tokenizer, prompt, max_new_tokens=50):
        input_ids = tokenizer.encode(prompt).ids
        input_tensor = tf.constant([input_ids], dtype=tf.int32)

        for _ in range(max_new_tokens):
            logits = self(input_tensor, training=False)
            next_token = tf.argmax(logits[:, -1, :], axis=-1, output_type=tf.int32)
            input_tensor = tf.concat(
                [input_tensor, tf.expand_dims(next_token, axis=1)], axis=1
            )

            if next_token.numpy()[0] == tokenizer.token_to_id("</s>"):
                break
            if input_tensor.shape[1] >= self.max_length:
                break

        return tokenizer.decode(input_tensor[0].numpy().tolist())

In [6]:
model = AgroLensGPT(vocab_size=8000)
display(model.summary())
sample_input = tf.random.uniform((2, 64), minval=0, maxval=8000, dtype=tf.int32)
logits = model(sample_input)
print(logits.shape)  # Expected: (2, 64, 8000)

I0000 00:00:1749221379.095651    1753 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2863 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1050, pci bus id: 0000:01:00.0, compute capability: 6.1


None

(2, 64, 8000)


# Loader dan Tokenizer

In [7]:
import json
import tensorflow as tf
from tokenizers import Tokenizer


# === Dataset loader ===
class AgroDatasetTF(tf.data.Dataset):
    def __new__(cls, path, tokenizer, max_len=512):
        samples = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                text = f"{data['prompt']} {data['response']}"
                ids = tokenizer.encode(text).ids[:max_len]
                if len(ids) >= 2:
                    input_ids = ids[:-1]
                    labels = ids[1:]
                    samples.append((input_ids, labels))

        def gen():
            for input_ids, labels in samples:
                yield {
                    "input_ids": tf.constant(input_ids, dtype=tf.int32),
                    "labels": tf.constant(labels, dtype=tf.int32),
                }

        return tf.data.Dataset.from_generator(
            gen,
            output_signature={
                "input_ids": tf.TensorSpec(shape=(None,), dtype=tf.int32),
                "labels": tf.TensorSpec(shape=(None,), dtype=tf.int32),
            },
        )

In [8]:
# === Load tokenizer ===
tokenizer = Tokenizer.from_file("tokenizer-agrolens.json")
VOCAB_SIZE = tokenizer.get_vocab_size()
MAX_LEN = 512
PAD_TOKEN_ID = tokenizer.token_to_id("<pad>")

# === Load dataset ===
dataset = AgroDatasetTF("dataset_ragstyle.jsonl", tokenizer, max_len=MAX_LEN)

dataset = dataset.padded_batch(
    batch_size=16,
    padded_shapes={"input_ids": [None], "labels": [None]},
    padding_values={"input_ids": PAD_TOKEN_ID, "labels": -100},  # ✅
)
# === Define model ===
model = AgroLensGPT(vocab_size=VOCAB_SIZE, max_length=MAX_LEN)
model.build(input_shape=(None, MAX_LEN))
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none"
)


# === Training loop ===
@tf.function
def train_step(input_ids, labels):
    with tf.GradientTape() as tape:
        logits = model(input_ids, training=True)
        mask = tf.cast(labels != -100, tf.float32)
        loss_vals = loss_fn(labels, logits)
        loss = tf.reduce_sum(loss_vals * mask) / tf.reduce_sum(mask)



    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss


EPOCHS = 100
for epoch in range(1, EPOCHS + 1):
    total_loss = 0.0
    steps = 0
    for batch in dataset:
        loss = train_step(batch["input_ids"], batch["labels"])
        total_loss += loss.numpy()
        steps += 1
    print(f"📘 Epoch {epoch}: Loss = {total_loss / steps:.4f}")

# === Save model weights ===
model.save_weights("agrolens_model_tf_rag.weights.h5")
print("✅ Weights saved to agrolens_model_tf_rag.weights.h5")





2025-06-06 22:52:02.839931: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 1: Loss = nan


2025-06-06 22:52:06.302471: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 2: Loss = nan
📘 Epoch 3: Loss = nan


2025-06-06 22:52:13.050572: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 4: Loss = nan
📘 Epoch 5: Loss = nan
📘 Epoch 6: Loss = nan
📘 Epoch 7: Loss = nan


2025-06-06 22:52:25.962528: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 8: Loss = nan
📘 Epoch 9: Loss = nan
📘 Epoch 10: Loss = nan
📘 Epoch 11: Loss = nan
📘 Epoch 12: Loss = nan
📘 Epoch 13: Loss = nan
📘 Epoch 14: Loss = nan
📘 Epoch 15: Loss = nan


2025-06-06 22:52:54.583020: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


📘 Epoch 16: Loss = nan
📘 Epoch 17: Loss = nan
📘 Epoch 18: Loss = nan


KeyboardInterrupt: 

In [None]:
# --- Konstanta dan Load Model ---
MODEL_PATH = "agrolens_model_tf.weights.h5"
RETRIEVER_WEIGHTS = "retriever_encoder_weights.weights.h5"
TOKENIZER_PATH = "tokenizer-agrolens.json"
VOCAB_SIZE = 8000
MAX_LEN = 512
TOP_K = 3

In [None]:
def generate(prompt: str, max_new_tokens: int = 50) -> str:
    """
    Generate teks dari model AgroLensGPT berbasis prompt dan tokenizer.

    Args:
        prompt (str): Prompt input dari user.
        max_new_tokens (int): Jumlah maksimum token baru yang dihasilkan.

    Returns:
        str: Hasil teks yang dihasilkan oleh model.
    """
    # Tokenisasi prompt dan potong jika terlalu panjang
    input_ids = tokenizer.encode(prompt).ids[:MAX_LEN]
    input_tensor = tf.constant([input_ids], dtype=tf.int32)

    for _ in range(max_new_tokens):
        # Prediksi logit dari model
        logits = model(input_tensor, training=False)

        # Ambil token berikutnya (greedy decoding)
        next_token = tf.argmax(logits[:, -1, :], axis=-1, output_type=tf.int32)

        # Gabungkan token ke input
        input_tensor = tf.concat(
            [input_tensor, tf.expand_dims(next_token, axis=1)], axis=1
        )

        # Stop jika token </s> ditemukan
        if next_token.numpy()[0] == tokenizer.token_to_id("</s>"):
            break

        # Stop jika melebihi panjang maksimum
        if input_tensor.shape[1] >= MAX_LEN:
            break

    # Decode seluruh output menjadi teks
    output_ids = input_tensor[0].numpy().tolist()
    return tokenizer.decode(output_ids)


if __name__ == "__main__":
    question = """<s> Berikut adalah informasi terkait:
...
Pertanyaan: Apa itu penyakit blast?
Jawaban: Blast disebabkan oleh jamur...
</s>
"""
    print("🌾 AgroLens Menjawab:")
    print(generate(question))

🌾 AgroLens Menjawab:
 Berikut adalah informasi terkait
...
Pertanyaan Apa itu penyakit blast?
Jawaban Blast disebabkan oleh jamur...
 
 air dan menyebar melalui serangga penghisap, varietas tahan, dan membentuk appressorium, ini belum terbukti efektif dalam konteks internasional.


In [None]:
import tensorflow as tf
from tensorflow.keras import layers


class AgroTransformerEncoder(tf.keras.Model):
    def __init__(
        self, vocab_size=8000, max_length=128, d_model=256, n_heads=4, dropout=0.1
    ):
        super().__init__()
        self.token_embed = layers.Embedding(input_dim=vocab_size, output_dim=d_model)
        self.pos_embed = layers.Embedding(input_dim=max_length, output_dim=d_model)

        self.attn = layers.MultiHeadAttention(num_heads=n_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential(
            [
                layers.Dense(d_model * 4, activation="relu"),
                layers.Dropout(dropout),
                layers.Dense(d_model),
            ]
        )
        self.dropout = layers.Dropout(dropout)
        self.ln1 = layers.LayerNormalization()
        self.ln2 = layers.LayerNormalization()

    def call(self, x, training=False):
        seq_len = tf.shape(x)[1]
        pos = tf.range(start=0, limit=seq_len, delta=1)
        pos = tf.expand_dims(pos, 0)
        x = self.token_embed(x) + self.pos_embed(pos)

        attn_output = self.attn(x, x, attention_mask=None, use_causal_mask=False)
        x = self.ln1(x + self.dropout(attn_output, training=training))

        ffn_output = self.ffn(x, training=training)
        x = self.ln2(x + self.dropout(ffn_output, training=training))

        x = tf.reduce_mean(x, axis=1)  # Global average pooling
        return x  # shape: (batch, d_model)

In [None]:
RETRIEVER_WEIGHTS = "retriever_encoder_weights.weights.h5"
tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
PAD_TOKEN_ID = tokenizer.token_to_id("<pad>")

# === Load GPT model ===
model = AgroLensGPT(vocab_size=VOCAB_SIZE, max_length=MAX_LEN)
model.build(input_shape=(None, MAX_LEN))
model.load_weights(MODEL_PATH)
# === Load retriever encoder ===
retriever_encoder = AgroTransformerEncoder(vocab_size=VOCAB_SIZE, max_length=MAX_LEN)
retriever_encoder.build(input_shape=(None, MAX_LEN))
retriever_encoder.load_weights(RETRIEVER_WEIGHTS)

# === Load corpus passages (e.g. response list) ===
with open("dataset.jsonl", "r", encoding="utf-8") as f:
    corpus = [json.loads(line)["response"] for line in f if "response" in line]


with open("rag_corpus.jsonl", "r", encoding="utf-8") as f:
    corpus = [json.loads(line) for line in f]


def detect_disease(prompt: str):
    p = prompt.lower()
    if "blast" in p:
        return "blast"
    elif "tungro" in p:
        return "tungro"
    # Tambahkan deteksi lain
    return None


def retrieve_top_k_contexts(query: str, top_k=3):
    disease = detect_disease(query)
    filtered = [c for c in corpus if c["disease"] == disease]

    query_ids = tokenizer.encode(query).ids[:MAX_LEN]
    q_tensor = tf.constant([query_ids], dtype=tf.int32)
    q_embed = retriever_encoder(q_tensor)

    passage_embeddings = []
    texts = []
    for entry in filtered:
        ids = tokenizer.encode(entry["text"]).ids[:MAX_LEN]
        t_tensor = tf.constant([ids], dtype=tf.int32)
        emb = retriever_encoder(t_tensor)
        passage_embeddings.append(emb[0].numpy())
        texts.append(entry["text"])

    sims = tf.linalg.matmul(q_embed, tf.transpose(tf.constant(passage_embeddings)))
    top_idx = tf.math.top_k(sims, k=min(top_k, len(texts))).indices.numpy()[0]
    return [texts[i] for i in top_idx]


# === Generate with GPT + context ===
def generate_rag(prompt: str, max_new_tokens: int = 50) -> str:
    top_contexts = retrieve_top_k_contexts(prompt)
    for i, ctx in enumerate(top_contexts):
        print(f"[Context {i+1}] {ctx[:100]}...")
    full_prompt = (
        "Berikut adalah informasi terkait:\n"
        + "\n".join(top_contexts)
        + f"\n\nPertanyaan: {prompt}\nJawaban:"
    )

    input_ids = tokenizer.encode(full_prompt).ids[:MAX_LEN]
    input_tensor = tf.constant([input_ids], dtype=tf.int32)

    for _ in range(max_new_tokens):
        # logits.shape: [1, seq_len, vocab_size]
        logits = model(input_tensor, training=False)
        logits = logits[:, -1, :]  # ambil logit terakhir, shape: [1, vocab_size]

        # Top-k sampling
        k = 10
        values, indices = tf.math.top_k(logits, k=k)  # [1, k]
        next_token = tf.random.categorical(values, num_samples=1)  # [1, 1]
        next_token = tf.gather(indices, next_token, batch_dims=1)  # [1, 1]
        input_tensor = tf.concat(
            [input_tensor, next_token], axis=1
        )  # now both [1, seq_len]

        if next_token.numpy()[0] == tokenizer.token_to_id("</s>"):
            break
        if input_tensor.shape[1] >= MAX_LEN:
            break

    output_ids = input_tensor[0].numpy().tolist()
    return tokenizer.decode(output_ids)


# === Run test ===
if __name__ == "__main__":
    question = "Apa itu penyakit blast?"
    print("🌾 AgroLens RAG Menjawab:")
    print(generate_rag(question))



🌾 AgroLens RAG Menjawab:
[Context 1] Pengendalian hayati untuk penyakit blast bersifat lebih ramah lingkungan dan berbiaya rendah dibandi...
[Context 2] Petani dapat menerapkan strategi seperti budidaya campuran varietas padi (multilines), penggunaan va...
[Context 3] Ya, musim tanam yang jatuh di musim hujan atau transisi hujan-kemarau meningkatkan risiko blast, kar...
 Berikut adalah informasi terkait
Pengendalian hayati untuk penyakit blast bersifat lebih ramah lingkungan dan berbiaya rendah dibandingkan penggunaan fungisida kimia. Agen hayati tidak mencemari lingkungan dan bisa menjadi alternatif yang berkelanjutan dalam jangka panjang.
Petani dapat menerapkan strategi seperti budidaya campuran varietas padi (multilines), penggunaan varietas tahan spektrum luas, serta pengurangan ketergantungan pada fungisida melalui pendekatan rekayasa ekologi. Strategi ini meningkatkan hasil dan ketahanan penyakit secara berkelanjutan.
Ya, musim tanam yang jatuh di musim hujan atau transisi hujan

In [None]:
import ipywidgets as widgets
from IPython.display import display, Markdown

input_box = widgets.Text(
    value="Apa itu penyakit blast?",
    placeholder="Tulis pertanyaan di sini...",
    description="❓ Pertanyaan:",
    layout=widgets.Layout(width="100%"),
)

output_box = widgets.Output()
generate_button = widgets.Button(
    description="Jawab 🚀", button_style="success", layout=widgets.Layout(width="15%")
)


def on_generate_clicked(b):
    prompt = input_box.value
    response = rag_generate(prompt)
    output_box.clear_output()
    with output_box:
        display(
            Markdown(
                f"### 🧑 Kamu: \n{prompt}\n---\n### 🌾 AgroLens Menjawab:\n{response}"
            )
        )


generate_button.on_click(on_generate_clicked)

# Tampilkan
display(widgets.VBox([input_box, generate_button, output_box]))

VBox(children=(Text(value='Apa itu penyakit blast?', description='❓ Pertanyaan:', layout=Layout(width='100%'),…

In [None]:
! pip


Usage:   
  pip <command> [options]

Commands:
  install                     Install packages.
  download                    Download packages.
  uninstall                   Uninstall packages.
  freeze                      Output installed packages in requirements format.
  inspect                     Inspect the python environment.
  list                        List installed packages.
  show                        Show information about installed packages.
  check                       Verify installed packages have compatible dependencies.
  config                      Manage local and global configuration.
  search                      Search PyPI for packages.
  cache                       Inspect and manage pip's wheel cache.
  index                       Inspect information available from package indexes.
  wheel                       Build wheels from your requirements.
  hash                        Compute hashes of package archives.
  completion                  A helper co

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
