In [None]:
import pandas as pd
import json
import random


def load_combined_dataset(excel_path, sheet_names):
    all_data = []
    for sheet in sheet_names:
        df = pd.read_excel(excel_path, sheet_name=sheet, engine="openpyxl")
        df = df.rename(columns=str.lower)
        df["penyakit"] = sheet.lower()
        all_data.append(df)
    return pd.concat(all_data, ignore_index=True)


def generate_triplets(df, n_negatives=2, add_soft_negative=True, seed=42):
    triplets = []
    for idx, row in df.iterrows():
        query = row["prompt"]
        positive = row["response"]
        penyakit = row["penyakit"]
        topik = row.get("topik", None)

        # Hard negatives dari penyakit lain
        hard_pool = df[df["penyakit"] != penyakit]
        hard_negatives = (
            hard_pool["response"]
            .dropna()
            .sample(n=min(n_negatives, len(hard_pool)), random_state=seed)
            .tolist()
        )

        # Soft negative dari penyakit sama tapi topik beda
        soft_negative = None
        if add_soft_negative and topik:
            soft_pool = df[
                (df["penyakit"] == penyakit)
                & (df["topik"] != topik)
                & (df["response"].notna())
            ]
            if not soft_pool.empty:
                soft_negative = soft_pool.sample(n=1, random_state=seed)[
                    "response"
                ].iloc[0]

        # Gabungkan negatives
        negatives = hard_negatives
        if soft_negative:
            negatives.append(soft_negative)

        triplets.append({"query": query, "positive": positive, "negatives": negatives})
    return triplets


def save_triplets_to_jsonl(triplets, output_path):
    with open(output_path, "w", encoding="utf-8") as f:
        for item in triplets:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")


# ======== PARAMETER ========
excel_path = "../../datasets/xlsx/Dataset Labeling Chatbot.xlsx"
sheet_names = ["Blast", "Tungro"]
output_file = "triplet_dataset_with_soft_negative.jsonl"
# ===========================

df = load_combined_dataset(excel_path, sheet_names)
triplets = generate_triplets(df, n_negatives=2, add_soft_negative=  True)
save_triplets_to_jsonl(triplets, output_file)

print(f"✅ Triplet dataset with soft negatives saved to: {output_file}")

✅ Triplet dataset with soft negatives saved to: triplet_dataset_with_soft_negative.jsonl


In [None]:
import json

# === Load original dataset ===
with open("dataset.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# === Format standar: prompt + response saja ===
standard_data = []
for item in data:
    prompt = item["prompt"].strip()
    response = item["response"].strip()
    standard_data.append({"prompt": prompt, "response": response})

# === Simpan ke file JSONL ===
with open("dataset_standard.jsonl", "w", encoding="utf-8") as f:
    for item in standard_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("✅ dataset_standard.jsonl berhasil dibuat.")

✅ dataset_ragstyle.jsonl berhasil dibuat.


In [13]:
corpus = [{"text": item["response"], "disease": item["disease"]} for item in rag_data]
with open("rag_corpus.jsonl", "w", encoding="utf-8") as f:
    for item in corpus:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [2]:
import tensorflow as tf
from tensorflow.keras import layers
import json
from tokenizers import Tokenizer
import numpy as np

2025-06-05 21:59:41.530372: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-05 21:59:41.539939: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749131981.550126   21636 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749131981.553040   21636 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749131981.559932   21636 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [3]:
VOCAB_SIZE = 8000
MAX_LEN = 128
D_MODEL = 256
N_HEADS = 4
BATCH_SIZE = 16
EPOCHS = 5

In [4]:
tokenizer = Tokenizer.from_file(
    "tokenizer-agrolens.json"
)  # ganti sesuai path tokenizer kamu
def encode(text):
    ids = tokenizer.encode(text).ids[:MAX_LEN]
    ids = ids + [0] * (MAX_LEN - len(ids))  # padding
    return tf.constant(ids, dtype=tf.int32)

In [5]:
def load_dataset(path):
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            ex = json.loads(line)
            yield {
                "query_input": encode(ex["query"]),
                "passage_input": encode(ex["positive"]),
            }


def get_tf_dataset(path):
    ds = tf.data.Dataset.from_generator(
        lambda: load_dataset(path),
        output_signature={
            "query_input": tf.TensorSpec(shape=(MAX_LEN,), dtype=tf.int32),
            "passage_input": tf.TensorSpec(shape=(MAX_LEN,), dtype=tf.int32),
        },
    )
    return ds.shuffle(1000).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


dataset = get_tf_dataset("triplet_dataset_with_soft_negative.jsonl")

I0000 00:00:1749131984.010201   21636 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [6]:
import tensorflow as tf
from tensorflow.keras import layers


class AgroTransformerEncoder(tf.keras.Model):
    def __init__(
        self, vocab_size=8000, max_length=128, d_model=256, n_heads=4, dropout=0.1
    ):
        super().__init__()
        self.token_embed = layers.Embedding(input_dim=vocab_size, output_dim=d_model)
        self.pos_embed = layers.Embedding(input_dim=max_length, output_dim=d_model)

        self.attn = layers.MultiHeadAttention(num_heads=n_heads, key_dim=d_model)
        self.ffn = tf.keras.Sequential(
            [
                layers.Dense(d_model * 4, activation="relu"),
                layers.Dropout(dropout),
                layers.Dense(d_model),
            ]
        )
        self.dropout = layers.Dropout(dropout)
        self.ln1 = layers.LayerNormalization()
        self.ln2 = layers.LayerNormalization()

    def call(self, x, training=False):
        seq_len = tf.shape(x)[1]
        pos = tf.range(start=0, limit=seq_len, delta=1)
        pos = tf.expand_dims(pos, 0)
        x = self.token_embed(x) + self.pos_embed(pos)

        attn_output = self.attn(x, x, attention_mask=None, use_causal_mask=False)
        x = self.ln1(x + self.dropout(attn_output, training=training))

        ffn_output = self.ffn(x, training=training)
        x = self.ln2(x + self.dropout(ffn_output, training=training))

        x = tf.reduce_mean(x, axis=1)  # Global average pooling
        return x  # shape: (batch, d_model)

In [7]:
encoder = AgroTransformerEncoder()

In [8]:
def contrastive_loss(query_embed, passage_embed):
    sim_matrix = tf.matmul(query_embed, passage_embed, transpose_b=True)  # (B, B)
    labels = tf.range(tf.shape(sim_matrix)[0])
    loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels, sim_matrix, from_logits=True
    )
    return tf.reduce_mean(loss)


optimizer = tf.keras.optimizers.Adam()

In [9]:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    epoch_loss = []
    for batch in dataset:
        with tf.GradientTape() as tape:
            q_embed = encoder(batch["query_input"], training=True)
            p_embed = encoder(batch["passage_input"], training=True)
            loss = contrastive_loss(q_embed, p_embed)
        grads = tape.gradient(loss, encoder.trainable_variables)
        optimizer.apply_gradients(zip(grads, encoder.trainable_variables))
        epoch_loss.append(loss.numpy())

    print(f"Loss: {np.mean(epoch_loss):.4f}")

# ======= 8. Save Encoder =======
encoder.save_weights("retriever_encoder_weights.weights.h5")

Epoch 1/5


2025-06-05 21:59:51.372563: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Loss: 5.2228
Epoch 2/5


2025-06-05 21:59:52.507434: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Loss: 2.7777
Epoch 3/5
Loss: 2.7653
Epoch 4/5


2025-06-05 21:59:54.748732: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Loss: 2.7509
Epoch 5/5
Loss: 2.7135
