# Tokenizer

In [1]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers, decoders
from tokenizers.processors import TemplateProcessing
import json

# === Load data ===
with open("baseline-dataset.jsonl", "r", encoding="utf-8") as f:
    texts = []
    for line in f:
        data = json.loads(line)
        texts.append(data["prompt"])
        texts.append(data["response"])

# === Write to plain text (required by tokenizer trainer) ===
with open("tokenizer_corpus.txt", "w", encoding="utf-8") as f:
    for text in texts:
        f.write(text.strip() + "\n")

# === Init tokenizer ===
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
trainer = trainers.BpeTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=["<pad>", "<s>", "</s>", "<unk>"],
)

# === Train ===
tokenizer.train(["tokenizer_corpus.txt"], trainer)

# === Post-processing untuk auto menambahkan <s> dan </s> saat encoding
tokenizer.post_processor = TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> <s> $B </s>",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

tokenizer.decoder = decoders.ByteLevel()

# === Save ===
tokenizer.save("tokenizer-agrolens.json")   
print("✅ Tokenizer saved to tokenizer-agrolens.json")




✅ Tokenizer saved to tokenizer-agrolens.json


In [2]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer-agrolens.json")
enc = tokenizer.encode("Apa itu penyakit blast?")
print(enc.tokens)
print(enc.ids)

['<s>', 'ĠApa', 'Ġitu', 'Ġpenyakit', 'Ġblast', '?', '</s>']
[1, 187, 353, 141, 212, 10, 2]


# GPTModel

In [3]:
import torch
import torch.nn as nn


class AgroLensGPT(nn.Module):
    def __init__(
        self,
        vocab_size,
        max_length=512,
        d_model=256,
        n_heads=4,
        n_layers=4,
        dropout=0.1,
    ):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_length, d_model)

        self.blocks = nn.ModuleList(
            [
                nn.TransformerDecoderLayer(
                    d_model=d_model,
                    nhead=n_heads,
                    dim_feedforward=d_model * 4,
                    dropout=dropout,
                    batch_first=True,
                )
                for _ in range(n_layers)
            ]
        )

        self.ln = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        B, T = x.size()
        token_emb = self.token_embed(x)  # (B, T, d_model)
        pos = torch.arange(0, T, device=x.device).unsqueeze(0)
        pos_emb = self.pos_embed(pos)  # (1, T, d_model)
        h = token_emb + pos_emb  # (B, T, d_model)

        # Causal mask: mencegah token melihat ke depan
        causal_mask = torch.tril(torch.ones(T, T, device=x.device)).bool()

        for block in self.blocks:
            h = block(h, h, tgt_mask=causal_mask)

        out = self.ln(h)
        logits = self.head(out)  # (B, T, vocab_size)
        return logits

In [4]:
model = AgroLensGPT(vocab_size=8000)
sample_input = torch.randint(0, 8000, (2, 64))  # 2 batch, 64 token
logits = model(sample_input)
print(logits.shape)  # Expected: (2, 64, 8000)

torch.Size([2, 64, 8000])


# Loader dan Tokenizer

In [5]:
import json
import torch
from torch.utils.data import Dataset
from tokenizers import Tokenizer


class AgroDataset(Dataset):
    def __init__(self, path, tokenizer_path, max_len=256):
        self.samples = []
        self.tokenizer = Tokenizer.from_file(tokenizer_path)
        self.max_len = max_len

        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                prompt = data["prompt"]
                response = data["response"]
                combined = f"{prompt} {response}"

                # encode + truncation
                encoded = self.tokenizer.encode(combined).ids[:max_len]
                self.samples.append(torch.tensor(encoded, dtype=torch.long))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        input_ids = self.samples[idx]
        return {
            "input_ids": input_ids[:-1],  # input
            "labels": input_ids[1:],  # label shifted right
        }

In [6]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
# --- Hyperparameters ---
BATCH_SIZE = 16
EPOCHS = 100
LR = 3e-4
MAX_LEN = 256
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Dataset & Dataloader ---
dataset = AgroDataset(
    "baseline-dataset.jsonl", "tokenizer-agrolens.json", max_len=MAX_LEN
)


def collate_fn(batch):
    inputs = [b["input_ids"] for b in batch]
    labels = [b["labels"] for b in batch]
    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)
    return {"input_ids": inputs, "labels": labels}


loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# --- Model ---
model = AgroLensGPT(vocab_size=8000).to(DEVICE)
optimizer = Adam(model.parameters(), lr=LR)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)

# --- Training ---
model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    for batch in loader:
        input_ids = batch["input_ids"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        logits = model(input_ids)
        loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"📘 Epoch {epoch+1}: Loss = {avg_loss:.4f}")

torch.save(model.state_dict(), "agrolens_model.pt")

📘 Epoch 1: Loss = 9.1345
📘 Epoch 2: Loss = 8.4741
📘 Epoch 3: Loss = 8.0600
📘 Epoch 4: Loss = 7.7923
📘 Epoch 5: Loss = 7.6031
📘 Epoch 6: Loss = 7.4258
📘 Epoch 7: Loss = 7.2488
📘 Epoch 8: Loss = 7.0591
📘 Epoch 9: Loss = 6.8465
📘 Epoch 10: Loss = 6.6029
📘 Epoch 11: Loss = 6.3638
📘 Epoch 12: Loss = 6.1495
📘 Epoch 13: Loss = 5.9286
📘 Epoch 14: Loss = 5.7096
📘 Epoch 15: Loss = 5.5087
📘 Epoch 16: Loss = 5.3159
📘 Epoch 17: Loss = 5.0925
📘 Epoch 18: Loss = 4.8961
📘 Epoch 19: Loss = 4.6962
📘 Epoch 20: Loss = 4.4952
📘 Epoch 21: Loss = 4.3001
📘 Epoch 22: Loss = 4.0997
📘 Epoch 23: Loss = 3.9191
📘 Epoch 24: Loss = 3.7301
📘 Epoch 25: Loss = 3.5404
📘 Epoch 26: Loss = 3.3745
📘 Epoch 27: Loss = 3.2005
📘 Epoch 28: Loss = 3.0384
📘 Epoch 29: Loss = 2.8828
📘 Epoch 30: Loss = 2.7317
📘 Epoch 31: Loss = 2.5692
📘 Epoch 32: Loss = 2.4292
📘 Epoch 33: Loss = 2.2894
📘 Epoch 34: Loss = 2.1354
📘 Epoch 35: Loss = 2.0201
📘 Epoch 36: Loss = 1.9011
📘 Epoch 37: Loss = 1.7863
📘 Epoch 38: Loss = 1.6630
📘 Epoch 39: Loss = 1.

In [7]:
import torch
from tokenizers import Tokenizer

# === Konfigurasi ===
MODEL_PATH = "agrolens_model.pt"
TOKENIZER_PATH = "tokenizer-agrolens.json"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 128
VOCAB_SIZE = 8000

# === Load model dan tokenizer ===
model = AgroLensGPT(vocab_size=VOCAB_SIZE).to(DEVICE)
model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
model.eval()

tokenizer = Tokenizer.from_file(TOKENIZER_PATH)


def generate(prompt: str, max_new_tokens=50):
    # Tokenisasi prompt
    input_ids = tokenizer.encode(prompt).ids
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(DEVICE)

    for _ in range(max_new_tokens):
        logits = model(input_tensor)
        next_token = logits[:, -1, :].argmax(dim=-1)
        input_tensor = torch.cat([input_tensor, next_token.unsqueeze(1)], dim=1)

        # Stop if </s> token muncul
        if next_token.item() == tokenizer.token_to_id("</s>"):
            break

    output_ids = input_tensor[0].tolist()
    return tokenizer.decode(output_ids)


  model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))


In [11]:
import ipywidgets as widgets
from IPython.display import display, Markdown

# Input box dan tombol
input_box = widgets.Text(
    value="Apa itu penyakit blast?",
    placeholder="Tulis pertanyaan di sini...",
    description="❓ Pertanyaan:",
    layout=widgets.Layout(width="100%"),
)

output_box = widgets.Output()
generate_button = widgets.Button(
    description="Jawab 🚀", button_style="success", layout=widgets.Layout(width="15%")
)


# Fungsi saat tombol diklik
def on_generate_clicked(b):
    prompt = input_box.value
    response = generate(prompt)
    output_box.clear_output()
    with output_box:
        display(
            Markdown(
                f"### 🧑 Kamu: \n{prompt}\n---\n### 🌾 AgroLens Menjawab:\n{response}"
            )
        )


generate_button.on_click(on_generate_clicked)

# Tampilkan
display(widgets.VBox([input_box, generate_button, output_box]))

VBox(children=(Text(value='Apa itu penyakit blast?', description='❓ Pertanyaan:', layout=Layout(width='100%'),…