# Install required libraries

* transformers -For loading the BERT model (mesolitica/bert-base-standard-bahasa-cased) and tokenizer
* torch -The backend engine that powers transformers models (e.g. embeddings)
* scikit-learn -Used to train and run the intent classification model (Logistic Regression)
* joblib -Saves and loads trained models efficiently (intent_classifier.pkl, label_encoder.pkl)

In [3]:
pip install transformers torch scikit-learn joblib

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

# Import libraries

In [4]:
# === 1. Library Imports ===
import json
import numpy as np
import joblib
import torch
import faiss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import os

# Load and Preprocess Intents

In [5]:
# === 2. Load and Preprocess Intents ===
with open("/kaggle/input/tnl6323/Intent.json", "r", encoding="utf-8") as f:
    intents = json.load(f)

texts, labels = [], []
for intent in intents["intents"]:
    for pattern in intent["patterns"]:
        texts.append(pattern)
        labels.append(intent["tag"])

label_encoder = LabelEncoder()
y_labels = label_encoder.fit_transform(labels)

# Load BERT Model for Embedding

In [6]:
# === 3. Load BERT Model for Embedding ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device used:", device)

tokenizer = AutoTokenizer.from_pretrained("mesolitica/bert-base-standard-bahasa-cased")
model = AutoModel.from_pretrained("mesolitica/bert-base-standard-bahasa-cased")
model.eval().to(device)

Device used: cuda


tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/233k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/697 [00:00<?, ?B/s]

2025-06-22 10:54:20.312126: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750589660.479143      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750589660.525125      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at mesolitica/bert-base-standard-bahasa-cased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

# Embedding Function

In [7]:
# === 4. Embedding Function ===
def get_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
        masked = hidden * mask
        summed = masked.sum(1)
        counts = mask.sum(1).clamp(min=1e-9)
        mean = summed / counts
    return mean[0].cpu().numpy()

# Generate Embeddings for Intents

In [8]:
# === 5. Generate Embeddings for Intents ===
X_embed = np.array([get_embedding(text) for text in tqdm(texts)])

100%|██████████| 114/114 [00:01<00:00, 89.01it/s] 


# Train Intent Classifier

In [9]:
# === 6. Train Intent Classifier ===
clf = LogisticRegression(max_iter=1000)
clf.fit(X_embed, y_labels)

# Save Classifier and Label Encoder

In [10]:
# === 7. Save Classifier and Label Encoder ===
os.makedirs("trained_models", exist_ok=True)
joblib.dump(clf, "trained_models/intent_classifier.pkl")
joblib.dump(label_encoder, "trained_models/label_encoder.pkl")

['trained_models/label_encoder.pkl']