# Create labels for routing

In [None]:
def rule_based_classify(text):
    """Label a query as ``"multiple"`` or ``"prediction"`` by phrase matching.

    Args:
        text: The query string to inspect.

    Returns:
        ``"multiple"`` when ``text`` contains at least one of the instruction
        phrases listed below (case-sensitive substring match), otherwise
        ``"prediction"``.
    """
    # Thai and English instruction phrases ("read the question",
    # "answer the question", "Read the", "Answer the"). A hit on any of
    # them yields the "multiple" label.
    instruction_phrases = ("อ่านคำถาม", "ตอบคำถาม", "Read the", "Answer the")

    for phrase in instruction_phrases:
        if phrase in text:
            return "multiple"

    # No instruction phrase found: fall back to the "prediction" label.
    return "prediction"

# Marker words/symbols that split_text strips from the joined text.
# They are removed in list order, wherever they occur in the text.
remove = ["คำถาม", "Question", ":", "Q", "บริบท", "Context", "Answer", "คำตอบ"]


def split_text(text):
    """Drop the first line of ``text`` and strip marker words from the rest.

    Everything after the first newline is kept, the remaining newlines are
    turned into single spaces, and every entry of the module-level ``remove``
    list is then deleted from the result.

    Args:
        text: The raw query string.

    Returns:
        The cleaned text. This is an empty string when ``text`` contains no
        newline, because there is nothing after the first line.
    """
    _first_line, _newline, remainder = text.partition("\n")
    cleaned = remainder.replace("\n", " ")
    for marker in remove:
        cleaned = cleaned.replace(marker, "")
    return cleaned

In [None]:
import pandas as pd

# Load the CSV into a DataFrame; the lines below read its "query" column.
test = pd.read_csv('/home/siamai/data/Focus/agentic/data/test.csv')
# Derive a rule-based routing label ("multiple" or "prediction") per query.
test["message_type"] = test["query"].apply(rule_based_classify)
# Keep a cleaned copy of each query (first line dropped, marker words removed).
test["message_sliced"] = test["query"].apply(split_text)
# Bare expression: shows the DataFrame as the notebook cell output.
test

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd

# Prepare dataset: work on a copy so the `test` DataFrame itself is not modified.
df = test.copy()
# Encode the rule-based label as an integer: 1 for "prediction", 0 for anything else.
df["labels"] = df["message_type"].apply(lambda x: 1 if x == "prediction" else 0)  # MUST be 'labels'

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Load tokenizer and model from the same checkpoint name; the classification
# model is created with two output labels to match the 0/1 encoding above.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenization step passed to Dataset.map; keeps 'labels' on the output
def tokenize(example):
    """Tokenize the ``message_sliced`` text of ``example`` and attach its label.

    Args:
        example: A mapping that provides a ``"message_sliced"`` entry (the
            text to encode) and a ``"labels"`` entry.

    Returns:
        The output of the module-level ``tokenizer`` call, with ``"labels"``
        set to ``example["labels"]``.
    """
    encoded = tokenizer(
        example["message_sliced"],
        padding="max_length",
        truncation=True,
    )
    encoded["labels"] = example["labels"]
    return encoded

# Tokenize: run `tokenize` over the dataset
tokenized_dataset = dataset.map(tokenize)

# Split into training and evaluation subsets.
# NOTE(review): test_size=0.6 sends the larger share (60%) to evaluation and
# no seed is passed, so the split differs between runs — confirm both are intended.
split_dataset = tokenized_dataset.train_test_split(test_size=0.6)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Training arguments
# NOTE(review): output_dir=None relies on the installed transformers version
# accepting a missing output directory — TODO confirm, or set an explicit path.
training_args = TrainingArguments(
    output_dir=None,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    eval_strategy="steps",  # evaluate on a step schedule (see eval_steps)
    eval_steps=20,
    save_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="none"  # no external experiment-tracking integration
)

# Optional: Accuracy metric
from sklearn.metrics import accuracy_score,f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: An object that unpacks into two items: the model scores
            (one row per example, one column per class) and the reference
            labels.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1_score"``.
    """
    scores, references = eval_pred
    # The predicted class is the column with the highest score in each row.
    predicted = scores.argmax(axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1_score"] = f1_score(references, predicted)
    return metrics

# Trainer: wires together the model, the training arguments, the two dataset
# splits, and the metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Train (runs the fine-tuning loop configured by training_args)
trainer.train()

In [None]:
# Save the model and its tokenizer into the same directory
# (path is relative to the notebook's working directory).
model.save_pretrained("../model/xlm_routing")
tokenizer.save_pretrained("../model/xlm_routing")

In [None]:
# Compute the confusion matrix on the evaluation split
from sklearn.metrics import confusion_matrix

# Run the trained model on the evaluation split; only the first of the three
# returned items (the raw prediction scores) is used here.
predictions, _, _ = trainer.predict(eval_dataset)
# Predicted class per example = index of the highest score in each row.
y_pred = predictions.argmax(axis=1)

# Reference labels first, predicted labels second.
cm = confusion_matrix(eval_dataset["labels"], y_pred)
# Bare expression: shows the matrix as the notebook cell output.
cm