In [None]:
# Attach Google Drive to the Colab runtime; the save and inference cells
# further down read and write under /content/drive/MyDrive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
!pip install transformers
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [None]:
import torch
from transformers import AutoTokenizer
from datasets import Dataset
import json

# Read the raw examples. The file must contain a JSON list of objects that each
# carry an "input" (query text) and an "output" (intent label) key.
# The encoding is given explicitly so decoding does not depend on the platform
# default locale.
with open('/content/intent_classification_erp_dataset_updated.json', 'r', encoding='utf-8') as f:
    raw_records = json.load(f)

# Prefix every query with the same instruction text. Anything fed to the model
# at inference time has to carry this exact prefix as well.
prompt = "Classify the intent of the following query: "
data = [
    {"input": prompt + record["input"], "output": record["output"]}
    for record in raw_records
]

# Wrap the prompted records in a Hugging Face Dataset for tokenization/training
dataset = Dataset.from_list(data)

# Tokenizer belonging to the same "roberta-base" checkpoint that is fine-tuned later
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def tokenize_function(examples):
    """Tokenize the "input" column of a batch of examples.

    Texts are truncated and padded with ``padding="max_length"``; no explicit
    ``max_length`` is passed, so the tokenizer's own limit decides the width.
    Every row therefore comes out the same length, which lets the Trainer batch
    them without a padding collator.

    Args:
        examples: batch mapping as supplied by ``Dataset.map(..., batched=True)``;
            only the "input" entry is read.

    Returns:
        The tokenizer's encoding for the batch.
    """
    texts = examples["input"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Batched map: the tokenizer receives lists of texts rather than one row at a time
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Label vocabulary: the distinct "output" values in sorted order, so the
# label -> id assignment is the same on every run over the same data.
unique_labels = sorted({record["output"] for record in data})
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

# Add an integer "label" column derived from each row's "output" string
tokenized_dataset = tokenized_dataset.map(
    lambda row: {"label": label2id[row["output"]]}
)

# Persist both directions of the mapping in the working directory.
# JSON object keys are always strings, so id2label.json stores "0", "1", ...
with open('id2label.json', 'w') as id2label_file:
    json.dump(id2label, id2label_file)
with open('label2id.json', 'w') as label2id_file:
    json.dump(label2id, label2id_file)

In [None]:
# Hold out 20% of the examples for evaluation. The shuffle is seeded so the
# same rows end up in each split on every run; without it the evaluation
# numbers are not comparable between runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

In [None]:
# Start from the pretrained "roberta-base" checkpoint with a classification
# head that has one output per distinct intent label.
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(unique_labels))

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the fine-tuning run
training_args = TrainingArguments(
    # Checkpoints are written here
    output_dir="./results",
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    # when training finishes (the two strategies are kept identical for that).
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)


In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Without a metrics function the Trainer only reports the evaluation loss,
    which says little about how often the predicted intent is right.

    Args:
        eval_pred: object handed over by the Trainer, exposing ``predictions``
            (model logits) and ``label_ids`` (gold label ids).

    Returns:
        ``{"accuracy": <fraction of rows whose arg-max logit equals the label>}``.
    """
    import numpy as np  # local import keeps this cell self-contained

    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        # Defensive: if several outputs are returned, the logits come first.
        logits = logits[0]
    predicted_ids = np.argmax(logits, axis=-1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


# Bind model, hyper-parameters and both splits together. The metric for picking
# the best checkpoint is left at its default, so model selection is unchanged;
# accuracy is reported in addition to the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
# Run the fine-tuning loop configured above
trainer.train()

# Score the held-out split; the metrics dict is left as the last expression so
# the notebook displays it as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics


In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side on Drive,
# so the inference cell can reload both from a single directory.
model_dir = "/content/drive/MyDrive/roberta_intent_classifier"

model.save_pretrained(save_directory=model_dir)
tokenizer.save_pretrained(save_directory=model_dir)

In [None]:
# Hand-written probe queries for the inference cell.
# ERP Complete Intent: Uncommon or complex ERP requests — every query in this
# list is expected to be classified as "erp_complete".
test_cases = [
    {"query": text, "expected_intent": "erp_complete"}
    for text in (
        "Show invoices created today with PO number filled in.",
        "Do any invoices have PO details referencing 'Urgent'?",
        "Find invoices where customer PO info includes 'QTR2025'.",
        "How many POS invoices had customer PO data?",
    )
]


In [None]:
from transformers import pipeline
import torch

# Rebuild the classifier from the directory the training run saved to; the same
# path provides both the model weights and the tokenizer files.
model_dir = "/content/drive/MyDrive/roberta_intent_classifier"

classifier = pipeline(
    task="text-classification",
    model=model_dir,
    tokenizer=model_dir,
    # first CUDA device when one is available, otherwise CPU (-1)
    device=-1 if not torch.cuda.is_available() else 0,
)

import re

# Your label mapping (example - adjust to your actual mapping).
# Only consulted when the pipeline returns generic "LABEL_<n>" names.
id2label = {
    0: "complete_question",
    1: "erp_complete",
    2: "followup_or_clarification",
    3: "greeting",
    4: "out_of_scope",
}

# Must stay identical to the prefix that was prepended to every training
# example; sending bare queries would give the model inputs it was never
# trained on.
INTENT_PROMPT = "Classify the intent of the following query: "


def resolve_intent(label, mapping):
    """Translate a pipeline label into an intent name.

    Args:
        label: the "label" string of one pipeline prediction.
        mapping: dict from integer class id to intent name.

    Returns:
        ``mapping[n]`` when ``label`` has the generic form "LABEL_<n>" and ``n``
        is a known id; otherwise ``label`` unchanged (it is then either already
        an intent name or an id the mapping does not cover).
    """
    generic = re.fullmatch(r"LABEL_(\d+)", label)
    if generic is None:
        return label
    return mapping.get(int(generic.group(1)), label)


matched = 0
for case in test_cases:
    query = case["query"]
    # Classify the prompted text, but report the raw query to the reader.
    result = classifier(INTENT_PROMPT + query)
    predicted_intent = resolve_intent(result[0]['label'], id2label)
    matched += predicted_intent == case["expected_intent"]
    print(f"Query: {query}")
    print(f"Predicted intent: {predicted_intent}")
    print(f"Expected intent: {case['expected_intent']}")
    print("---")

# Summarise instead of leaving the comparison to the reader's eyes
print(f"Matched {matched}/{len(test_cases)} expected intents")
