In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; later cells write the model
# checkpoints under /content/drive/MyDrive.
drive.mount(mountpoint='/content/drive')

In [None]:
!pip install transformers datasets torch

In [None]:
import json

# Read the raw training records from the uploaded JSON file.
# The encoding is given explicitly so decoding does not depend on the
# runtime's locale default.
with open('/content/ROBERTa.json', encoding='utf-8') as f:
    raw_data = json.load(f)

In [None]:
from datasets import Dataset

# Wrap the list of JSON records in a Hugging Face Dataset so the following
# cells can use filter / map / train_test_split on it.
raw_data = Dataset.from_list(mapping=raw_data)

In [None]:
# For now we are not working on multi-doctype questions; RoBERTa can be
# trained on that case later to predict multiple labels.
def is_single_doctype(example):
    """Return True when the example's 'output' names exactly one doctype.

    Args:
        example: a record with an 'output' field.

    Returns:
        False when 'output' is not a string (a list, None, ...), or when the
        string contains ',' or the word ' and ' (case-insensitive), which this
        notebook treats as markers of a multi-doctype answer. True otherwise.
    """
    output = example['output']
    # Anything that is not a plain string cannot be a single doctype label;
    # this also keeps the substring checks below from raising on None.
    if not isinstance(output, str):
        return False
    # NOTE(review): a doctype whose own name contains ' and ' would also be
    # rejected here - confirm this is acceptable for the label set.
    return ',' not in output and ' and ' not in output.lower()

# Keep only the single-doctype examples, then make a stratified 80/20 split.
filtered_data = raw_data.filter(is_single_doctype)

# `train_test_split(stratify_by_column=...)` only accepts a ClassLabel column,
# while 'output' holds plain strings. Stratify on a temporary ClassLabel copy
# and drop it afterwards, so 'output' stays a string column for later cells.
STRATIFY_COLUMN = '_stratify_key'
stratifiable = filtered_data.add_column(
    STRATIFY_COLUMN, list(filtered_data['output'])
).class_encode_column(STRATIFY_COLUMN)
split = stratifiable.train_test_split(
    test_size=0.2, seed=42, stratify_by_column=STRATIFY_COLUMN
).remove_columns(STRATIFY_COLUMN)

train_dataset, val_dataset = split['train'], split['test']
print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))
print("Columns:", train_dataset.column_names)


In [None]:
# Build the label vocabulary from the filtered (single-doctype) examples only.
# Using the unfiltered data would turn every multi-doctype output string into
# a class that never appears in the training or validation split.
doctypes = sorted(filtered_data.unique('output'))

label2id = {label: idx for idx, label in enumerate(doctypes)}
id2label = {idx: label for label, idx in label2id.items()}


def encode_labels(example):
    """Add an integer 'label' field looked up from the example's 'output'."""
    example['label'] = label2id[example['output']]
    return example


train_dataset = train_dataset.map(encode_labels)
val_dataset = val_dataset.map(encode_labels)


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('hyrinmansoor/text2frappe-s1-roberta')


def preprocess_function(examples):
    """Tokenize a batch of 'input' texts, truncated/padded to 128 tokens."""
    encoded = tokenizer(
        examples['input'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    return encoded


# Tokenize both splits in batches (train first, then validation).
tokenized_train, tokenized_val = (
    subset.map(preprocess_function, batched=True)
    for subset in (train_dataset, val_dataset)
)

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub. In the notebook this
# renders an interactive login widget (see the cell output).
# Presumably needed for the upload cell at the end of the notebook - confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score

# Start from the existing checkpoint and attach this run's label mappings.
model_name = "hyrinmansoor/text2frappe-s1-roberta"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,  # doctype name -> class id
    id2label=id2label,  # class id -> doctype name
    # Allow loading even if the checkpoint's classifier head has a different
    # number of labels than the mappings above.
    ignore_mismatched_sizes=True,
)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Changai/S1/Model/ROBERTA_12-02-2025_1",
    # Optimisation settings.
    num_train_epochs=25,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and reload the best one when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

    The predicted class for each row is the argmax over the last axis of the
    logits.
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(-1)
    accuracy = accuracy_score(references, predicted_ids)
    macro_f1 = f1_score(references, predicted_ids, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}

# Wire the model, data splits and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Fine-tune, then run a final evaluation pass on the validation split.
trainer.train()
metrics = trainer.evaluate()
print(metrics)


In [None]:
# Save the model and tokenizer to Drive first; the saved folder is then
# tested/validated and uploaded to the Hugging Face Hub only if it is better.
save_dir = "/content/drive/MyDrive/Changai/S1/Model/ROBERTA_12-02-2025_1"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

Testing

In [None]:
import torch
import pandas as pd
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast

# Location of the checkpoint under test.
repo_id = "hyrinmansoor/text2frappe-s1-roberta"
subfolder = "ROBERTA_12-02-2025_1"

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE(review): this loads the checkpoint from the Hub repo, not from the
# Drive folder written by the save cell - confirm this is the copy that
# should be validated before uploading.
tokenizer = RobertaTokenizerFast.from_pretrained(repo_id, subfolder=subfolder)
model = RobertaForSequenceClassification.from_pretrained(
    repo_id,
    subfolder=subfolder,
    id2label=id2label,
)
model = model.to(device)
# model_path = "/content/drive/MyDrive/Changai/S1/Model/ROBERTA_12-02-2025_1"

# Hand-written smoke-test questions, grouped by the doctype the classifier is
# expected to predict for them.
_questions_by_doctype = {
    "Sales Invoice": [
        "How many sales invoices were issued in the last quarter?",
        "What is the total outstanding amount for all sales invoices?",
        "List all sales invoices with their status and customer name.",
        "What is the average sales invoice amount this quarter?",
        "Which customer has the highest total invoice amount this year?",
    ],
    "Purchase Invoice": [
        "How many purchase invoices were created this month?",
        "What is the total amount spent on purchase invoices last quarter?",
        "List all purchase invoices with supplier and payment status.",
        "What is the status of purchase invoice PI-0005?",
        "Which supplier has the highest total purchase invoice amount?",
    ],
    "Employee": [
        "How many employees are currently active?",
        "List all employees with their department and designation.",
        "What is the date of joining for employee EMP-0001?",
        "Which employee has the highest salary?",
        "How many employees are on probation?",
    ],
    "Sales Order": [
        "How many sales orders were created this month?",
        "What is the total amount for all sales orders this year?",
        "List all sales orders with their customer and status.",
        "What is the status of sales order SO-0003?",
        "How many sales orders are pending delivery?",
    ],
    "Purchase Order": [
        "How many purchase orders were created last month?",
        "What is the total value of all purchase orders this year?",
        "List all purchase orders with supplier and status.",
        "What is the status of purchase order PO-0007?",
        "How many purchase orders are pending receipt?",
    ],
    "Item": [
        "How many items are currently in stock?",
        "List all items with their item code and stock quantity.",
        "What is the stock quantity of item ITEM-0001?",
        "Which item has the highest stock quantity?",
        "How many items have no stock?",
    ],
}

# Flatten into one record per question, keeping the grouped order.
test_data = [
    {"question": question, "real_answer": doctype}
    for doctype, questions in _questions_by_doctype.items()
    for question in questions
]


# Classify every test question and record whether the predicted doctype
# matches the expected one.
results = []
for record in test_data:
    # Same tokenization settings as the training preprocessing (max 128 tokens).
    inputs = tokenizer(
        record["question"],
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    ).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax(dim=-1).item()

    # Decode through the mapping attached to the loaded model.
    predicted_doctype = model.config.id2label[predicted_class_id]
    results.append({
        "Question": record["question"],
        "Real Answer": record["real_answer"],
        "Model Prediction": predicted_doctype,
        "Correct?": "✅" if predicted_doctype == record["real_answer"] else "❌",
    })

# Summarise, then show the per-question table as the cell output.
n_correct = sum(row["Model Prediction"] == row["Real Answer"] for row in results)
print(f"Correct: {n_correct}/{len(results)}")
pd.DataFrame(results)

Upload the saved model folder to the Hugging Face Hub using the repo ID

In [None]:
from huggingface_hub import upload_folder

# Push the contents of the Drive model folder to the model repo on the Hub.
upload_folder(
    folder_path="/content/drive/MyDrive/Changai/S1/Model/ROBERTA_12-02-2025_1",
    repo_id="hyrinmansoor/text2frappe-s1-roberta",
    repo_type="model",
    ignore_patterns=["*.tmp", "*.log"],  # optional: skip temp and log files
    commit_message="Upload updated model folder",
)
