In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
!pip install transformers datasets torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
import json
with open('/content/drive/MyDrive/Changai/S1/Datasets/roberto_s1_v2.json') as f:
    raw_data = json.load(f)

In [4]:
from datasets import Dataset
raw_data = Dataset.from_list(raw_data)
split_dataset = raw_data.train_test_split(test_size=0.2, seed=42)

In [5]:
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))

Train size: 15859
Validation size: 3965


In [None]:
# Remove 'instruction' from both datasets
train_dataset = train_dataset.remove_columns(['instruction'])
val_dataset = val_dataset.remove_columns(['instruction'])

In [7]:
print(train_dataset.column_names)

['instruction', 'input', 'output']


In [8]:
doctypes = sorted(list(set([record['output'] for record in raw_data])))

label2id = {label: idx for idx, label in enumerate(doctypes)}
id2label = {idx: label for label, idx in label2id.items()}
def encode_labels(example):
    example['label'] = label2id[example['output']]
    return example

train_dataset = train_dataset.map(encode_labels)
val_dataset = val_dataset.map(encode_labels)
print(doctypes)

Map:   0%|          | 0/15859 [00:00<?, ? examples/s]

Map:   0%|          | 0/3965 [00:00<?, ? examples/s]

['About Us Settings', 'About Us Team Member', 'Access Log', 'Account', 'Account Closing Balance', 'Accounting Dimension', 'Accounting Dimension Detail', 'Accounting Dimension Filter', 'Accounting Period', 'Accounts Receivable', 'Accounts Settings', 'Activity Cost', 'Activity Log', 'Activity Type', 'Additional Salary', 'Address', 'Address Template', 'Advance Payment Ledger Entry', 'Advance Tax', 'Advance Taxes and Charges', 'Allowed Dimension', 'Allowed To Transact With', 'Amended Document Naming Settings', 'Applicable On Account', 'Appointment', 'Appointment Booking Settings', 'Appointment Booking Slots', 'Approval', 'Asset', 'Asset Activity', 'Asset Capitalization', 'Asset Capitalization Asset Item', 'Asset Capitalization Service Item', 'Asset Capitalization Stock Item', 'Asset Capitalization, Company', 'Asset Capitalization, Payment Entry Reference', 'Asset Capitalization, Process Payment Reconciliation Log', 'Asset Capitalization, Process Payment Reconciliation Log, Company', 'Asset

In [9]:
import csv

# Save to CSV format
with open("doctype_mapping.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Doctype", "ID"])  # Header
    for label, idx in label2id.items():
        writer.writerow([label, idx])

print("Saved as doctype_mapping.csv")

Saved as doctype_mapping.csv


In [10]:
# As for now we are not working on the multi doctypes questions, roberto can be trained on that case later to predict multi labels.
def is_single_doctype(example):
    output = example['output']
    # If output is a list (bad), or output contains ',' or ' and '
    if isinstance(output, list):
        return False
    if ',' in output or ' and ' in output.lower():
        return False
    return True

# Filter train and val datasets
train_dataset = train_dataset.filter(is_single_doctype)
val_dataset = val_dataset.filter(is_single_doctype)

# Check sizes after cleaning
print("Train size after cleaning:", len(train_dataset))
print("Validation size after cleaning:", len(val_dataset))

Filter:   0%|          | 0/15859 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3965 [00:00<?, ? examples/s]

Train size after cleaning: 15419
Validation size after cleaning: 3857


In [11]:
train_dataset[4:9]

{'instruction': ['Predict the relevant ERPNext Doctype(s) for the question below.',
  'Predict the relevant ERPNext Doctype(s) for the question below.',
  'Predict the relevant ERPNext Doctype(s) for the question below.',
  'Predict the relevant ERPNext Doctype(s) for the question below.',
  'Predict the relevant ERPNext Doctype(s) for the question below.'],
 'input': ['Is there a way to manage grouped quality procedures in the system?',
  'Which BOM records were updated in this batch run?',
  'How do I set the allowed dimensions for my accounts?',
  'Does the expense account automatically default based on the item category or company settings?',
  'Is the disabled checkbox checked for this category?'],
 'output': ['Quality Procedure',
  'BOM Update Batch',
  'Allowed Dimension',
  'Purchase Order Item',
  'Tax Category'],
 'label': [572, 72, 20, 546, 718]}

In [12]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def preprocess_function(examples):
    return tokenizer(examples['input'], truncation=True, padding="max_length")

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/15419 [00:00<?, ? examples/s]

Map:   0%|          | 0/3857 [00:00<?, ? examples/s]

In [14]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

model_name = "hyrinmansoor/text2frappe-s1-roberta"  # can be swapped anytime
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [None]:
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


In [None]:
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


In [None]:
!pip install -U transformers

In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Changai/S1/Model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    report_to="none"
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

Epoch,Training Loss,Validation Loss
1,2.7822,2.331587
2,2.0443,1.908045
3,1.5808,1.651558
4,1.2922,1.504325
5,1.1334,1.458199


{'eval_loss': 1.4581986665725708,
 'eval_runtime': 25.6448,
 'eval_samples_per_second': 150.401,
 'eval_steps_per_second': 9.437,
 'epoch': 5.0}

In [19]:
model.save_pretrained("/content/drive/MyDrive/Changai/S1/Model")
tokenizer.save_pretrained("/content/drive/MyDrive/Changai/S1/Model")

('/content/drive/MyDrive/Changai/S1/Model/tokenizer_config.json',
 '/content/drive/MyDrive/Changai/S1/Model/special_tokens_map.json',
 '/content/drive/MyDrive/Changai/S1/Model/vocab.json',
 '/content/drive/MyDrive/Changai/S1/Model/merges.txt',
 '/content/drive/MyDrive/Changai/S1/Model/added_tokens.json',
 '/content/drive/MyDrive/Changai/S1/Model/tokenizer.json')

In [None]:
from huggingface_hub import login

login()

In [None]:
from huggingface_hub import create_repo

create_repo("text2frappe-s1", private=True)

In [None]:
from huggingface_hub import upload_folder

upload_folder(
    repo_id="hyrinmansoor/text2frappe-s1-roberta",
    folder_path="/content/drive/MyDrive/Changai/S1/Model",
    path_in_repo=".",  # root of the model repo
    repo_type="model"
)


In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

model_name = "hyrinmansoor/text2frappe-s1"  # can be swapped anytime
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Now you can call through API Inference also.
#API_URL = "https://api-inference.huggingface.co/models/your-username/text2frappe-s1"

In [20]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
# Change the model_path to the directory where the model was actually saved
model_path = "/content/drive/MyDrive/Changai/S1/Model"

# Add local_files_only=True to explicitly load from the local path
model = RobertaForSequenceClassification.from_pretrained(model_path, local_files_only=True)
tokenizer = RobertaTokenizerFast.from_pretrained(model_path, local_files_only=True)

In [22]:
import torch
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
test_data = [
    {"question": "Where can I view logged work hours for tasks?", "real_answer": "Job Card Time Log"},
    {"question": "How do I manage budget allocations for departments?", "real_answer": "Budget Account"},
    {"question": "List all vehicles currently in use for delivery.", "real_answer": "Vehicle"},
    {"question": "Where can I track department-level expenses?", "real_answer": "Department"},
    {"question": "Fetch all competitor details added this quarter.", "real_answer": "Competitor"},
    {"question": "Show me all teams managing asset maintenance.", "real_answer": "Asset Maintenance Team"},
    {"question": "Where can I check ledger health insights?", "real_answer": "Ledger Health Monitor"},
    {"question": "List all purchase order items pending receipt.", "real_answer": "Purchase Order Item"},
]


id2label = {str(k): v for k, v in id2label.items()}

results = []

for record in test_data:
    test_question = record["question"]
    real_answer = record["real_answer"]

    inputs = tokenizer(test_question, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax(dim=-1).item()

    predicted_doctype = id2label[str(predicted_class_id)]

    # Store
    results.append({
        "Question": test_question,
        "Real Answer": real_answer,
        "Model Prediction": predicted_doctype,
        "Correct?": "✅" if predicted_doctype == real_answer else "❌"
    })
df_results = pd.DataFrame(results)
print(df_results)

                                            Question             Real Answer  \
0      Where can I view logged work hours for tasks?       Job Card Time Log   
1  How do I manage budget allocations for departm...          Budget Account   
2   List all vehicles currently in use for delivery.                 Vehicle   
3       Where can I track department-level expenses?              Department   
4   Fetch all competitor details added this quarter.              Competitor   
5      Show me all teams managing asset maintenance.  Asset Maintenance Team   
6          Where can I check ledger health insights?   Ledger Health Monitor   
7     List all purchase order items pending receipt.     Purchase Order Item   

         Model Prediction Correct?  
0               Timesheet        ❌  
1    Monthly Distribution        ❌  
2                 Vehicle        ✅  
3    Monthly Distribution        ❌  
4       Competitor Detail        ❌  
5  Asset Maintenance Team        ✅  
6   Ledger Health Mo