In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets torch

In [28]:
import os
import json

# Replace with the actual path to your folder in Google Drive
json_folder = '/content/drive/MyDrive/Changai/S1/Datasets/ROBERTA - Dataset Version 1'

combined_data = []

# Loop through all JSON files in the folder
for filename in os.listdir(json_folder):
    if filename.endswith('.json'):
        file_path = os.path.join(json_folder, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                if isinstance(data, list):
                    combined_data.extend(data)
                else:
                    combined_data.append(data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file: {filename}. Skipping this file. Error: {e}")
            continue


# Save the combined data to a new JSON file
output_path = '/content/drive/MyDrive/Changai/S1/Datasets/ROBERTA - Dataset Version 1/combined_output_v1.json'
with open(output_path, 'w', encoding='utf-8') as out_file:
    json.dump(combined_data, out_file, indent=4)

print(f"Combined JSON saved to: {output_path}")

Combined JSON saved to: /content/drive/MyDrive/Changai/S1/Datasets/ROBERTA - Dataset Version 1/combined_output_v1.json


In [50]:
import json
with open('/content/combined_output_v1.json') as f:
    raw_data = json.load(f)

In [51]:
from datasets import Dataset
raw_data = Dataset.from_list(raw_data)
split_dataset = raw_data.train_test_split(test_size=0.2, seed=42)

In [52]:
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))

Train size: 43940
Validation size: 10986


In [40]:
# Remove 'instruction' from both datasets
train_dataset = train_dataset.remove_columns(['instruction'])
val_dataset = val_dataset.remove_columns(['instruction'])

In [41]:
print(train_dataset.column_names)

['input', 'output']


In [53]:
doctypes = sorted(list(set([','.join(record['output']) if isinstance(record['output'], list) else record['output'] for record in raw_data])))

label2id = {label: idx for idx, label in enumerate(doctypes)}
id2label = {idx: label for label, idx in label2id.items()}
def encode_labels(example):
    example['label'] = label2id[','.join(example['output']) if isinstance(example['output'], list) else example['output']]
    return example

train_dataset = train_dataset.map(encode_labels)
val_dataset = val_dataset.map(encode_labels)
print(doctypes)

Map:   0%|          | 0/43940 [00:00<?, ? examples/s]

Map:   0%|          | 0/10986 [00:00<?, ? examples/s]

['About Us Settings', 'Account', 'Account Closing Balance', 'Accounting Dimension', 'Accounting Dimension Detail', 'Accounting Period', 'Accounts Receivable', 'Accounts Settings', 'Activity Cost', 'Activity Log', 'Activity Type', 'Additional Salary', 'Address', 'Advance Payment Ledger Entry', 'Advance Tax', 'Advance Taxes and Charges', 'Allowed Dimension', 'Allowed To Transact With', 'Applicable On Account', 'Appointment', 'Appointment Booking Settings', 'Appointment Booking Slots', 'Approval', 'Asset', 'Asset Activity', 'Asset Capitalization', 'Asset Capitalization Asset Item', 'Asset Capitalization Service Item', 'Asset Capitalization, Company', 'Asset Capitalization, Payment Entry Reference', 'Asset Capitalization, Process Payment Reconciliation Log', 'Asset Capitalization, Process Payment Reconciliation Log, Company', 'Asset Category', 'Asset Category Account', 'Asset Depreciation Schedule', 'Asset Maintenance', 'Asset Maintenance Log', 'Asset Movement', 'Asset Movement Item', 'Ass

In [54]:
import csv

# Save to CSV format
with open("doctype_mapping.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Doctype", "ID"])  # Header
    for label, idx in label2id.items():
        writer.writerow([label, idx])

print("Saved as doctype_mapping.csv")

Saved as doctype_mapping.csv


In [55]:
train_dataset[4:9]

{'instruction': ['Predict the relevant ERPNext Doctype(s) for the question below.',
  'Predict the relevant ERPNext Doctype(s) for the question below.',
  'Predict the relevant ERPNext Doctype(s) for the question below.',
  'Which ERPNext Doctype best answers this question?',
  'Predict the relevant ERPNext Doctype(s) for the question below.'],
 'input': ['List all users who viewed or edited the invoice INV-0021.',
  'Record part payment towards an invoice — how to?',
  'Check if any database table crossed 500MB.',
  "What is the exchange rate used for payment 'pay-2023-110'?",
  'Can you tell me the title for this document type state?'],
 'output': [['Activity Log'],
  ['Payment Reconciliation Payment'],
  ['System Health Report Tables'],
  ['Process Payment Reconciliation Log Allocations'],
  ['DocType State']],
 'label': [9, 391, 605, 417, 178]}

In [59]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('hyrinmansoor/text2frappe-s1-roberta')

def preprocess_function(examples):
    return tokenizer(examples['input'], truncation=True, padding="max_length")

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/43940 [00:00<?, ? examples/s]

Map:   0%|          | 0/10986 [00:00<?, ? examples/s]

In [57]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [60]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

model_name = "hyrinmansoor/text2frappe-s1-roberta"  # can be swapped anytime
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [62]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Changai/S1/Model/Model1",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    report_to="none"
)

# Load the model with the correct number of labels and mappings
model_name = "hyrinmansoor/text2frappe-s1-roberta"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    id2label=id2label,       # Pass the id2label mapping
    label2id=label2id ,
    ignore_mismatched_sizes=True# Pass the label2id mapping
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at hyrinmansoor/text2frappe-s1-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([829]) in the checkpoint and torch.Size([683]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([829, 768]) in the checkpoint and torch.Size([683, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.7601,0.382624
2,0.1637,0.1136
3,0.0784,0.071306
4,0.0461,0.049083
5,0.0338,0.028227
6,0.0116,0.024479
7,0.0124,0.023586
8,0.0066,0.019083


Epoch,Training Loss,Validation Loss
1,0.7601,0.382624
2,0.1637,0.1136
3,0.0784,0.071306
4,0.0461,0.049083
5,0.0338,0.028227
6,0.0116,0.024479
7,0.0124,0.023586
8,0.0066,0.019083
9,0.006,0.018357
10,0.0034,0.01455


{'eval_loss': 0.009094557724893093,
 'eval_runtime': 71.4998,
 'eval_samples_per_second': 153.651,
 'eval_steps_per_second': 9.608,
 'epoch': 15.0}

In [63]:
model.save_pretrained("/content/drive/MyDrive/Changai/S1/Model/Model1")
tokenizer.save_pretrained("/content/drive/MyDrive/Changai/S1/Model/Model1")
# model.push_to_hub("hyrinmansoor/text2frappe-s1-roberta")
# tokenizer.push_to_hub("hyrinmansoor/text2frappe-s1-roberta")

('/content/drive/MyDrive/Changai/S1/Model/Model1/tokenizer_config.json',
 '/content/drive/MyDrive/Changai/S1/Model/Model1/special_tokens_map.json',
 '/content/drive/MyDrive/Changai/S1/Model/Model1/vocab.json',
 '/content/drive/MyDrive/Changai/S1/Model/Model1/merges.txt',
 '/content/drive/MyDrive/Changai/S1/Model/Model1/added_tokens.json',
 '/content/drive/MyDrive/Changai/S1/Model/Model1/tokenizer.json')

In [None]:
from huggingface_hub import login

login()

In [None]:
from huggingface_hub import create_repo

create_repo("text2frappe-s1", private=True)

In [None]:
from huggingface_hub import upload_folder

upload_folder(
    repo_id="hyrinmansoor/text2frappe-s1-roberta",
    folder_path="/content/drive/MyDrive/Changai/S1/Model",
    path_in_repo=".",  # root of the model repo
    repo_type="model"
)


Upload 10 LFS files:   0%|          | 0/10 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/997M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/997M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hyrinmansoor/text2frappe-s1-roberta/commit/6f1f84583ce41326cb6ce2636afaf94f78937daa', commit_message='Upload folder using huggingface_hub', commit_description='', oid='6f1f84583ce41326cb6ce2636afaf94f78937daa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hyrinmansoor/text2frappe-s1-roberta', endpoint='https://huggingface.co', repo_type='model', repo_id='hyrinmansoor/text2frappe-s1-roberta'), pr_revision=None, pr_num=None)

In [67]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
# model_path = "/content/drive/MyDrive/Changai/S1/Model"

# model = RobertaForSequenceClassification.from_pretrained(model_path, local_files_only=True)
# tokenizer = RobertaTokenizerFast.from_pretrained(model_path, local_files_only=True)

In [76]:
import torch
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/content/drive/MyDrive/Changai/S1/Model/Model1"


model = RobertaForSequenceClassification.from_pretrained(
    model_path,
    id2label=id2label,
    label2id=label2id,
    local_files_only=True,
    ignore_mismatched_sizes=True  # important!
).to(device)


tokenizer = RobertaTokenizerFast.from_pretrained(model_path, local_files_only=True)

test_data=[
  { "input": "new staff joined this yr?", "output": ["Employee"] },
  { "input": "inactive workers list rn", "output": ["Employee"] },
  { "input": "who on leave today?", "output": ["Employee"] },

  { "input": "clients added last 2 wks?", "output": ["Customer"] },
  { "input": "any cust no orders yet?", "output": ["Customer"] },
  { "input": "overdue bills clients?", "output": ["Customer"] },

  { "input": "open so pending deliver?", "output": ["Sales Order"] },
  { "input": "sales orders this wk?", "output": ["Sales Order"] },
  { "input": "big so amt rn?", "output": ["Sales Order"] },

  { "input": "unpaid invoices show", "output": ["Sales Invoice"] },
  { "input": "avg invoice val mnth?", "output": ["Sales Invoice"] },
  { "input": "last 5 invoices any cust", "output": ["Sales Invoice"] },

  { "input": "open po list?", "output": ["Purchase Order"] },
  { "input": "po made this quarter", "output": ["Purchase Order"] },
  { "input": "cancelled purchase orders?", "output": ["Purchase Order"] },

  { "input": "unpaid purchase inv now?", "output": ["Purchase Invoice"] },
  { "input": "avg pi cost mnth?", "output": ["Purchase Invoice"] },
  { "input": "top 5 purchase invoices", "output": ["Purchase Invoice"] },

  { "input": "suppliers dubai list", "output": ["Supplier"] },
  { "input": "inactive supplier rn", "output": ["Supplier"] },
  { "input": "supplier max inv", "output": ["Supplier"] },

  { "input": "projects still running", "output": ["Project"] },
  { "input": "proj started last mnth", "output": ["Project"] },
  { "input": "any delayed projects?", "output": ["Project"] },

  { "input": "low stock items?", "output": ["Item"] },
  { "input": "products no sale mnth", "output": ["Item"] },
  { "input": "top 10 selling goods", "output": ["Item"] },

  { "input": "open tasks today", "output": ["Task"] },
  { "input": "tasks done last wk", "output": ["Task"] },
  { "input": "pending tasks hr dept", "output": ["Task"] },

  { "input": "stock entries today?", "output": ["Stock Entry"] },
  { "input": "recent stock entry item x", "output": ["Stock Entry"] },
  { "input": "cancelled stock entries", "output": ["Stock Entry"] },
   { "input": "new employees last mnth?", "output": ["Employee"] },
  { "input": "staff still active?", "output": ["Employee"] },
  { "input": "emp on leave rn", "output": ["Employee"] },

  { "input": "show me new cust this week", "output": ["Customer"] },
  { "input": "any clients with 0 invoices", "output": ["Customer"] },
  { "input": "cust overdue pay list", "output": ["Customer"] },

  { "input": "sales orders yet to ship", "output": ["Sales Order"] },
  { "input": "so count today?", "output": ["Sales Order"] },
  { "input": "biggest sales order amt", "output": ["Sales Order"] },

  { "input": "inv not paid still", "output": ["Sales Invoice"] },
  { "input": "avg sales inv val quarter", "output": ["Sales Invoice"] },
  { "input": "show 10 latest invoices", "output": ["Sales Invoice"] },

  { "input": "open pos show", "output": ["Purchase Order"] },
  { "input": "pos created this mnth?", "output": ["Purchase Order"] },
  { "input": "cancelled pos list", "output": ["Purchase Order"] },

  { "input": "purchase inv unpaid rn", "output": ["Purchase Invoice"] },
  { "input": "pi avg amount this quarter", "output": ["Purchase Invoice"] },
  { "input": "purchase invoices top value", "output": ["Purchase Invoice"] },

  { "input": "suppliers inactive list", "output": ["Supplier"] },
  { "input": "supplier with most pos", "output": ["Supplier"] }
]



# Prediction loop
results = []
for record in test_data:
    inputs = tokenizer(
        record["input"],
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_class_id = logits.argmax(dim=-1).item()

    predicted_doctype = id2label[predicted_class_id]
    results.append({
        "Question": record["input"],
        "Real Answer": record["output"][0],
        "Model Prediction": predicted_doctype,
        "Correct?": "✅" if predicted_doctype == record["output"][0] else "❌"
    })

# Output
df_results = pd.DataFrame(results)
print(df_results)

                       Question       Real Answer  Model Prediction Correct?
0     new staff joined this yr?          Employee          Employee        ✅
1      inactive workers list rn          Employee          Employee        ✅
2           who on leave today?          Employee          Employee        ✅
3     clients added last 2 wks?          Customer          Customer        ✅
4       any cust no orders yet?          Customer          Customer        ✅
5        overdue bills clients?          Customer     Sales Invoice        ❌
6      open so pending deliver?       Sales Order     Delivery Note        ❌
7         sales orders this wk?       Sales Order       Sales Order        ✅
8                big so amt rn?       Sales Order              Item        ❌
9          unpaid invoices show     Sales Invoice     Sales Invoice        ✅
10        avg invoice val mnth?     Sales Invoice     Sales Invoice        ✅
11     last 5 invoices any cust     Sales Invoice     Sales Invoice        ✅