In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers datasets torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [13]:
import json
with open('/content/roberta data v1.json') as f:
    raw_data = json.load(f)

In [14]:
from datasets import Dataset
raw_data = Dataset.from_list(raw_data)
split_dataset = raw_data.train_test_split(test_size=0.2, seed=42)

In [15]:
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))

Train size: 840
Validation size: 211


In [16]:
# Remove 'instruction' from both datasets
train_dataset = train_dataset.remove_columns(['instruction'])
val_dataset = val_dataset.remove_columns(['instruction'])

In [17]:
print(train_dataset.column_names)

['input', 'output']


In [18]:
doctypes = sorted(list(set([record['output'] for record in raw_data])))

label2id = {label: idx for idx, label in enumerate(doctypes)}
id2label = {idx: label for label, idx in label2id.items()}
def encode_labels(example):
    example['label'] = label2id[example['output']]
    return example

train_dataset = train_dataset.map(encode_labels)
val_dataset = val_dataset.map(encode_labels)
print(doctypes)

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Map:   0%|          | 0/211 [00:00<?, ? examples/s]

['Account', 'Budget', 'Customer', 'Employee', 'Employee Contract', 'Employee Training', 'Item', 'Journal Entry', 'Payment Entry', 'Project', 'Purchase Invoice', 'Purchase Order', 'Purchase Return', 'Sales Invoice', 'Sales Order', 'Sales Partner', 'Sales Return', 'Stock Ledger Entry', 'Supplier', 'Task', 'Warehouse']


In [19]:
import csv

# Save to CSV format
with open("doctype_mapping.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Doctype", "ID"])  # Header
    for label, idx in label2id.items():
        writer.writerow([label, idx])

print("Saved as doctype_mapping.csv")

Saved as doctype_mapping.csv


In [20]:
# As for now we are not working on the multi doctypes questions, roberto can be trained on that case later to predict multi labels.
def is_single_doctype(example):
    output = example['output']
    # If output is a list (bad), or output contains ',' or ' and '
    if isinstance(output, list):
        return False
    if ',' in output or ' and ' in output.lower():
        return False
    return True

# Filter train and val datasets
train_dataset = train_dataset.filter(is_single_doctype)
val_dataset = val_dataset.filter(is_single_doctype)

# Check sizes after cleaning
print("Train size after cleaning:", len(train_dataset))
print("Validation size after cleaning:", len(val_dataset))

Filter:   0%|          | 0/840 [00:00<?, ? examples/s]

Filter:   0%|          | 0/211 [00:00<?, ? examples/s]

Train size after cleaning: 840
Validation size after cleaning: 211


In [None]:
train_dataset[4:9]

{'input': ['Which customer has the highest total invoice amount this year?',
  'List projects that have no tasks assigned.',
  "Check if 'GlobeX Trade Corp' has an introduction field entered.",
  'What is the stock aging report (items unsold for X days)?',
  'Which accounts have the highest transaction volumes?'],
 'output': ['Sales Invoice', 'Project', 'Sales Partner', 'Item', 'Account'],
 'label': [13, 9, 15, 6, 0]}

In [21]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('hyrinmansoor/text2frappe-s1-roberta')

def preprocess_function(examples):
    return tokenizer(examples['input'], truncation=True, padding="max_length")

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Map:   0%|          | 0/211 [00:00<?, ? examples/s]

In [34]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

model_name = "hyrinmansoor/text2frappe-s1-roberta"  # can be swapped anytime
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Changai/S1/Model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    report_to="none"
)

# Load the model with the correct number of labels and mappings
model_name = "hyrinmansoor/text2frappe-s1-roberta"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(doctypes), # Pass the number of labels
    id2label=id2label,       # Pass the id2label mapping
    label2id=label2id ,
    ignore_mismatched_sizes=True# Pass the label2id mapping
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at hyrinmansoor/text2frappe-s1-roberta and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([814]) in the checkpoint and torch.Size([21]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([814, 768]) in the checkpoint and torch.Size([21, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.587026
2,No log,0.407199
3,No log,0.329302
4,No log,0.302669
5,No log,0.283495


{'eval_loss': 0.2834948003292084,
 'eval_runtime': 1.4272,
 'eval_samples_per_second': 147.845,
 'eval_steps_per_second': 9.81,
 'epoch': 5.0}

In [35]:
model.save_pretrained("hyrinmansoor/text2frappe-s1-roberta")
tokenizer.save_pretrained("hyrinmansoor/text2frappe-s1-roberta")
model.push_to_hub("hyrinmansoor/text2frappe-s1-roberta")
tokenizer.push_to_hub("hyrinmansoor/text2frappe-s1-roberta")

README.md:   0%|          | 0.00/666 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/hyrinmansoor/text2frappe-s1-roberta/commit/154dd943a0f195ca38432a2c41b4fb8a518ee187', commit_message='Upload tokenizer', commit_description='', oid='154dd943a0f195ca38432a2c41b4fb8a518ee187', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hyrinmansoor/text2frappe-s1-roberta', endpoint='https://huggingface.co', repo_type='model', repo_id='hyrinmansoor/text2frappe-s1-roberta'), pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import login

login()

In [None]:
from huggingface_hub import create_repo

create_repo("text2frappe-s1", private=True)

In [None]:
from huggingface_hub import upload_folder

upload_folder(
    repo_id="hyrinmansoor/text2frappe-s1-roberta",
    folder_path="/content/drive/MyDrive/Changai/S1/Model",
    path_in_repo=".",  # root of the model repo
    repo_type="model"
)


Upload 10 LFS files:   0%|          | 0/10 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/997M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/997M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hyrinmansoor/text2frappe-s1-roberta/commit/6f1f84583ce41326cb6ce2636afaf94f78937daa', commit_message='Upload folder using huggingface_hub', commit_description='', oid='6f1f84583ce41326cb6ce2636afaf94f78937daa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hyrinmansoor/text2frappe-s1-roberta', endpoint='https://huggingface.co', repo_type='model', repo_id='hyrinmansoor/text2frappe-s1-roberta'), pr_revision=None, pr_num=None)

In [3]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
model_path = "/content/drive/MyDrive/Changai/S1/Model"

model = RobertaForSequenceClassification.from_pretrained(model_path, local_files_only=True)
tokenizer = RobertaTokenizerFast.from_pretrained(model_path, local_files_only=True)

In [32]:
import torch
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "hyrinmansoor/text2frappe-s1-roberta"


model = RobertaForSequenceClassification.from_pretrained(
    model_path,
    num_labels=21,
    id2label=id2label,
    label2id=label2id,
    local_files_only=True,
    ignore_mismatched_sizes=True  # important!
).to(device)


tokenizer = RobertaTokenizerFast.from_pretrained(model_path, local_files_only=True)

test_data = [
  {"question": "How many sales invoices were created last month?", "real_answer": "Sales Invoice"},
  {"question": "What is the total outstanding amount for all sales invoices?", "real_answer": "Sales Invoice"},
  {"question": "List all sales invoices with their name and status.", "real_answer": "Sales Invoice"},
  {"question": "What is the status of sales invoice SI-0001?", "real_answer": "Sales Invoice"},
  {"question": "How many sales returns were recorded this quarter?", "real_answer": "Sales Invoice"},
  {"question": "What is the total revenue from sales invoices this year?", "real_answer": "Sales Invoice"},
  {"question": "Which customer has the highest total invoice amount this year?", "real_answer": "Sales Invoice"},
  {"question": "How many invoices were submitted last week?", "real_answer": "Sales Invoice"},
  {"question": "What is the average invoice amount this month?", "real_answer": "Sales Invoice"},
  {"question": "List all invoices created today.", "real_answer": "Sales Invoice"},

  {"question": "How many purchase invoices were created this month?", "real_answer": "Purchase Invoice"},
  {"question": "What is the total outstanding amount for all purchase invoices?", "real_answer": "Purchase Invoice"},
  {"question": "List all purchase invoices with their supplier name and status.", "real_answer": "Purchase Invoice"},
  {"question": "What is the status of purchase invoice PI-0005?", "real_answer": "Purchase Invoice"},
  {"question": "How many purchase returns were recorded last quarter?", "real_answer": "Purchase Invoice"},
  {"question": "What is the total expenditure from purchase invoices this year?", "real_answer": "Purchase Invoice"},
  {"question": "Which supplier has the highest total purchase invoice amount?", "real_answer": "Purchase Invoice"},
  {"question": "How many purchase invoices were submitted last week?", "real_answer": "Purchase Invoice"},
  {"question": "What is the average purchase invoice amount this month?", "real_answer": "Purchase Invoice"},
  {"question": "List all purchase invoices created today.", "real_answer": "Purchase Invoice"},

  {"question": "How many employees are currently active?", "real_answer": "Employee"},
  {"question": "List all employees with their department and designation.", "real_answer": "Employee"},
  {"question": "What is the date of joining for employee EMP-0001?", "real_answer": "Employee"},
  {"question": "Which employee has the highest salary?", "real_answer": "Employee"},
  {"question": "How many employees joined in the last quarter?", "real_answer": "Employee"},
  {"question": "List all employees whose contract ends this month.", "real_answer": "Employee"},
  {"question": "What is the average salary in the company?", "real_answer": "Employee"},
  {"question": "Which department has the highest average salary?", "real_answer": "Employee"},
  {"question": "How many employees are on probation?", "real_answer": "Employee"},
  {"question": "List employees who have pending performance reviews.", "real_answer": "Employee"},

  {"question": "How many sales orders were created this month?", "real_answer": "Sales Order"},
  {"question": "What is the total amount for all sales orders this year?", "real_answer": "Sales Order"},
  {"question": "List all sales orders with their customer and status.", "real_answer": "Sales Order"},
  {"question": "What is the status of sales order SO-0003?", "real_answer": "Sales Order"},
  {"question": "How many sales orders were delivered last month?", "real_answer": "Sales Order"},
  {"question": "Which customer has the highest total sales order value?", "real_answer": "Sales Order"},
  {"question": "How many sales orders were cancelled last quarter?", "real_answer": "Sales Order"},
  {"question": "What is the average sales order amount this year?", "real_answer": "Sales Order"},
  {"question": "List all sales orders created today.", "real_answer": "Sales Order"},
  {"question": "How many sales orders are pending delivery?", "real_answer": "Sales Order"},

  {"question": "How many purchase orders were created last month?", "real_answer": "Purchase Order"},
  {"question": "What is the total value of all purchase orders this year?", "real_answer": "Purchase Order"},
  {"question": "List all purchase orders with their supplier and status.", "real_answer": "Purchase Order"},
  {"question": "What is the status of purchase order PO-0007?", "real_answer": "Purchase Order"},
  {"question": "How many purchase orders were received in the last quarter?", "real_answer": "Purchase Order"},
  {"question": "Which supplier has the highest total purchase order value?", "real_answer": "Purchase Order"},
  {"question": "How many purchase orders were cancelled last month?", "real_answer": "Purchase Order"},
  {"question": "What is the average purchase order value this year?", "real_answer": "Purchase Order"},
  {"question": "List all purchase orders created today.", "real_answer": "Purchase Order"},
  {"question": "How many purchase orders are pending receipt?", "real_answer": "Purchase Order"},

  {"question": "How many items are currently in stock?", "real_answer": "Item"},
  {"question": "List all items with their item code and stock quantity.", "real_answer": "Item"},
  {"question": "What is the stock quantity of item ITEM-0001?", "real_answer": "Item"},
  {"question": "Which item has the highest stock quantity?", "real_answer": "Item"},
  {"question": "How many items are below the reorder level?", "real_answer": "Item"},
  {"question": "List all items that were added this month.", "real_answer": "Item"},
  {"question": "What is the average selling price of items?", "real_answer": "Item"},
  {"question": "Which item has the highest selling price?", "real_answer": "Item"},
  {"question": "How many items have no stock?", "real_answer": "Item"},
  {"question": "List all discontinued items.", "real_answer": "Item"}
]

# Prediction loop
results = []
for record in test_data:
    inputs = tokenizer(
        record["question"],
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_class_id = logits.argmax(dim=-1).item()

    predicted_doctype = id2label[predicted_class_id]
    results.append({
        "Question": record["question"],
        "Real Answer": record["real_answer"],
        "Model Prediction": predicted_doctype,
        "Correct?": "✅" if predicted_doctype == record["real_answer"] else "❌"
    })

# Output
df_results = pd.DataFrame(results)
print(df_results)

                                             Question       Real Answer  \
0    How many sales invoices were created last month?     Sales Invoice   
1   What is the total outstanding amount for all s...     Sales Invoice   
2   List all sales invoices with their name and st...     Sales Invoice   
3        What is the status of sales invoice SI-0001?     Sales Invoice   
4   How many sales returns were recorded this quar...     Sales Invoice   
5   What is the total revenue from sales invoices ...     Sales Invoice   
6   Which customer has the highest total invoice a...     Sales Invoice   
7         How many invoices were submitted last week?     Sales Invoice   
8      What is the average invoice amount this month?     Sales Invoice   
9                    List all invoices created today.     Sales Invoice   
10  How many purchase invoices were created this m...  Purchase Invoice   
11  What is the total outstanding amount for all p...  Purchase Invoice   
12  List all purchase inv