In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
!pip install transformers datasets torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [18]:
import json
with open('/content/roberto_s1_v4.json') as f:
    raw_data = json.load(f)

In [19]:
from datasets import Dataset
raw_data = Dataset.from_list(raw_data)
split_dataset = raw_data.train_test_split(test_size=0.2, seed=42)

In [20]:
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']

print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))

Train size: 16309
Validation size: 4078


In [21]:
# Remove 'instruction' from both datasets
train_dataset = train_dataset.remove_columns(['instruction'])
val_dataset = val_dataset.remove_columns(['instruction'])

In [22]:
print(train_dataset.column_names)

['input', 'output']


In [23]:
doctypes = sorted(list(set([record['output'] for record in raw_data])))

label2id = {label: idx for idx, label in enumerate(doctypes)}
id2label = {idx: label for label, idx in label2id.items()}
def encode_labels(example):
    example['label'] = label2id[example['output']]
    return example

train_dataset = train_dataset.map(encode_labels)
val_dataset = val_dataset.map(encode_labels)
print(doctypes)

Map:   0%|          | 0/16309 [00:00<?, ? examples/s]

Map:   0%|          | 0/4078 [00:00<?, ? examples/s]

['About Us Settings', 'About Us Team Member', 'Access Log', 'Account', 'Account Closing Balance', 'Accounting Dimension', 'Accounting Dimension Detail', 'Accounting Dimension Filter', 'Accounting Period', 'Accounts Receivable', 'Accounts Settings', 'Activity Cost', 'Activity Log', 'Activity Type', 'Additional Salary', 'Address', 'Address Template', 'Advance Payment Ledger Entry', 'Advance Tax', 'Advance Taxes and Charges', 'Allowed Dimension', 'Allowed To Transact With', 'Amended Document Naming Settings', 'Applicable On Account', 'Appointment', 'Appointment Booking Settings', 'Appointment Booking Slots', 'Approval', 'Asset', 'Asset Activity', 'Asset Capitalization', 'Asset Capitalization Asset Item', 'Asset Capitalization Service Item', 'Asset Capitalization Stock Item', 'Asset Capitalization, Company', 'Asset Capitalization, Payment Entry Reference', 'Asset Capitalization, Process Payment Reconciliation Log', 'Asset Capitalization, Process Payment Reconciliation Log, Company', 'Asset

In [24]:
import csv

# Save to CSV format
with open("doctype_mapping.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Doctype", "ID"])  # Header
    for label, idx in label2id.items():
        writer.writerow([label, idx])

print("Saved as doctype_mapping.csv")

Saved as doctype_mapping.csv


In [25]:
# As for now we are not working on the multi doctypes questions, roberto can be trained on that case later to predict multi labels.
def is_single_doctype(example):
    output = example['output']
    # If output is a list (bad), or output contains ',' or ' and '
    if isinstance(output, list):
        return False
    if ',' in output or ' and ' in output.lower():
        return False
    return True

# Filter train and val datasets
train_dataset = train_dataset.filter(is_single_doctype)
val_dataset = val_dataset.filter(is_single_doctype)

# Check sizes after cleaning
print("Train size after cleaning:", len(train_dataset))
print("Validation size after cleaning:", len(val_dataset))

Filter:   0%|          | 0/16309 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4078 [00:00<?, ? examples/s]

Train size after cleaning: 15864
Validation size after cleaning: 3975


In [26]:
train_dataset[4:9]

{'input': ['Can you tell me how many parcels we shipped today?',
  'Tell me the batch code for this BOM update process.',
  'Can I make changes to the values of this block?',
  'Is this serial sn_ksjdkwxd still under warranty?',
  'Fetch quotations received for RFQ RFQ-202.'],
 'output': ['Shipment Parcel',
  'BOM Update Batch',
  'Web Page Block',
  'Serial No',
  'Supplier Quotation'],
 'label': [649, 72, 773, 634, 687]}

In [27]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('hyrinmansoor/text2frappe-s1-roberta')

def preprocess_function(examples):
    return tokenizer(examples['input'], truncation=True, padding="max_length")

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/15864 [00:00<?, ? examples/s]

Map:   0%|          | 0/3975 [00:00<?, ? examples/s]

In [13]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

model_name = "hyrinmansoor/text2frappe-s1-roberta"  # can be swapped anytime
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [30]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Changai/S1/Model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    report_to="none"
)

# Load the model with the correct number of labels and mappings
model_name = "hyrinmansoor/text2frappe-s1-roberta"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(doctypes), # Pass the number of labels
    id2label=id2label,       # Pass the id2label mapping
    label2id=label2id ,
    ignore_mismatched_sizes=True# Pass the label2id mapping
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

RuntimeError: Error(s) in loading state_dict for Linear:
	size mismatch for bias: copying a param with shape torch.Size([814]) from checkpoint, the shape in current model is torch.Size([815]).

In [None]:
model.save_pretrained("hyrinmansoor/text2frappe-s1-roberta")
tokenizer.save_pretrained("hyrinmansoor/text2frappe-s1-roberta")

In [None]:
from huggingface_hub import login

login()

In [None]:
from huggingface_hub import create_repo

create_repo("text2frappe-s1", private=True)

In [None]:
from huggingface_hub import upload_folder

upload_folder(
    repo_id="hyrinmansoor/text2frappe-s1-roberta",
    folder_path="/content/drive/MyDrive/Changai/S1/Model",
    path_in_repo=".",  # root of the model repo
    repo_type="model"
)


In [None]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
# Change the model_path to the directory where the model was actually saved
model_path = "/content/drive/MyDrive/Changai/S1/Model"

# Add local_files_only=True to explicitly load from the local path
model = RobertaForSequenceClassification.from_pretrained(model_path, local_files_only=True)
tokenizer = RobertaTokenizerFast.from_pretrained(model_path, local_files_only=True)

In [None]:
import torch
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
test_data = [
    {"question": "Where can I view logged work hours for tasks?", "real_answer": "Job Card Time Log"},
    {"question": "How do I manage budget allocations for departments?", "real_answer": "Budget Account"},
    {"question": "List all vehicles currently in use for delivery.", "real_answer": "Vehicle"},
    {"question": "Where can I track department-level expenses?", "real_answer": "Department"},
    {"question": "Fetch all competitor details added this quarter.", "real_answer": "Competitor"},
    {"question": "Show me all teams managing asset maintenance.", "real_answer": "Asset Maintenance Team"},
    {"question": "Where can I check ledger health insights?", "real_answer": "Ledger Health Monitor"},
    {"question": "List all purchase order items pending receipt.", "real_answer": "Purchase Order Item"},
]


id2label = {str(k): v for k, v in id2label.items()}

results = []

for record in test_data:
    test_question = record["question"]
    real_answer = record["real_answer"]

    inputs = tokenizer(test_question, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = logits.argmax(dim=-1).item()

    predicted_doctype = id2label[str(predicted_class_id)]

    # Store
    results.append({
        "Question": test_question,
        "Real Answer": real_answer,
        "Model Prediction": predicted_doctype,
        "Correct?": "✅" if predicted_doctype == real_answer else "❌"
    })
df_results = pd.DataFrame(results)
print(df_results)