In [None]:
from google.colab import drive
# Mount Google Drive; a later cell saves the trained model under this path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip install transformers datasets torch

In [None]:
import json

# Load the labelled records; later cells read each record's 'input' and
# 'output' fields. The encoding is stated explicitly (as the other open()
# calls in this notebook do) so decoding never depends on the platform default.
with open('/content/combined_output_v1.json', encoding='utf-8') as f:
    raw_data = json.load(f)

In [None]:
# Peek at the first five records to check the loaded structure.
head_records = raw_data[:5]
print(head_records)

In [None]:
from datasets import Dataset
# Fraction of records held out for validation, and the seed that makes the
# split reproducible.
HOLDOUT_FRACTION = 0.2
SPLIT_SEED = 42

# Wrap the records in a Dataset (this rebinds `raw_data`) and split it.
raw_data = Dataset.from_list(raw_data)
split_dataset = raw_data.train_test_split(
    test_size=HOLDOUT_FRACTION,
    seed=SPLIT_SEED,
)

In [None]:
# Name the two halves of the split; the 'test' portion is used as validation data.
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")

In [None]:
# List the columns available on the training split.
train_columns = train_dataset.column_names
print(train_columns)

In [None]:
def _doctype_key(output):
    """Return the label string for an `output` value.

    A list is collapsed into a single comma-joined string; any other value
    is returned unchanged.
    """
    if isinstance(output, list):
        return ','.join(output)
    return output


# Every distinct label string found in the full dataset, sorted so that the
# id assignment below is deterministic.
doctypes = sorted({_doctype_key(record['output']) for record in raw_data})

# Two-way mapping between label strings and integer ids.
label2id = {}
id2label = {}
for idx, label in enumerate(doctypes):
    label2id[label] = idx
    id2label[idx] = label


def encode_labels(example):
    """Add an integer 'label' field derived from the example's 'output'.

    Args:
        example: a dataset row containing an 'output' field.

    Returns:
        The same row, with 'label' set to the id of its label string.
    """
    example['label'] = label2id[_doctype_key(example['output'])]
    return example


train_dataset = train_dataset.map(encode_labels)
val_dataset = val_dataset.map(encode_labels)
print(doctypes)

In [None]:
import csv

# Persist the label -> id mapping as a two-column CSV with a header row.
mapping_rows = [["Doctype", "ID"]]
mapping_rows.extend([label, idx] for label, idx in label2id.items())

with open("doctype_mapping.csv", "w", newline="", encoding="utf-8") as file:
    csv.writer(file).writerows(mapping_rows)

print("Saved as doctype_mapping.csv")

In [None]:
# Show training rows 4 through 8 as the cell output.
sample_rows = train_dataset[4:9]
sample_rows

In [None]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('hyrinmansoor/text2frappe-s1-roberta')

# Fixed sequence length for truncation and padding. It matches the
# max_length=128 used by the inference cells, so training and evaluation
# inputs have the same length instead of being padded to the tokenizer's own
# maximum during training only.
MAX_LENGTH = 128


def preprocess_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: a batch from the dataset; its 'input' column holds the texts.

    Returns:
        The tokenizer output for the batch, truncated and padded to
        MAX_LENGTH tokens per example.
    """
    return tokenizer(
        examples['input'],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )


tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

In [None]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub (called with no arguments).
login()

In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

model_name = "hyrinmansoor/text2frappe-s1-roberta"  # can be swapped anytime
# Rebinds `tokenizer`, which the preprocessing cell created with RobertaTokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the training cell loads `model` again with the label mappings,
# so this load looks redundant — confirm before removing.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

# Training configuration: evaluate and checkpoint once per epoch, with
# save_total_limit capping how many checkpoints are kept on disk.
training_args = TrainingArguments(
    output_dir="./results/ROBERTA-t1",
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
)

# Load the checkpoint together with both label mappings.
# ignore_mismatched_sizes=True is passed so that a classification head whose
# shape differs from the checkpoint's does not abort the load.
model_name = "hyrinmansoor/text2frappe-s1-roberta"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

# Wire the model, the arguments and the tokenized splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

trainer.train()

# Last expression of the cell, so the evaluation result is shown as its output.
trainer.evaluate()

In [None]:
# Write the fine-tuned model and its tokenizer into the same Drive folder.
save_dir = "/content/drive/MyDrive/Changai/S1/Model_trained_results/1"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
# model.push_to_hub("hyrinmansoor/text2frappe-s1-roberta")
# tokenizer.push_to_hub("hyrinmansoor/text2frappe-s1-roberta")

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub again; an earlier cell makes the same call.
login()

In [None]:
from huggingface_hub import create_repo

# exist_ok=True keeps this cell re-runnable: without it the call fails once
# the repository already exists.
# NOTE(review): this creates "text2frappe-s1", while the upload cell targets
# "hyrinmansoor/text2frappe-s1-roberta" — confirm which repository is intended.
create_repo("text2frappe-s1", private=True, exist_ok=True)

In [None]:
from huggingface_hub import upload_folder

# Push the saved model folder to the Hub model repository.
upload_folder(
    folder_path="/content/drive/MyDrive/Changai/S1/Model_trained_results/1",
    repo_id="hyrinmansoor/text2frappe-s1-roberta",
    repo_type="model",
    path_in_repo="",  # root of the model repo
)


In [None]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
# model_path = "/content/drive/MyDrive/Changai/S1/Model"

# model = RobertaForSequenceClassification.from_pretrained(model_path, local_files_only=True)
# tokenizer = RobertaTokenizerFast.from_pretrained(model_path, local_files_only=True)

In [None]:
# Show the id -> label mapping, then write it to id2label.json.
print(id2label)
with open("id2label.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(id2label, indent=4, ensure_ascii=False))

In [None]:
import torch
import pandas as pd


# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/content/drive/MyDrive/Changai/S1/Model_trained_results/1"

# Load the fine-tuned checkpoint for inference.
# ignore_mismatched_sizes is deliberately NOT passed here: if the number of
# labels in id2label did not match the saved classification head, that flag
# would replace the trained head with freshly initialised weights and every
# prediction below would be meaningless. Without it, a mismatch raises an
# error instead of going unnoticed.
model = RobertaForSequenceClassification.from_pretrained(
    model_path,
    id2label=id2label,
    label2id=label2id,
    local_files_only=True,
).to(device)

tokenizer = RobertaTokenizerFast.from_pretrained(model_path, local_files_only=True)

# Held-out questions to score; each record is read via its "input" and
# "output" fields by the evaluation loop.
with open("/content/openAI_SD_raw.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# test_data=[
#   { "input": "new staff joined this yr?", "output": ["Employee"] },
#   { "input": "inactive workers list rn", "output": ["Employee"] },
#   { "input": "who on leave today?", "output": ["Employee"] },

#   { "input": "clients added last 2 wks?", "output": ["Customer"] },
#   { "input": "any cust no orders yet?", "output": ["Customer"] },
#   { "input": "overdue bills clients?", "output": ["Customer"] },

#   { "input": "open so pending deliver?", "output": ["Sales Order"] },
#   { "input": "sales orders this wk?", "output": ["Sales Order"] },
#   { "input": "big so amt rn?", "output": ["Sales Order"] },

#   { "input": "unpaid invoices show", "output": ["Sales Invoice"] },
#   { "input": "avg invoice val mnth?", "output": ["Sales Invoice"] },
#   { "input": "last 5 invoices any cust", "output": ["Sales Invoice"] },

#   { "input": "open po list?", "output": ["Purchase Order"] },
#   { "input": "po made this quarter", "output": ["Purchase Order"] },
#   { "input": "cancelled purchase orders?", "output": ["Purchase Order"] },

#   { "input": "unpaid purchase inv now?", "output": ["Purchase Invoice"] },
#   { "input": "avg pi cost mnth?", "output": ["Purchase Invoice"] },
#   { "input": "top 5 purchase invoices", "output": ["Purchase Invoice"] },

#   { "input": "suppliers dubai list", "output": ["Supplier"] },
#   { "input": "inactive supplier rn", "output": ["Supplier"] },
#   { "input": "supplier max inv", "output": ["Supplier"] },

#   { "input": "projects still running", "output": ["Project"] },
#   { "input": "proj started last mnth", "output": ["Project"] },
#   { "input": "any delayed projects?", "output": ["Project"] },

#   { "input": "low stock items?", "output": ["Item"] },
#   { "input": "products no sale mnth", "output": ["Item"] },
#   { "input": "top 10 selling goods", "output": ["Item"] },

#   { "input": "open tasks today", "output": ["Task"] },
#   { "input": "tasks done last wk", "output": ["Task"] },
#   { "input": "pending tasks hr dept", "output": ["Task"] },

#   { "input": "stock entries today?", "output": ["Stock Entry"] },
#   { "input": "recent stock entry item x", "output": ["Stock Entry"] },
#   { "input": "cancelled stock entries", "output": ["Stock Entry"] },
#    { "input": "new employees last mnth?", "output": ["Employee"] },
#   { "input": "staff still active?", "output": ["Employee"] },
#   { "input": "emp on leave rn", "output": ["Employee"] },

#   { "input": "show me new cust this week", "output": ["Customer"] },
#   { "input": "any clients with 0 invoices", "output": ["Customer"] },
#   { "input": "cust overdue pay list", "output": ["Customer"] },

#   { "input": "sales orders yet to ship", "output": ["Sales Order"] },
#   { "input": "so count today?", "output": ["Sales Order"] },
#   { "input": "biggest sales order amt", "output": ["Sales Order"] },

#   { "input": "inv not paid still", "output": ["Sales Invoice"] },
#   { "input": "avg sales inv val quarter", "output": ["Sales Invoice"] },
#   { "input": "show 10 latest invoices", "output": ["Sales Invoice"] },

#   { "input": "open pos show", "output": ["Purchase Order"] },
#   { "input": "pos created this mnth?", "output": ["Purchase Order"] },
#   { "input": "cancelled pos list", "output": ["Purchase Order"] },

#   { "input": "purchase inv unpaid rn", "output": ["Purchase Invoice"] },
#   { "input": "pi avg amount this quarter", "output": ["Purchase Invoice"] },
#   { "input": "purchase invoices top value", "output": ["Purchase Invoice"] },

#   { "input": "suppliers inactive list", "output": ["Supplier"] },
#   { "input": "supplier with most pos", "output": ["Supplier"] },
#    { "input": "new staff onboard this mnth?", "output": ["Employee"] },
#   { "input": "who is on leave rn", "output": ["Employee"] },
#   { "input": "inactive emp list pls", "output": ["Employee"] },

#   { "input": "new clients joined this wk?", "output": ["Customer"] },
#   { "input": "cust not ordered anything yet", "output": ["Customer"] },
#   { "input": "overdue bills for customers rn", "output": ["Customer"] },

#   { "input": "open so still not shipped", "output": ["Sales Order"] },
#   { "input": "so count today?", "output": ["Sales Order"] },
#   { "input": "highest value sales order rn", "output": ["Sales Order"] },

#   { "input": "show unpaid invoices rn", "output": ["Sales Invoice"] },
#   { "input": "avg invoice amt this quarter", "output": ["Sales Invoice"] },
#   { "input": "10 latest invoices pls", "output": ["Sales Invoice"] },

#   { "input": "open po list rn", "output": ["Purchase Order"] },
#   { "input": "pos created this mnth", "output": ["Purchase Order"] },
#   { "input": "cancelled purchase orders show", "output": ["Purchase Order"] },

#   { "input": "unpaid purchase invs rn", "output": ["Purchase Invoice"] },
#   { "input": "avg purchase invoice val mnth", "output": ["Purchase Invoice"] },
#   { "input": "top 5 purchase invoices val", "output": ["Purchase Invoice"] },

#   { "input": "suppliers in dubai list", "output": ["Supplier"] },
#   { "input": "inactive suppliers rn", "output": ["Supplier"] },
#   { "input": "supplier with max invoices", "output": ["Supplier"] },

#   { "input": "projects still running now", "output": ["Project"] },
#   { "input": "proj started last mnth?", "output": ["Project"] },
#   { "input": "any delayed projects rn", "output": ["Project"] },

#   { "input": "low stock items show", "output": ["Item"] },
#   { "input": "items not sold this mnth", "output": ["Item"] },
#   { "input": "top 10 selling items", "output": ["Item"] },

#   { "input": "open tasks today rn", "output": ["Task"] },
#   { "input": "tasks completed last wk", "output": ["Task"] },
#   { "input": "pending tasks hr dept", "output": ["Task"] },

#   { "input": "stock entries today pls", "output": ["Stock Entry"] },
#   { "input": "recent stock entry item y", "output": ["Stock Entry"] },
#   { "input": "cancelled stock entries list", "output": ["Stock Entry"] }

# ]

# Predict a doctype for every test record and tabulate the outcome.
results = []
for record in test_data:
    # Build the expected label with the same rule used when the training
    # labels were encoded: a list is comma-joined, a plain string is used
    # as-is. Taking only output[0] would mis-score multi-doctype records and
    # would reduce a string output to its first character.
    expected = record["output"]
    gold = ','.join(expected) if isinstance(expected, list) else expected

    inputs = tokenizer(
        record["input"],
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax(dim=-1).item()

    predicted_doctype = id2label[predicted_class_id]
    results.append({
        "Question": record["input"],
        "Real Answer": gold,
        "Model Prediction": predicted_doctype,
        "Correct?": "✅" if predicted_doctype == gold else "❌"
    })

# Output
df_results = pd.DataFrame(results)
print(df_results)

In [None]:
import torch, pandas as pd
# Reload the test records so this cell can be run on its own after the model,
# tokenizer and device have been set up.
with open("/content/openAI_SD_raw.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

correct = 0
total = 0
results = []
for rec in test_data:
    # Build the expected label with the same rule used when the training
    # labels were encoded: a list is comma-joined, a plain string is used
    # as-is. Taking only output[0] would mis-score multi-doctype records and
    # would reduce a string output to its first character.
    expected = rec["output"]
    gold = ','.join(expected) if isinstance(expected, list) else expected

    enc = tokenizer(
        rec["input"],
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128
    ).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**enc).logits
    predicted_class_id = logits.argmax(dim=-1).item()

    predicted_doctype = id2label[predicted_class_id]
    is_correct = predicted_doctype == gold

    correct += int(is_correct)
    total += 1
    results.append({
        "Question": rec["input"],
        "Real Answer": gold,
        "Model Prediction": predicted_doctype,
        "Correct?": "✅" if is_correct else "❌"
    })

# Guard against an empty test file so the division cannot fail.
acc = correct / total if total else 0.0
print("Accuracy:", round(acc, 4))
df_results = pd.DataFrame(results)
df_results.to_csv("/content/results.csv", index=False)
