<a href="https://colab.research.google.com/github/ERPGulf/changai/blob/hyrin/changai/notebooks/s1_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and model paths used
# further down in this notebook all live under that mount point.
drive.mount('/content/drive')

In [None]:
# Install the libraries this notebook imports (transformers, datasets, torch).
!pip install transformers datasets torch

In [None]:
import json
# Load the raw S1 records from Drive. The encoding is passed explicitly (as the
# CSV export in this notebook already does) so non-ASCII text decodes the same
# way regardless of the platform's default encoding.
with open('/content/drive/MyDrive/Changai/S1/Datasets/roberto_s1.json', encoding='utf-8') as f:
    raw_data = json.load(f)

In [None]:
from datasets import Dataset
# Wrap the loaded records in a Hugging Face Dataset. This rebinds `raw_data`,
# so from here on the name refers to the Dataset, not the json.load result.
raw_data = Dataset.from_list(raw_data)
# Hold out 20% of the records; the fixed seed keeps the split reproducible.
split_dataset = raw_data.train_test_split(test_size=0.2, seed=42)

In [None]:
# The held-out 'test' part of the split serves as the validation set.
train_dataset, val_dataset = (split_dataset[part] for part in ('train', 'test'))

# Report how many examples ended up in each split.
print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))

In [None]:
# Persist both splits under /content/sample_data.
# NOTE(review): Dataset.to_json is believed to emit JSON Lines by default —
# confirm before reading these files back with json.load.

# Save train
train_dataset.to_json('/content/sample_data/train_data.json')

# Save validation
val_dataset.to_json('/content/sample_data/val_data.json')


In [None]:
# Drop the 'instruction' column from the train and validation splits.
train_dataset, val_dataset = (
    split.remove_columns(['instruction'])
    for split in (train_dataset, val_dataset)
)

In [None]:
# Show which columns the training split currently has.
print(train_dataset.column_names)

In [None]:
# Build the label vocabulary: every distinct 'output' value, sorted so that the
# label -> id assignment is deterministic.
# NOTE(review): the vocabulary is taken from all of raw_data, including any
# multi-doctype outputs that a later cell filters out of the train/val splits.
doctypes = sorted({record['output'] for record in raw_data})

label2id = dict(zip(doctypes, range(len(doctypes))))
id2label = dict(enumerate(doctypes))


def encode_labels(example):
    """Add an integer 'label' field looked up from the example's 'output'."""
    example['label'] = label2id[example['output']]
    return example


train_dataset, val_dataset = (
    split.map(encode_labels) for split in (train_dataset, val_dataset)
)
print(doctypes)

In [None]:
import csv

# Export the label -> id table as CSV: a header row, then one row per doctype.
with open("doctype_mapping.csv", "w", newline="", encoding="utf-8") as mapping_file:
    mapping_writer = csv.writer(mapping_file)
    mapping_writer.writerow(["Doctype", "ID"])
    # Each (label, id) pair from the mapping becomes one CSV row.
    mapping_writer.writerows(label2id.items())

print("Saved as doctype_mapping.csv")

In [None]:
# Multi-doctype questions are out of scope for now; the classifier can be
# trained for multi-label prediction later.
def is_single_doctype(example):
    """Return True when the example's 'output' names exactly one doctype.

    An output is rejected when it is not a string (e.g. a list or None), or
    when it contains ',' or the word ' and ' (case-insensitive), which is how
    multi-doctype answers are recognised here.
    """
    output = example['output']
    # Non-string outputs (lists, None, numbers) cannot be a single doctype
    # name; rejecting them up front also avoids a TypeError/AttributeError
    # from the substring checks below.
    if not isinstance(output, str):
        return False
    return ',' not in output and ' and ' not in output.lower()

# Keep only the single-doctype examples in each split.
train_dataset, val_dataset = (
    split.filter(is_single_doctype) for split in (train_dataset, val_dataset)
)

# Report how many examples survive the filter.
print("Train size after cleaning:", len(train_dataset))
print("Validation size after cleaning:", len(val_dataset))

In [None]:
# Peek at rows 4 through 8 of the training split as a quick sanity check.
train_dataset[4:9]

In [None]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def preprocess_function(examples):
    """Tokenize a batch of examples' 'input' texts for the classifier.

    Sequences are truncated/padded to 128 tokens — the same length the
    inference cell of this notebook uses — instead of the tokenizer's default
    maximum, so training and inference see identically shaped inputs and
    short questions are not padded far beyond their length.
    """
    return tokenizer(
        examples['input'],
        truncation=True,
        padding="max_length",
        max_length=128,
    )


# batched=True hands preprocess_function a batch of examples per call.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import RobertaForSequenceClassification

# Start from the pretrained 'roberta-base' weights with a classification head
# sized to the label vocabulary (one class per entry in label2id). The
# id <-> label maps are handed to from_pretrained alongside num_labels.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


In [None]:
from transformers import RobertaForSequenceClassification

# NOTE(review): this cell repeats the preceding model-creation cell verbatim.
# Running it builds the classifier from 'roberta-base' again and rebinds
# `model`; it looks redundant — confirm whether it can be dropped.
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


In [None]:
# Upgrade transformers to the latest release.
# NOTE(review): earlier cells have already imported transformers, so the
# upgraded version likely only takes effect after a runtime restart — confirm.
!pip install -U transformers

In [None]:
from transformers import Trainer, TrainingArguments

def compute_metrics(eval_pred):
    """Return classification accuracy for one evaluation pass.

    `eval_pred` is the prediction object the Trainer passes in: its
    `predictions` hold the logits and `label_ids` the reference labels.
    The predicted class is the argmax over the last logits axis.
    """
    predictions = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predictions == eval_pred.label_ids).mean())}


# Evaluate and checkpoint once per epoch; keep at most two checkpoints on
# Drive and disable external experiment reporting.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Changai/S1/Model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    report_to="none"
)

# Create Trainer instance. compute_metrics makes every evaluation report
# accuracy in addition to the loss, which is the number that matters for a
# doctype classifier.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model (kept as the last expression so the notebook displays it)
trainer.evaluate()

In [None]:
# Write the fine-tuned model and its tokenizer to Drive. This is the same
# directory that TrainingArguments uses as output_dir.
model.save_pretrained("/content/drive/MyDrive/Changai/S1/Model")
tokenizer.save_pretrained("/content/drive/MyDrive/Changai/S1/Model")

In [None]:
from huggingface_hub import login

# Log in to the Hugging Face Hub; the repo creation and upload cells that
# follow presumably rely on this session's credentials.
login()

In [None]:
from huggingface_hub import create_repo

# Create the private model repo. exist_ok=True keeps the cell re-runnable:
# without it, create_repo raises once the repo already exists.
# NOTE(review): no namespace is given here, while the upload cell targets
# "hyrinmansoor/text2frappe-s1" — confirm the logged-in account matches.
create_repo("text2frappe-s1", private=True, exist_ok=True)

In [None]:
from huggingface_hub import upload_folder

# Upload the saved model folder to the Hub. The folder is also the Trainer's
# output_dir, so it contains per-epoch checkpoint-* directories; those are
# excluded so only the final model and tokenizer files are pushed.
upload_folder(
    repo_id="hyrinmansoor/text2frappe-s1",
    folder_path="/content/drive/MyDrive/Changai/S1/Model",
    path_in_repo=".",  # root of the model repo
    repo_type="model",
    ignore_patterns=["checkpoint-*/*"],
)


In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification

# Reload the tokenizer and model by repo id; this rebinds `tokenizer` and
# `model` to the freshly loaded objects.
model_name = "hyrinmansoor/text2frappe-s1"  # can be swapped anytime
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# The model can now also be called through the Inference API.
#API_URL = "https://api-inference.huggingface.co/models/your-username/text2frappe-s1"

In [None]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
# Alternative to loading by repo id: load from the Drive directory the model
# was saved to. Change model_path if the model was saved somewhere else.
model_path = "/content/drive/MyDrive/Changai/S1/Model"

# local_files_only=True makes the load use the local path explicitly.
# Both `model` and `tokenizer` are rebound here.
model = RobertaForSequenceClassification.from_pretrained(model_path, local_files_only=True)
tokenizer = RobertaTokenizerFast.from_pretrained(model_path, local_files_only=True)

In [None]:
import torch
import pandas as pd

# Run the loaded classifier over a handful of hand-written questions and
# tabulate prediction vs. expected doctype.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: make sure dropout is disabled

test_data = [
    {"question": "Where can I view all active sales invoices?", "real_answer": "Sales Invoice"},
    {"question": "How do I create a new quotation for a customer?", "real_answer": "Quotation"},
    {"question": "List all delivery notes that are pending submission.", "real_answer": "Delivery Note"},
    {"question": "Fetch all journal entries created last month.", "real_answer": "Journal Entry"},
    {"question": "Show me approved purchase receipts from supplier ABC.", "real_answer": "Purchase Receipt"},
    {"question": "Where can I check employee attendance records?", "real_answer": "Attendance"},
    {"question": "Retrieve all leave applications pending approval.", "real_answer": "Leave Application"},
    {"question": "Find stock entries made for production today.", "real_answer": "Stock Entry"},
    {"question": "Display open support tickets for customers.", "real_answer": "Issue"},
    {"question": "Where can I configure default buying policies?", "real_answer": "Buying Settings"},
    {"question": "List all active customer subscriptions.", "real_answer": "Subscription"},
    {"question": "Show all approved supplier quotations.", "real_answer": "Supplier Quotation"},
    {"question": "Fetch submitted payment entries from this week.", "real_answer": "Payment Entry"},
    {"question": "How to track production planning orders?", "real_answer": "Production Plan"},
    {"question": "View all assets that are under maintenance.", "real_answer": "Asset Maintenance"},
]

# Take the id -> label mapping from the loaded model's config instead of the
# training-time `id2label` variable, so this cell also works in a fresh
# session where only a model-loading cell has been run. Keys are stringified
# to match the str(...) lookup in the loop below.
id2label = {str(k): v for k, v in model.config.id2label.items()}

results = []

for record in test_data:
    test_question = record["question"]
    real_answer = record["real_answer"]

    # Tokenize one question and move its tensors to the model's device.
    inputs = tokenizer(test_question, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # No gradients are needed for prediction.
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax(dim=-1).item()

    predicted_doctype = id2label[str(predicted_class_id)]

    # Store one row per question for the summary table.
    results.append({
        "Question": test_question,
        "Real Answer": real_answer,
        "Model Prediction": predicted_doctype,
        "Correct?": "✅" if predicted_doctype == real_answer else "❌"
    })
df_results = pd.DataFrame(results)
print(df_results)