<a href="https://colab.research.google.com/github/ERPGulf/changAI/blob/alpha/changai/notebooks/s3_flan_query.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write model
# directories under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers datasets accelerate
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import json
import torch

In [None]:
# Load the raw examples that the training cell turns into prompts.
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the runtime's default locale encoding.
with open('/content/flans3_final_v2.json', encoding="utf-8") as f:
    raw_data = json.load(f)

In [None]:
!pip install transformers datasets torch

In [None]:
import json
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq

In [None]:
# Checkpoint identifier handed to from_pretrained; only the tokenizer is loaded here.
# NOTE(review): the training cell below repeats both assignments.
model_name = "hyrinmansoor/text2frappe-s3-flan-query"
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
import json

# # ✅ Load your dataset
# with open("/content/drive/MyDrive/Changai/S3/Datasets/S3_flan_old.json") as f:
#     raw_data = json.load(f)

# Task instruction placed at the top of every training prompt.
INSTRUCTION = "Generate the correct Frappe query for the given question, using the provided doctype and fields."

# Fixed seed so the train/test split (and therefore the eval metrics) is
# comparable between runs.
SPLIT_SEED = 42

# ✅ Prepare prompt-based data
# Every usable entry becomes {"input": <prompt text>, "output": <target text>}.
processed_data = []
skipped_count = 0
for entry in raw_data:
    try:
        input_data = entry["input"]
        if not isinstance(input_data, dict):
            # Entries whose "input" is not a mapping cannot be formatted; count
            # them so the drop is visible instead of silent.
            skipped_count += 1
            continue

        prompt = (
            f"Instruction: {INSTRUCTION}\n"
            f"Doctype: {input_data['doctype']}\n"
            f"Question: {input_data['question']}\n"
            f"Fields: {input_data['fields']}"
        )

        processed_data.append({
            "input": prompt,
            "output": entry["output"]
        })
    except (KeyError, TypeError) as e:
        # Missing key, or an entry that is not subscriptable by string keys.
        skipped_count += 1
        print("Skipping due to error:", e)
        continue

print(f"Prepared {len(processed_data)} examples, skipped {skipped_count}.")
if not processed_data:
    raise ValueError("No usable training examples were found in raw_data.")

# ✅ Convert to Hugging Face Dataset and hold out 10% for evaluation
dataset = Dataset.from_list(processed_data)
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)


# Checkpoint to fine-tune from; tokenizer and model are loaded from the same identifier.
model_name = "hyrinmansoor/text2frappe-s3-flan-query"
tokenizer = T5Tokenizer.from_pretrained(model_name)


# ✅ Load the model and resize its token embeddings to the tokenizer's length
# NOTE(review): no tokens are added to the tokenizer in this cell, so the resize
# presumably only matters when the saved tokenizer already carries extra tokens — confirm.
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# ✅ Tokenize prompts and targets; padded label positions are excluded from the loss
def tokenize_and_enhance_loss(example):
    """Tokenize a prompt/target pair (or a batch of them) for seq2seq training.

    Args:
        example: Mapping with an "input" prompt and an "output" target. Each
            may be a single string (``batched=False``) or a list of strings
            (``batched=True``).

    Returns:
        The tokenizer encoding of the prompt, padded/truncated to 512 tokens,
        with an added "labels" entry holding the target ids padded/truncated
        to 128 tokens. Every padding position in the labels is set to -100.
    """
    model_inputs = tokenizer(example["input"], max_length=512, truncation=True, padding="max_length")
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=example["output"], max_length=128, truncation=True, padding="max_length")

    # The labels are padded to a fixed length, so pad ids must be replaced by
    # -100 (the index the loss ignores); otherwise the model is also trained
    # to predict padding.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        return [tok if tok != pad_id else -100 for tok in ids]

    target_ids = labels["input_ids"]
    if target_ids and isinstance(target_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(row) for row in target_ids]
    else:
        model_inputs["labels"] = _mask_padding(target_ids)
    return model_inputs

# ✅ Tokenize dataset (one example per call)
tokenized_datasets = dataset.map(tokenize_and_enhance_loss, batched=False)

# Single place that defines where checkpoints and the final model/tokenizer go.
OUTPUT_DIR = "/content/drive/MyDrive/Changai/S3/Model/text2frappe-s3-flan_18_08_25_3"

# ✅ Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=16,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100
)

# ✅ Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ✅ Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ✅ Train, evaluate and save
trainer.train()
# A bare `trainer.evaluate()` in the middle of a cell throws its return value
# away; keep the metrics and print them so the final scores are recorded.
eval_metrics = trainer.evaluate()
print("Evaluation metrics:", eval_metrics)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load tokenizer and model from the directory the training cell saved them to
model_path = "/content/drive/MyDrive/Changai/S3/Model/text2frappe-s3-flan_18_08_25_3"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
# Sample prompts for a manual check of the model. Each dict supplies the
# "doctype", "question" and "fields" values that the loop below formats into a
# prompt. The list has three batches: filter questions, reworded filter
# questions (a few are repeated verbatim), and plain list/count questions.
prompts = [
    # --- Batch 1: filter questions ---
    # Sales Invoice
    {
        "doctype": "Sales Invoice",
        "question": "List all sales invoices for customer 'ABC Ltd'.",
        "fields": ["customer"]
    },
    {
        "doctype": "Sales Invoice",
        "question": "Find invoices with total greater than 1000.",
        "fields": ["total"]
    },
    # Purchase Invoice
    {
        "doctype": "Purchase Invoice",
        "question": "List invoices for supplier 'XYZ Pvt Ltd'.",
        "fields": ["supplier"]
    },
    {
        "doctype": "Purchase Invoice",
        "question": "Show invoices posted in July 2025.",
        "fields": ["posting_date"]
    },
    # Customer
    {
        "doctype": "Customer",
        "question": "List all customers in 'Retail' group.",
        "fields": ["customer_group"]
    },
    {
        "doctype": "Customer",
        "question": "Find customers in territory 'North'.",
        "fields": ["territory"]
    },
    # Employee
    {
        "doctype": "Employee",
        "question": "List employees in department 'Sales'.",
        "fields": ["department"]
    },
    {
        "doctype": "Employee",
        "question": "Find employees with designation 'Manager'.",
        "fields": ["designation"]
    },
    # Item
    {
        "doctype": "Item",
        "question": "List items in item group 'Electronics'.",
        "fields": ["item_group"]
    },
    {
        "doctype": "Item",
        "question": "Find items with valuation rate above 5000.",
        "fields": ["valuation_rate"]
    },
    # --- Batch 2: reworded variants of the questions above, plus Supplier and Project ---
    # Sales Invoice
    {
        "doctype": "Sales Invoice",
        "question": "Get all sales invoices for customer 'ABC Ltd'.",
        "fields": ["customer"]
    },
    {
        "doctype": "Sales Invoice",
        "question": "Find invoices with total more than 1000.",
        "fields": ["total"]
    },

    # Purchase Invoice
    {
        "doctype": "Purchase Invoice",
        "question": "Get all purchase invoices for supplier 'XYZ Pvt Ltd'.",
        "fields": ["supplier"]
    },
    {
        "doctype": "Purchase Invoice",
        "question": "Show invoices posted in July 2025.",
        "fields": ["posting_date"]
    },

    # Customer
    {
        "doctype": "Customer",
        "question": "List customers in group 'Retail'.",
        "fields": ["customer_group"]
    },
    {
        "doctype": "Customer",
        "question": "Find customers in 'North' territory.",
        "fields": ["territory"]
    },

    # Employee
    {
        "doctype": "Employee",
        "question": "List employees in 'Sales' department.",
        "fields": ["department"]
    },
    {
        "doctype": "Employee",
        "question": "Find employees with designation 'Manager'.",
        "fields": ["designation"]
    },

    # Item
    {
        "doctype": "Item",
        "question": "List items in group 'Electronics'.",
        "fields": ["item_group"]
    },
    {
        "doctype": "Item",
        "question": "Find items with price above 5000.",
        "fields": ["valuation_rate"]
    },

    # Supplier
    {
        "doctype": "Supplier",
        "question": "List suppliers in country 'India'.",
        "fields": ["country"]
    },
    {
        "doctype": "Supplier",
        "question": "Find suppliers with status 'Active'.",
        "fields": ["status"]
    },

    # Project
    {
        "doctype": "Project",
        "question": "List projects in 'Construction' category.",
        "fields": ["project_name", "project_type"]
    },
    {
        "doctype": "Project",
        "question": "Get projects owned by 'John Doe'.",
        "fields": ["owner"]
    },
    # --- Batch 3: "get all" and "count" questions, all using the "name" field ---
    # Sales Invoice
    {
        "doctype": "Sales Invoice",
        "question": "Get all sales invoices.",
        "fields": ["name"]
    },
    {
        "doctype": "Sales Invoice",
        "question": "Count total sales invoices.",
        "fields": ["name"]
    },

    # Purchase Invoice
    {
        "doctype": "Purchase Invoice",
        "question": "Get all purchase invoices.",
        "fields": ["name"]
    },
    {
        "doctype": "Purchase Invoice",
        "question": "Count total purchase invoices.",
        "fields": ["name"]
    },

    # Customer
    {
        "doctype": "Customer",
        "question": "Get all customers.",
        "fields": ["name"]
    },
    {
        "doctype": "Customer",
        "question": "Count total customers.",
        "fields": ["name"]
    },

    # Employee
    {
        "doctype": "Employee",
        "question": "Get all employees.",
        "fields": ["name"]
    },
    {
        "doctype": "Employee",
        "question": "Count total employees.",
        "fields": ["name"]
    },

    # Item
    {
        "doctype": "Item",
        "question": "Get all items.",
        "fields": ["name"]
    },
    {
        "doctype": "Item",
        "question": "Count total items.",
        "fields": ["name"]
    },

    # Supplier
    {
        "doctype": "Supplier",
        "question": "Get all suppliers.",
        "fields": ["name"]
    },
    {
        "doctype": "Supplier",
        "question": "Count total suppliers.",
        "fields": ["name"]
    }
]




# Run every sample prompt through the model and print the prompt next to the
# query it generates. Examples are numbered from 1.
for idx, item in enumerate(prompts, start=1):
    test_prompt = f"""
Instruction: Generate the correct Frappe query for the given question, using the provided doctype and fields.
Doctype: {item['doctype']}
Question: {item['question']}
Fields: {item['fields']}
"""
    # The template starts and ends with a newline; the model and the printout
    # both use the stripped text.
    prompt_text = test_prompt.strip()
    inputs = tokenizer(prompt_text, return_tensors="pt")

    # Beam search without sampling, generation capped at max_length=128.
    generation_options = dict(max_length=128, do_sample=False, num_beams=4, early_stopping=True)
    output_ids = model.generate(input_ids=inputs["input_ids"], **generation_options)

    # Only the first returned sequence is decoded.
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"\n🔢 Example {idx}")
    print("🗒️ Prompt:\n", prompt_text)
    print("📄 Generated Query:\n", output_text)


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the tokenizer and model stored in the S3_test directory.
model_path = "/content/drive/MyDrive/Changai/S3_test"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# 🔍 One hand-written prompt; here the fields are a comma-separated string
test_prompt = """Instruction: Generate the correct Frappe query for the given question, using the provided doctype and fields.
Doctype: Company
Question: Show me company details where the name is 'France'.
Fields: name, country"""

inputs = tokenizer(test_prompt, return_tensors="pt")

# Beam search without sampling, generation capped at max_length=128.
generation_options = dict(max_length=128, do_sample=False, num_beams=4, early_stopping=True)
output_ids = model.generate(input_ids=inputs["input_ids"], **generation_options)

# ✅ Decode with special tokens kept, then remove the pad and end-of-sequence
# markers by hand before printing
output_text = (
    tokenizer.decode(output_ids[0], skip_special_tokens=False)
    .replace("<pad>", "")
    .replace("</s>", "")
)
print("📄 Generated Query:\n", output_text)


In [None]:
# Point model_path at the directory that the upload cell below publishes, and
# load its model and tokenizer. This cell has no imports of its own, so it
# relies on the transformers imports made by an earlier cell.
model_path="/content/drive/MyDrive/Changai/S3_test"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

In [None]:
# Log this notebook session in to the Hugging Face Hub before creating the
# repository and uploading.
# NOTE(review): HfApi and HfFolder are imported but not used in the cells visible here.
from huggingface_hub import HfApi, HfFolder, notebook_login
notebook_login()

In [None]:
from huggingface_hub import create_repo

# Create the private target repository. `exist_ok=True` keeps the cell safe to
# re-run: without it the call raises once the repository already exists, and
# this notebook already loads a checkpoint published under the same name.
# NOTE(review): no namespace is given here while the upload cell targets
# "hyrinmansoor/..." — confirm both refer to the same account.
create_repo("text2frappe-s3-flan-query", private=True, exist_ok=True)

In [None]:
from huggingface_hub import upload_folder

# Push everything under `model_path` to the root of the model repository.
upload_folder(
    folder_path=model_path,
    repo_id="hyrinmansoor/text2frappe-s3-flan-query",
    repo_type="model",
    path_in_repo=".",
)
