In [None]:
!pip install transformers sentence-transformers gradio

import gradio as gr
import torch
import json
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    T5Tokenizer, T5ForConditionalGeneration
)
from sentence_transformers import SentenceTransformer, util

In [None]:
# Authenticate this session with the Hugging Face Hub. login() is called without
# arguments, so no access token is hardcoded in the notebook.
# NOTE(review): presumably required for the hyrinmansoor/* checkpoints loaded further
# down; confirm whether those repositories are private.
from huggingface_hub import login
login()

In [None]:
def select_fields_with_flan(doctype, question, top_fields):
    """Ask the stage-2 FLAN model which of ``top_fields`` answer ``question``.

    Relies on the module-level ``tokenizer_s2`` and ``model_s2`` being loaded
    before the call. Note that this notebook defines the function a second
    time further down; whichever cell ran last provides the active definition.

    Args:
        doctype: Doctype name inserted into the prompt.
        question: Natural-language question inserted into the prompt.
        top_fields: Candidate field names (strings); joined with ", " in the prompt.

    Returns:
        List of field names parsed from the model's comma-separated output,
        with surrounding whitespace removed and empty entries dropped (so an
        empty generation yields ``[]`` rather than ``[""]``).
    """
    prompt = (
        "Instruction: Select only the correct field(s) from the given top fields that answer the question.\n"
        f"Doctype: {doctype}\n"
        f"Question: {question}\n"
        f"Top Fields: {', '.join(top_fields)}"
    )
    input_ids = tokenizer_s2(prompt, return_tensors="pt").input_ids
    output_ids = model_s2.generate(input_ids, max_length=64)
    decoded = tokenizer_s2.decode(output_ids[0], skip_special_tokens=True)
    # Split on the bare comma and strip each piece: the generated text is not
    # guaranteed to use exactly ", " as its separator.
    candidates = (piece.strip() for piece in decoded.split(","))
    return [name for name in candidates if name]
# Ad-hoc smoke test of stage-2 field selection on a fixed Journal Entry example.
# Any failure is reported and leaves selected_fields as an empty list.
# NOTE(review): tokenizer_s2 / model_s2 are only loaded in a later cell, so on a
# fresh kernel this cell ends up in the except branch.
doctype = "Journal Entry"
question = "Fetch all journal entries created last month."
top_fields = [
    "posting_date",
    "due_date",
    "cheque_date",
    "clearance_date",
]
try:
    selected_fields = select_fields_with_flan(doctype, question, top_fields)
    print("✅ Selected Fields (FLAN):", selected_fields)
except Exception as flan_err:
    print(f"❌ Error in select_fields_with_flan: {flan_err}")
    selected_fields = []

In [None]:
# Class-index -> doctype-name mapping; predict_doctype looks it up with the
# stringified argmax index. The same file is read again in the model-loading cell.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive; no mount
# step is visible in this notebook - confirm.
# JSON text is UTF-8, so decode explicitly instead of relying on the locale default.
with open("/content/drive/MyDrive/Changai/S1/Datasets/id2label", encoding="utf-8") as f:
    id2label = json.load(f)

In [None]:
# Stage 1: Doctype Classifier (RoBERTa)
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
s1_model = "hyrinmansoor/text2frappe-s1-roberta"
tokenizer_s1 = RobertaTokenizerFast.from_pretrained(s1_model)
model_s1 = RobertaForSequenceClassification.from_pretrained(s1_model)

# Doctype metadata: get_top_k_fields reads meta[doctype]["fields"] as the list of
# candidate field names for a doctype. (This is not the label mapping; that is
# id2label, loaded at the end of this cell.)
# JSON text is UTF-8, so decode explicitly instead of relying on the locale default.
with open("/content/drive/MyDrive/Changai/meta.json", encoding="utf-8") as f:
    meta = json.load(f)

# Stage 2 (retrieval): sentence encoder used to rank a doctype's fields against the question.
sbert = SentenceTransformer("hyrinmansoor/text2frappe-s2-sbert")

# Stage 2 (selection): FLAN-T5 model that picks fields from the retrieved candidates.
s2_model = "hyrinmansoor/text2frappe-s2-flan-field"
tokenizer_s2 = T5Tokenizer.from_pretrained(s2_model)
model_s2 = T5ForConditionalGeneration.from_pretrained(s2_model)

# Stage 3: FLAN-T5 model that generates the Frappe query text.
s3_model = "hyrinmansoor/text2frappe-s3-flan-query"
tokenizer_s3 = T5Tokenizer.from_pretrained(s3_model)
model_s3 = T5ForConditionalGeneration.from_pretrained(s3_model)

# Class-index -> doctype-name mapping used by predict_doctype. Loaded here as well so
# that this cell works on its own, without the earlier id2label cell having been run.
with open("/content/drive/MyDrive/Changai/S1/Datasets/id2label", encoding="utf-8") as f:
    id2label = json.load(f)

def predict_doctype(question):
    """Classify ``question`` into a doctype name with the stage-1 model.

    Uses the module-level ``tokenizer_s1``, ``model_s1`` and ``id2label``.
    The highest-scoring class index is converted to a string and looked up
    in ``id2label``.

    Args:
        question: Natural-language question to classify.

    Returns:
        The mapped doctype name, or ``"Unknown"`` when the index has no entry
        in ``id2label`` or when any exception occurs (the exception is printed,
        not raised).
    """
    try:
        encoded = tokenizer_s1(question, return_tensors="pt", truncation=True)
        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model_s1(**encoded).logits

        # id2label is looked up with a string key, hence the str() conversion.
        pred_id = str(logits.argmax().item())
        print("🔢 Predicted ID:", pred_id)

        predicted_doctype = id2label.get(pred_id, "Unknown")
        print("📄 Predicted Doctype:", predicted_doctype)
    except Exception as err:
        print("❌ Exception in predict_doctype:", str(err))
        return "Unknown"
    return predicted_doctype

def get_top_k_fields(question, doctype, k=4):
    """Rank a doctype's fields against the question and return the best ``k``.

    Uses the module-level ``meta`` (field lists per doctype), ``sbert``
    (encoder) and ``util.pytorch_cos_sim`` (similarity). The question is
    embedded together with the doctype name; every field name is embedded and
    scored against it.

    Args:
        question: Natural-language question.
        doctype: Doctype whose ``meta[doctype]["fields"]`` list is ranked.
        k: Maximum number of fields to return (capped at the number of fields).

    Returns:
        Field names ordered as returned by ``torch.topk`` over the similarity
        scores. An empty list when the doctype has no metadata, its field
        list is empty, or any exception occurs (the exception is printed, not
        raised).
    """
    try:
        has_fields = doctype in meta and "fields" in meta[doctype]
        if not has_fields:
            # Raised inside the try on purpose: the handler below reports it and returns [].
            raise ValueError(f"❌ No metadata or fields found for doctype '{doctype}'")

        candidates = meta[doctype]["fields"]
        print(f"📋 Fields for {doctype}: {candidates}")

        if not candidates:
            print(f"⚠️ Field list is empty for doctype: {doctype}")
            return []

        query_text = f"Doctype: {doctype}\nQuestion: {question}"
        query_vec = sbert.encode(query_text, convert_to_tensor=True)
        candidate_vecs = sbert.encode(candidates, convert_to_tensor=True)

        # Row 0 holds the scores of the single query against every candidate field.
        similarities = util.pytorch_cos_sim(query_vec, candidate_vecs)[0]
        limit = min(k, len(candidates))
        best_positions = torch.topk(similarities, k=limit).indices.tolist()

        return [candidates[pos] for pos in best_positions]

    except Exception as err:
        print("❌ Exception in get_top_k_fields:", str(err))
        return []

def select_fields_with_flan(doctype, question, top_fields):
    """Ask the stage-2 FLAN model which of ``top_fields`` answer ``question``.

    Relies on the module-level ``tokenizer_s2`` and ``model_s2``. Note that an
    earlier cell of this notebook also defines a function with this name;
    whichever cell ran last provides the active definition.

    Args:
        doctype: Doctype name inserted into the prompt.
        question: Natural-language question inserted into the prompt.
        top_fields: Candidate field names (strings); joined with ", " in the prompt.

    Returns:
        List of field names parsed from the model's comma-separated output,
        with surrounding whitespace removed and empty entries dropped (so an
        empty generation yields ``[]`` rather than ``[""]``).
    """
    prompt = (
        "Instruction: Select only the correct field(s) from the given top fields that answer the question.\n"
        f"Doctype: {doctype}\n"
        f"Question: {question}\n"
        f"Top Fields: {', '.join(top_fields)}"
    )
    input_ids = tokenizer_s2(prompt, return_tensors="pt").input_ids
    output_ids = model_s2.generate(input_ids, max_length=64)
    decoded = tokenizer_s2.decode(output_ids[0], skip_special_tokens=True)
    # Split on the bare comma and strip each piece: the generated text is not
    # guaranteed to use exactly ", " as its separator.
    candidates = (piece.strip() for piece in decoded.split(","))
    return [name for name in candidates if name]

def generate_frappe_query(doctype, question, fields):
    """Generate Frappe query text with the stage-3 FLAN model.

    Uses the module-level ``tokenizer_s3`` and ``model_s3``.

    Args:
        doctype: Doctype name inserted into the prompt.
        question: Natural-language question inserted into the prompt.
        fields: Field names (strings); joined with ", " in the prompt.

    Returns:
        The decoded generation with every ``<pad>`` and ``</s>`` marker
        removed. The text is otherwise returned as decoded (not stripped).
    """
    prompt = (
        "Generate the correct Frappe query for the given question, using the provided doctype and fields.\n"
        f"Doctype: {doctype}\n"
        f"Question: {question}\n"
        f"Fields: {', '.join(fields)}"
    )
    encoded_prompt = tokenizer_s3(prompt, return_tensors="pt").input_ids
    generated = model_s3.generate(encoded_prompt, max_length=128)

    # Decoding keeps special tokens; only these two markers are then removed by hand.
    query_text = tokenizer_s3.decode(generated[0], skip_special_tokens=False)
    for marker in ("<pad>", "</s>"):
        query_text = query_text.replace(marker, "")
    return query_text

def full_pipeline(question):
    """Run the three-stage Text2Frappe pipeline for one question.

    Stages, in order: ``predict_doctype`` -> ``get_top_k_fields`` ->
    ``select_fields_with_flan`` -> ``generate_frappe_query``. Each stage after
    doctype prediction is guarded separately, so one failing stage degrades its
    own output instead of aborting the whole run. Progress is printed.

    Args:
        question: Natural-language question.

    Returns:
        A tuple of exactly four Markdown strings (doctype, top fields,
        selected fields, query), matching the four output components of the
        Gradio interface that calls this function. If an unexpected error
        escapes the per-stage handling, four error strings are returned and
        the last one carries the exception text.
    """
    try:
        print("\n📌 Input Question:", question)
        doctype = predict_doctype(question)
        print("✅ Predicted Doctype:", doctype)

        try:
            top_fields = get_top_k_fields(question, doctype)
            print("🔎 Top Fields (SBERT):", top_fields)
        except Exception as field_err:
            print(f"❌ Error in get_top_k_fields: {field_err}")
            # Fall back to "no candidates". The list must contain only strings,
            # because the next stage joins it into the prompt.
            top_fields = []

        try:
            selected_fields = select_fields_with_flan(doctype, question, top_fields)
            print("✅ Selected Fields (FLAN):", selected_fields)
        except Exception as flan_err:
            print(f"❌ Error in select_fields_with_flan: {flan_err}")
            selected_fields = []

        try:
            frappe_query = generate_frappe_query(doctype, question, selected_fields)
            print("✅ Frappe Query (FLAN):", frappe_query)
        except Exception as query_err:
            # Log the cause like the other stages do; do not trap KeyboardInterrupt/SystemExit.
            print(f"❌ Error in generate_frappe_query: {query_err}")
            frappe_query = "❌ Error in generate_frappe_query"

        return (
            f"📄 **Predicted Doctype**: {doctype}",
            f"🔎 **Top Fields (SBERT)**: {top_fields}",
            f"✅ **Selected Fields (FLAN)**: {selected_fields}",
            f"📌**Frappe Query (FLAN)**:{frappe_query}"
        )

    except Exception as e:
        print("❌ ERROR in full_pipeline:", str(e))
        # Exactly four values are required (one per output component), so the
        # exception text is appended to the fourth message rather than returned
        # as a fifth element.
        return (
            "❌ Error in Doctype Prediction",
            "❌ Error in SBERT or Meta Lookup",
            "❌ Error in FLAN Field Selection",
            f"❌ Error in FLAN QUERY Generation\n\n💥 Exception: {e}",
        )
# Gradio front end: one question textbox in, four Markdown panels out. The panels
# line up, in order, with the four strings returned by full_pipeline.
iface = gr.Interface(
    fn=full_pipeline,
    inputs=[gr.Textbox(label="❓ Natural Language Question")],
    outputs=[
        gr.Markdown(label=panel_label)
        for panel_label in (
            "Doctype Prediction",
            "Top Fields (SBERT)",
            "Selected Fields (FLAN)",
            "Frappe Query (FLAN)",
        )
    ],
    title="🔗 Text2Frappe End-to-End Demo",
    description=(
        "This pipeline integrates Stage 1 (RoBERTa), Stage 2 Hybrid (SBERT + FLAN), "
        "and Stage 3 (FLAN-T5) to convert natural language questions into ERPNext queries."
    ),
)
iface.launch()
