In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from, and
# save the model to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
#✅ Step 1: Install dependencies
!pip install transformers datasets accelerate

from google.colab import drive
drive.mount("/content/drive")

# ✅ Step 2: Imports
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import json
import torch

In [None]:
# ✅ Step 3: Load the raw JSON dataset from Google Drive.
# The instruction text is not taken from the file; it is added to every prompt
# when the records are preprocessed in a later cell.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
with open('/content/drive/MyDrive/Changai/S3/Datasets/S3_flan_new.json', encoding="utf-8") as f:
    raw_data = json.load(f)

In [None]:
!pip install transformers datasets torch

In [None]:
# Standard Library
import json

# Hugging Face Libraries
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq

In [None]:
INSTRUCTION = "Generate the correct Frappe query for the given question, using the provided doctype and fields."

# Flatten every raw record into one prompt string (instruction, doctype,
# question, fields — in that order) paired with its target output.
# Records whose "input" is not a dict, or that lack a required key, are
# skipped and counted so data problems are visible instead of silent.
# NOTE(review): "fields" is interpolated as-is; if it is a list it will be
# rendered as its Python repr — confirm this is the intended prompt format.
processed_data = []
skipped_count = 0
for entry in raw_data:
    try:
        input_data = entry["input"]
        if not isinstance(input_data, dict):
            skipped_count += 1
            continue  # malformed entry: "input" must be a dict

        prompt = (
            f"Instruction: {INSTRUCTION}\n"
            f"Doctype: {input_data['doctype']}\n"
            f"Question: {input_data['question']}\n"
            f"Fields: {input_data['fields']}"
        )

        processed_data.append({
            "input": prompt,
            "output": entry["output"]
        })

    except (KeyError, TypeError) as e:
        # KeyError: a required key is missing.
        # TypeError: the entry itself cannot be indexed by key (e.g. it is a
        # list, string or None rather than a dict).
        skipped_count += 1
        print("Skipping due to error:", e)

print(f"Prepared {len(processed_data)} examples; skipped {skipped_count} malformed entries.")


# ✅ Step 4: Convert to HuggingFace Dataset
# A fixed seed makes the 90/10 train/test split identical on every run, so
# evaluation results from different runs are comparable.
SPLIT_SEED = 42
dataset = Dataset.from_list(processed_data)
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# ✅ Step 5: Tokenization
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

def tokenize_function(example):
    """Tokenize one example's prompt and target for seq2seq training.

    Args:
        example: Mapping with text under the "input" and "output" keys.

    Returns:
        The tokenizer's encoding of "input" (truncated to 512 tokens) with an
        added "labels" entry holding the token ids of "output" (truncated to
        128 tokens). Nothing is padded here.
    """
    # Padding is deliberately left to the DataCollatorForSeq2Seq used by the
    # trainer: it pads each batch to its longest sequence and fills label
    # padding with -100, which the loss ignores. Padding labels to max_length
    # with the pad token id here would make padded positions count toward
    # the loss.
    model_inputs = tokenizer(example["input"], max_length=512, truncation=True)
    labels = tokenizer(example["output"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=False)

# ✅ Step 6: Load model
# Start from the pretrained FLAN-T5 base checkpoint.
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# ✅ Step 7: Training configuration
# Hyperparameters are gathered in a plain dict first and then expanded into
# the arguments object.
training_config = dict(
    output_dir="/content/drive/MyDrive/Changai/S3/Model/flan_s3_query_generator",
    logging_dir="./logs",
    eval_strategy="epoch",
    num_train_epochs=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=2,
    predict_with_generate=True,
)
training_args = Seq2SeqTrainingArguments(**training_config)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ✅ Step 8: Trainer setup
import inspect

# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword, while older releases only know
# `tokenizer`. Because the install step does not pin a version, pick whichever
# keyword the installed Seq2SeqTrainer actually accepts.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

# ✅ Step 9: Train
trainer.train()

# Capture and print the final evaluation metrics; a bare `trainer.evaluate()`
# in the middle of a cell would discard its return value without showing it.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the fine-tuned weights and the tokenizer side by side so both can be
# reloaded from the same directory.
model.save_pretrained("/content/drive/MyDrive/Changai/S3/Model/flan_s3_query_generator")
tokenizer.save_pretrained("/content/drive/MyDrive/Changai/S3/Model/flan_s3_query_generator")

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Reload the tokenizer and the fine-tuned model from the directory the
# training cell saved them to.
saved_model_dir = '/content/drive/MyDrive/Changai/S3/Model/flan_s3_query_generator'
tokenizer = T5Tokenizer.from_pretrained(saved_model_dir)
model = T5ForConditionalGeneration.from_pretrained(saved_model_dir)

# Define test cases
# Each prompt follows the layout used to build the training prompts:
# Instruction, Doctype, Question, Fields — in that order. Keeping the same
# order at inference time avoids a mismatch between what the model was
# fine-tuned on and what it is asked here.
# NOTE(review): the Fields value is written as a bare field name; confirm it
# matches how fields were rendered in the training data.
test_cases = [
    """Instruction: Generate the correct Frappe query for the given question, using the provided doctype and fields.
Doctype: Company
Question: What's the full name of our primary registered company named 'HTS Pvt Ltd'?
Fields: company_name""",

    """Instruction: Generate the correct Frappe query for the given question, using the provided doctype and fields.
Doctype: Company
Question: Who is listed as the parent company of 'FusionCorp International'?
Fields: parent_company""",

    """Instruction: Generate the correct Frappe query for the given question, using the provided doctype and fields.
Doctype: Finance Book
Question: Can I rename or update the name of an existing finance book?
Fields: finance_book_name"""
]

# Define generation function
def generate_query(test_input):
    """Generate a query string for one prompt using the loaded model.

    Args:
        test_input: Prompt text to feed to the model.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    inputs = tokenizer(test_input, return_tensors="pt", max_length=512, truncation=True, padding=True)
    # Keep the input tensors on the same device as the model so generation
    # also works when the model has been moved off the CPU.
    inputs = inputs.to(model.device)
    # Pass the whole encoding (not just input_ids) so the attention mask
    # produced by the tokenizer is used during generation.
    outputs = model.generate(**inputs, max_length=512, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Run test cases
# For every prompt: print a numbered banner and the prompt itself, then the
# query the model generates, followed by blank lines as a separator.
case_number = 0
for prompt in test_cases:
    case_number += 1
    banner = "\n--- Test Case " + str(case_number) + " ---"
    print(banner, "Input:", prompt, "\nGenerated Query:", sep="\n")
    print(generate_query(prompt), end="\n\n\n")
