In [None]:
!pip install transformers datasets torch

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from and
# write model files to paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# ✅ Step 3: Load your JSON dataset and preprocess (REMOVE instruction, add instruction dynamically)
# `json` is imported here because the notebook's shared import cell comes AFTER
# this cell; without it a fresh "Restart & Run All" fails with NameError.
import json

# NOTE(review): a later cell reloads `raw_data` from '/content/S2_flan_new.json',
# overwriting what is loaded here — confirm which of the two files is intended.
with open('/content/drive/MyDrive/Changai/S2/S2 Datasets/S2_flan_new (1).json', encoding='utf-8') as f:
    raw_data = json.load(f)

In [None]:
# Standard Library
import json

# Hugging Face Libraries
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq

In [None]:
# Instruction sentence that is prepended to every training prompt.
INSTRUCTION = "Select only the correct field(s) from the given top fields that answer the question."

# Load the raw examples. The encoding is explicit so decoding does not depend
# on the platform's default locale.
with open('/content/S2_flan_new.json', encoding='utf-8') as f:
    raw_data = json.load(f)

# Build one {"input": prompt, "output": target} record per raw entry.
formatted_data = []
for entry in raw_data:
    inp = entry["input"]
    prompt = f"Instruction: {INSTRUCTION}\nDoctype: {inp['doctype']}\nQuestion: {inp['question']}\nTop Fields: {inp['top fields']}"
    fields = entry["output"]
    # str.join on a bare string would interleave ", " between its characters,
    # so a string-valued target is passed through unchanged.
    output = fields if isinstance(fields, str) else ', '.join(fields)
    formatted_data.append({"input": prompt, "output": output})

# Convert to HuggingFace Dataset
dataset = Dataset.from_list(formatted_data)
# Fixed seed so the 90/10 train/validation split is the same on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

# ======================
# STEP 2: Load FLAN-T5 and tokenizer
# ======================
# Both the tokenizer and the model weights are loaded from the same
# pretrained identifier so that their vocabularies match.
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# ======================
# STEP 3: Tokenize
# ======================
def tokenize_fn(example):
    """Tokenize prompt/target text for seq2seq fine-tuning.

    Args:
        example: A mapping with "input" and "output" text. Either a single
            record (string values) or a batch (lists of strings), as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompt (truncated/padded to 512 tokens)
        with an added "labels" entry holding the target token ids
        (truncated/padded to 64 tokens). Padding positions in the labels are
        replaced by -100 so they are excluded from the training loss.
    """
    model_inputs = tokenizer(
        example["input"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        example["output"],
        max_length=64,
        truncation=True,
        padding="max_length"
    )

    pad_id = tokenizer.pad_token_id
    label_ids = labels["input_ids"]
    if isinstance(example["output"], str):
        # Single example: input_ids is one flat list of token ids.
        masked_labels = [tok if tok != pad_id else -100 for tok in label_ids]
    else:
        # Batch: input_ids is a list of token-id lists.
        masked_labels = [
            [tok if tok != pad_id else -100 for tok in seq]
            for seq in label_ids
        ]

    # Without this masking the padded tail of every target would be scored by
    # the loss as if the pad token were a real token to predict.
    model_inputs["labels"] = masked_labels
    return model_inputs

# Tokenize both splits in batched mode (train first, then validation).
train_dataset, val_dataset = [
    split.map(tokenize_fn, batched=True)
    for split in (train_dataset, val_dataset)
]

# ======================
# STEP 4: Training setup
# ======================
# Directory used both for trainer checkpoints and for the final saved model.
final_model_dir = "/content/drive/MyDrive/Changai/S2/S2 Model/flan_field_selector_final"

training_kwargs = dict(
    output_dir=final_model_dir,
    eval_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
)
training_args = Seq2SeqTrainingArguments(**training_kwargs)

# Collator that batches the tokenized seq2seq examples for the trainer.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
)

trainer.train()

# Save the model, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)

In [None]:
# Re-imported here so this cell can be run on its own, without the import cell.
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Specific trainer checkpoint directory inside the training output directory.
# NOTE(review): the step number 180 is hard-coded — confirm this checkpoint
# exists for the run you want to export.
checkpoint_path = "/content/drive/MyDrive/Changai/S2/S2 Model/flan_field_selector_final/checkpoint-180"

# Load model and tokenizer
# This rebinds the notebook-level `model` and `tokenizer` names.
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)
tokenizer = T5Tokenizer.from_pretrained(checkpoint_path)


In [None]:
# Destination directory for the exported copy of the currently loaded model.
# NOTE(review): the directory is named "flan_s3_query_generator_new" while the
# model loaded in the previous cell comes from "flan_field_selector_final" —
# confirm the destination name is intended.
save_path = "/content/drive/MyDrive/Changai/S2/S2 Model/flan_s3_query_generator_new"

# Save model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


('/content/drive/MyDrive/Changai/S2/S2 Model/flan_s3_query_generator_new/tokenizer_config.json',
 '/content/drive/MyDrive/Changai/S2/S2 Model/flan_s3_query_generator_new/special_tokens_map.json',
 '/content/drive/MyDrive/Changai/S2/S2 Model/flan_s3_query_generator_new/spiece.model',
 '/content/drive/MyDrive/Changai/S2/S2 Model/flan_s3_query_generator_new/added_tokens.json')

In [None]:
# Evaluation prompts. Every prompt shares the same instruction and doctype
# lines, so only the (question, top-fields) pairs are spelled out and the full
# four-line prompt text is assembled below.
_instruction_line = "Instruction: Select only the correct field(s) from the given top fields that answer the question."
_doctype_line = "Doctype: Purchase Invoice Advance"

_question_field_pairs = [
    (
        "What is the document type linked to advance entry ADV-9668?",
        "[reference_name,reference_type, advance_amount, allocated_amount, ref_exchange_rate, remarks]",
    ),
    (
        "List the reference types used in advance payments made this month.",
        "[reference_type, reference_name, difference_posting_date, advance_amount, allocated_amount, exchange_gain_loss]",
    ),
    (
        "Which advance entries are connected to a 'Purchase Invoice'?",
        "[reference_type, advance_amount, reference_name, allocated_amount, exchange_gain_loss]",
    ),
    (
        "Is there any advance entry linked to an 'Expense Claim'?",
        "[reference_type, advance_amount, reference_name, allocated_amount, ref_exchange_rate]",
    ),
    (
        "Retrieve the allocated amount and document ID for the entry ADV-5412.",
        "[reference_name, allocated_amount, ref_exchange_rate, remarks, advance_amount]",
    ),
    (
        "What is the exchange rate applied to the reference document of advance ADV-7771?",
        "[ref_exchange_rate, difference_posting_date, exchange_gain_loss, reference_type]",
    ),
]

test_cases = [
    "\n".join(
        (
            _instruction_line,
            _doctype_line,
            f"Question: {question}",
            f"Top Fields: {top_fields}",
        )
    )
    for question, top_fields in _question_field_pairs
]
# Re-imported here so this cell can be run without the notebook's import cell.
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model
# Both are read from the directory the training cell saved into; this rebinds
# the notebook-level `model` and `tokenizer` names used by predict_field.
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/Changai/S2/S2 Model/flan_field_selector_final")
tokenizer = T5Tokenizer.from_pretrained("/content/drive/MyDrive/Changai/S2/S2 Model/flan_field_selector_final")

def predict_field(prompt):
    """Return the model's generated text for a single prompt.

    The prompt is tokenized into PyTorch tensors (truncated to at most 512
    tokens), passed to ``model.generate`` with ``max_length=64``, and the
    first generated sequence is decoded with special tokens skipped.

    Uses the notebook-level ``tokenizer`` and ``model``.
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generated = model.generate(encoded["input_ids"], max_length=64)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Print each test prompt followed by the model's prediction for it.
for case_number, prompt_text in enumerate(test_cases, start=1):
    print(f"\n--- Test Case {case_number} ---")
    print("Input:\n", prompt_text)
    prediction = predict_field(prompt_text)
    print("Prediction:\n", prediction)
