In [2]:
import json

# Instruction header placed at the top of every enhanced prompt. It tells the
# model how to extract SPO triples and which output format to use.
# The text is assembled line by line; the empty entry produces the blank line
# between the task description and the bullet list. The wording (including the
# double space after "provided") is kept verbatim because it is prompt text.
INSTRUCTION_BLOCK = "\n".join(
    [
        "### INSTRUCTION:",
        "Your task is to analyze the input text and extract high-quality subject-predicate-object (SPO) triples suitable for constructing a knowledge graph.",
        "",
        "- Use the provided  Ontology Concepts and Ontology Relations as your primary reference.",
        "- You may include new concepts or relations only if they are clearly and unambiguously implied by the sentence.",
        "- Ensure all extracted relations are domain-relevant and meaningful.",
        "- Avoid inferring or hallucinating facts not stated in the text.",
        "- Output only the SPO triples in the format: predicate(subject, object), one per line.",
        "- Do NOT include extra commentary, headers, bullet points, or JSON.",
        "- Be concise, clean, and precise in your extraction.",
    ]
)

def enhance_prompt(original_prompt):
    """
    Restructure a raw prompt into an instruction-driven prompt.

    The raw prompt is expected to contain the markers 'Ontology Concepts:',
    'Example Sentence:', 'Example Output:', 'Test Sentence:' and
    'Test Output:' in exactly that order. Each marker is searched for only
    after the previous one, so a prompt whose markers are missing or out of
    order is treated as malformed instead of being sliced into empty or
    overlapping sections.

    Args:
        original_prompt (str): Raw prompt text.

    Returns:
        str: INSTRUCTION_BLOCK followed by the five sections, each introduced
        by '### '. Any text before 'Ontology Concepts:' is discarded. If the
        prompt is malformed, a notice is printed and original_prompt is
        returned unchanged.
    """
    section_markers = (
        "Ontology Concepts:",
        "Example Sentence:",
        "Example Output:",
        "Test Sentence:",
        "Test Output:",
    )

    # Locate every marker, resuming the search right after the previous one so
    # that the resulting start offsets are guaranteed to be strictly increasing.
    starts = []
    cursor = 0
    for marker in section_markers:
        position = original_prompt.find(marker, cursor)
        if position == -1:
            reason = f"marker {marker!r} is missing or out of order"
            print("Skipping malformed prompt:", reason)
            return original_prompt  # fallback to original if parsing fails
        starts.append(position)
        cursor = position + len(marker)

    # Each section runs from its own marker up to the next marker; the last
    # section runs to the end of the prompt.
    bounds = starts + [len(original_prompt)]
    context, example_sentence, example_output, test_sentence, test_output = (
        original_prompt[begin:end].strip()
        for begin, end in zip(bounds, bounds[1:])
    )

    # Rebuild with improved structure and instruction block. The sliced
    # sections still start with their marker text, which doubles as the title
    # after the '### ' prefix.
    return (
        f"{INSTRUCTION_BLOCK}\n\n"
        f"### CONTEXT:\n{context}\n\n"
        f"### {example_sentence}\n\n"
        f"### {example_output}\n\n"
        f"### {test_sentence}\n\n"
        f"### {test_output}"
    )


In [None]:
# (input file, output file) pairs, one per DBpedia ontology. The order is
# lexicographic by ontology number (1, 10, 11, ..., 19, 2, ..., 9).
input_output_paths = [
    (f"ont_{slug}_prompts.jsonl", f"ont_{slug}_prompts_improved.jsonl")
    for slug in (
        "1_university",
        "10_comicscharacter",
        "11_meanoftransportation",
        "12_monument",
        "13_food",
        "14_writtenwork",
        "15_sportsteam",
        "16_city",
        "17_artist",
        "18_scientist",
        "19_film",
        "2_musicalwork",
        "3_airport",
        "4_building",
        "5_athlete",
        "6_politician",
        "7_company",
        "8_celestialbody",
        "9_astronaut",
    )
]

# Directories holding the baseline prompts and receiving the improved ones.
# Both end with a slash so a file name can be appended directly.
base_input_path = "/upb/users/b/balram/profiles/unix/cs/Text2KG_exp1_thesis/data/input/dbpedia/baseline_input_prompts/"
base_output_path = "/upb/users/b/balram/profiles/unix/cs/Text2KG_exp1_thesis/data/input/dbpedia/improved_input_prompts/"

# Sanity check: list the full path of every input file. Output paths are not
# built or printed in this cell.
for in_file, out_file in input_output_paths:
    input_path = f"{base_input_path}{in_file}"
    print(input_path)
