In [4]:
# ✅ JSONL extractor: split Example triple into {sub, rel, obj} dicts
# ---------------------------------------------------------------
# Output JSONL per line:
# {
#   "id": "...",
#   "Example sentence": "...",
#   "Example triples": [{"sub":"...","rel":"...","obj":"..."}, ...]
# }

from pathlib import Path
import json
import re
from typing import Dict, Any, Optional, Tuple, List

# --- CHANGE THESE PATHS ---
input_path = Path("/upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/dbpedia/ont_10_comicscharacter_prompts.jsonl")
output_path = Path("/upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_10_comicscharacter_few_shot.jsonl")
# --------------------------

# Regex to capture Example Sentence / Example Output blocks from the prompt
EX_SENTENCE_RE = re.compile(
    r'Example\s*Sentence\s*:\s*(.+?)\s*(?=\n\s*Example\s*Output\b|\Z)',
    flags=re.IGNORECASE | re.DOTALL
)
EX_OUTPUT_RE = re.compile(
    r'Example\s*Output\s*:\s*(.+?)\s*(?=\n\s*(?:Test\s*Sentence\b|Test\s*Output\b|Example\s*Sentence\b)|\Z)',
    flags=re.IGNORECASE | re.DOTALL
)

# Regex to capture triples like:
#   rel(sub, obj)
#   rel("sub with spaces", "obj with spaces")
#   rel(University of X, 250)
TRIPLE_RE = re.compile(
    r'\b(?P<rel>[A-Za-z_][A-Za-z0-9_]*)\s*'
    r'\(\s*(?P<sub>"[^"]*"|[^,()]+?)\s*,\s*(?P<obj>"[^"]*"|[^()]+?)\s*\)',
    flags=re.DOTALL
)

def _strip_outer_quotes(s: str) -> str:
    s = s.strip()
    if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
        return s[1:-1]
    return s

def _clean_text_block(text: str) -> str:
    lines = [ln.rstrip() for ln in text.strip().splitlines()]
    return " ".join([ln.strip() for ln in lines]).strip()

def parse_prompt_for_example(prompt: str) -> Tuple[Optional[str], Optional[str]]:
    sent_match = EX_SENTENCE_RE.search(prompt or "")
    out_match = EX_OUTPUT_RE.search(prompt or "")
    sent = _clean_text_block(sent_match.group(1)) if sent_match else None
    out = " ".join([ln.strip() for ln in (out_match.group(1).strip().splitlines())]) if out_match else None
    return sent, out

def triples_from_text(triple_text: Optional[str]) -> List[Dict[str, str]]:
    """Parse all relation(subject, object) triples into list of dicts."""
    if not triple_text:
        return []
    triples = []
    for m in TRIPLE_RE.finditer(triple_text):
        rel = m.group("rel").strip()
        sub = _strip_outer_quotes(m.group("sub")).strip()
        obj = _strip_outer_quotes(m.group("obj")).strip()
        triples.append({"sub": sub, "rel": rel, "obj": obj})
    return triples

def extract_fields(src: Path, dst: Path) -> Dict[str, int]:
    total, written, skipped = 0, 0, 0
    if str(dst.parent):
        dst.parent.mkdir(parents=True, exist_ok=True)

    with src.open("r", encoding="utf-8") as fin, dst.open("w", encoding="utf-8") as fout:
        for line in fin:
            raw = line.strip()
            if not raw:
                continue
            total += 1
            try:
                obj = json.loads(raw)
            except json.JSONDecodeError:
                skipped += 1
                continue

            _id = obj.get("id")
            prompt = obj.get("prompt", "")

            ex_sentence, ex_output_text = parse_prompt_for_example(prompt)
            print('ex_sentence123: ',ex_sentence)
            print('ex_output_text123',ex_output_text )
            ex_triples = triples_from_text(ex_output_text)

            record = {
                "id": _id,
                "Example sentence": ex_sentence,
                "Example triples output": ex_triples,
            }
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")
            written += 1

    return {"total_read": total, "total_written": written, "total_skipped": skipped}

# --- Run extraction ---
stats = extract_fields(input_path, output_path)

print("\n✅ Done.")
print(f"Read: {stats['total_read']} lines")
print(f"Wrote: {stats['total_written']} lines to: {output_path}")
print(f"Skipped (malformed JSON only): {stats['total_skipped']}")

# --- Preview first 3 lines of the output ---
print("\nFirst 3 lines of the output:")
try:
    with output_path.open("r", encoding="utf-8") as f:
        for i, l in enumerate(f):
            if i >= 3:
                break
            print(l.rstrip())
except FileNotFoundError:
    print("Output file not found.")


ex_sentence123:  The comic character Asterix, was created by René Goscinny and Albert Uderzo.
ex_output_text123 creator(Asterix (comicsCharacter), René Goscinny) alternativeName(Asterix (comicsCharacter), "Astérix") creator(Asterix (comicsCharacter), Albert Uderzo)
ex_sentence123:  The comic character Asterix, was created by René Goscinny and Albert Uderzo.
ex_output_text123 creator(Asterix (comicsCharacter), René Goscinny) alternativeName(Asterix (comicsCharacter), "Astérix") creator(Asterix (comicsCharacter), Albert Uderzo)
ex_sentence123:  The comic character Asterix, was created by René Goscinny and Albert Uderzo.
ex_output_text123 creator(Asterix (comicsCharacter), René Goscinny) alternativeName(Asterix (comicsCharacter), "Astérix") creator(Asterix (comicsCharacter), Albert Uderzo)
ex_sentence123:  The comic book character Ben Urich was created by Gene Colan and the comic book writer Roger Mckenzie, who is a US national.
ex_output_text123 creator(Ben Urich, Roger McKenzie (comic b

In [7]:
# Below is to run the dbpedia datstes all 19 file
import re
from pathlib import Path

# ---- keep these base paths fixed (do not change) ----
BASE_IN_PROMPTS = Path("/upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/dbpedia/")
BASE_OUT_FEWS   = Path("/upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/")

# The 19 filenames from your screenshot (used only to derive idx+category)
FILENAMES = [
    "ont_1_university_test.jsonl",
    "ont_10_comicscharacter_test.jsonl",
    "ont_11_meanoftransportation_test.jsonl",
    "ont_12_monument_test.jsonl",
    "ont_13_food_test.jsonl",
    "ont_14_writtenwork_test.jsonl",
    "ont_15_sportsteam_test.jsonl",
    "ont_16_city_test.jsonl",
    "ont_17_artist_test.jsonl",
    "ont_18_scientist_test.jsonl",
    "ont_19_film_test.jsonl",
    "ont_2_musicalwork_test.jsonl",
    "ont_3_airport_test.jsonl",
    "ont_4_building_test.jsonl",
    "ont_5_athlete_test.jsonl",
    "ont_6_politician_test.jsonl",
    "ont_7_company_test.jsonl",
    "ont_8_celestialbody_test.jsonl",
    "ont_9_astronaut_test.jsonl",
]

# Extract idx & category from the above names
PATTERN = re.compile(r"^ont_(\d+)_([a-z]+)_test\.jsonl$")

def derive_paths(fname: str):
    m = PATTERN.match(fname)
    if not m:
        raise ValueError(f"Unexpected filename format: {fname}")
    idx, cat = m.groups()
    # Build the prompt input and few-shot output filenames (paths are fixed)
    input_path  = BASE_IN_PROMPTS / f"ont_{idx}_{cat}_prompts.jsonl"
    output_path = BASE_OUT_FEWS   / f"ont_{idx}_{cat}_few_shot.jsonl"
    return input_path, output_path, f"ont_{idx}_{cat}"

# ---- run extractor for all 19 files ----
all_stats = {}
for fname in FILENAMES:
    try:
        input_path, output_path, tag = derive_paths(fname)
        print("\n" + "="*80)
        print(f"[RUN] {tag}")
        print("INPUT :", input_path)
        print("OUTPUT:", output_path)

        stats = extract_fields(input_path, output_path)   # uses your function as-is

        print(f"[DONE] {tag} | read={stats['total_read']} wrote={stats['total_written']} skipped={stats['total_skipped']}")
        all_stats[tag] = stats
    except Exception as e:
        print(f"[ERROR] {fname}: {e}")

# Optional: summary
print("\nSummary:")
for tag, s in all_stats.items():
    print(f" - {tag}: read={s['total_read']}, wrote={s['total_written']}, skipped={s['total_skipped']}")



[RUN] ont_1_university
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/dbpedia/ont_1_university_prompts.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_1_university_few_shot.jsonl
[DONE] ont_1_university | read=71 wrote=71 skipped=0

[RUN] ont_10_comicscharacter
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/dbpedia/ont_10_comicscharacter_prompts.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_10_comicscharacter_few_shot.jsonl
[DONE] ont_10_comicscharacter | read=36 wrote=36 skipped=0

[RUN] ont_11_meanoftransportation
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/dbpedia/ont_11_meanoftransportation_prompts.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_11_meanoftransportation_few_shot.jsonl
[DONE] ont_11_meanoftransportation | read=92 wrote=92 skipped=0

[RUN

In [6]:
# Below is to run the wikidata datstes all 19 file
import re
from pathlib import Path

# ---- keep these base paths fixed (do not change) ----
BASE_IN_PROMPTS = Path("/upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/wikidata/")
BASE_OUT_FEWS   = Path("/upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/wikidata/")

# The 19 filenames from your screenshot (used only to derive idx+category)
FILENAMES = [
    "ont_1_movie_test.jsonl",
    "ont_2_music_test.jsonl",
    "ont_3_sport_test.jsonl",
    "ont_4_book_test.jsonl",
    "ont_5_military_test.jsonl",
    "ont_6_computer_test.jsonl",
    "ont_7_space_test.jsonl",
    "ont_8_politics_test.jsonl",
    "ont_9_nature_test.jsonl",
    "ont_10_culture_test.jsonl",
]


# Extract idx & category from the above names
PATTERN = re.compile(r"^ont_(\d+)_([a-z]+)_test\.jsonl$")

def derive_paths(fname: str):
    m = PATTERN.match(fname)
    if not m:
        raise ValueError(f"Unexpected filename format: {fname}")
    idx, cat = m.groups()
    # Build the prompt input and few-shot output filenames (paths are fixed)
    input_path  = BASE_IN_PROMPTS / f"ont_{idx}_{cat}_prompts.jsonl"
    output_path = BASE_OUT_FEWS   / f"ont_{idx}_{cat}_few_shot.jsonl"
    return input_path, output_path, f"ont_{idx}_{cat}"

# ---- run extractor for all 19 files ----
all_stats = {}
for fname in FILENAMES:
    try:
        input_path, output_path, tag = derive_paths(fname)
        print("\n" + "="*80)
        print(f"[RUN] {tag}")
        print("INPUT :", input_path)
        print("OUTPUT:", output_path)

        stats = extract_fields(input_path, output_path)   # uses your function as-is

        print(f"[DONE] {tag} | read={stats['total_read']} wrote={stats['total_written']} skipped={stats['total_skipped']}")
        all_stats[tag] = stats
    except Exception as e:
        print(f"[ERROR] {fname}: {e}")

# Optional: summary
print("\nSummary:")
for tag, s in all_stats.items():
    print(f" - {tag}: read={s['total_read']}, wrote={s['total_written']}, skipped={s['total_skipped']}")



[RUN] ont_1_movie
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/wikidata/ont_1_movie_prompts.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/wikidata/ont_1_movie_few_shot.jsonl
[DONE] ont_1_movie | read=840 wrote=840 skipped=0

[RUN] ont_2_music
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/wikidata/ont_2_music_prompts.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/wikidata/ont_2_music_few_shot.jsonl
[DONE] ont_2_music | read=675 wrote=675 skipped=0

[RUN] ont_3_sport
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/wikidata/ont_3_sport_prompts.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/wikidata/ont_3_sport_few_shot.jsonl
[DONE] ont_3_sport | read=487 wrote=487 skipped=0

[RUN] ont_4_book
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/benchmark_prompts/wikidata/ont_4_book_prompt