In [2]:
!nvidia-smi

Tue Oct 28 22:29:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:81:00.0 Off |                  N/A |
| 71%   65C    P0            293W /  370W |   14406MiB /  24576MiB |     54%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!kill 1306860

In [1]:
import sys, torch
print("Python:", sys.version)
print("Executable:", sys.executable)
print("CUDA available:", torch.cuda.is_available())


Python: 3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:45:41) [GCC 13.3.0]
Executable: /upb/users/b/balram/profiles/unix/cs/.conda/envs/kg_pipeline/bin/python3
CUDA available: True


In [2]:
# Block 0 — Imports & Config

import os, json, re, time
import torch
from textwrap import dedent
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


In [3]:
def setup_model(model_id="mistralai/Mistral-7B-Instruct-v0.3"):
    print("⏳ Loading model:", model_id)
    torch.backends.cudnn.benchmark = True

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    model.config.use_cache = True  # KV caching is fine.

    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto"
    )
    return generator, tokenizer

In [4]:
# ---------- Utilities ----------

def _quote(v: object) -> str:
    """
    Convert any value to a safe, double-quoted string for the legacy triple format.
    Escapes backslashes and double quotes.
    """
    s = str(v)
    s = s.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{s}"'

def _coerce_triples_to_str(triples_val):
    """
    Legacy helper (kept for compatibility): normalize triples into newline-separated
    lines: rel("Subject","Object"). Not used in the new JSONL prompt, but harmless.
    """
    if triples_val is None:
        return ""

    if isinstance(triples_val, str):
        return triples_val.strip()

    if isinstance(triples_val, list) and all(isinstance(x, str) for x in triples_val):
        return "\n".join(x.strip() for x in triples_val if x and x.strip())

    if isinstance(triples_val, list) and all(isinstance(x, dict) for x in triples_val):
        lines = []
        for t in triples_val:
            rel = (t.get("rel") or "").strip()
            sub = t.get("sub")
            obj = t.get("obj")
            if rel and (sub is not None) and (obj is not None):
                lines.append(f'{rel}({_quote(sub)},{_quote(obj)})')
        return "\n".join(lines)

    return str(triples_val).strip()


def _to_jsonl_block_from_list(triples_list):
    """
    Given a list[dict] of {"sub","rel","obj"}, return a JSONL string block.
    """
    if not isinstance(triples_list, list):
        return ""
    lines = []
    for t in triples_list:
        if not isinstance(t, dict):
            continue
        sub = t.get("sub")
        rel = t.get("rel")
        obj = t.get("obj")
        if sub is None or rel is None or obj is None:
            continue
        lines.append(json.dumps({"sub": sub, "rel": rel, "obj": obj}, ensure_ascii=False))
    return "\n".join(lines)


In [5]:
# ---------- Few-shot loader for FIXED schema (unchanged IO) ----------

def read_jsonl(path, max_items: int | None = None):
    """
    Yields JSON objects from a .jsonl file.
    If max_items is set, stops after that many records.
    """
    count = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            yield json.loads(line)
            count += 1
            if max_items is not None and count >= max_items:
                break


def write_jsonl(path, records):
    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False))
            f.write("\n")


def load_one_shot_examples(few_shot_jsonl_path: str) -> dict:
    """
    Loads {id: {"example_sentence": <str>, "example_triples_output": <any>}}.
    Reads fields exactly:
        "Example sentence"
        "Example triples output" (or falls back to "Example triples").
    """
    out = {}
    for rec in read_jsonl(few_shot_jsonl_path, max_items=None):
        rid = rec.get("id")
        if rid is None:
            continue
        out[str(rid).strip()] = {
            "example_sentence": rec.get("Example sentence"),
            "example_triples_output": rec.get("Example triples output", rec.get("Example triples")),
        }
    print(f"[FEW-SHOT] Loaded {len(out)} examples from: {few_shot_jsonl_path}")
    return out

In [6]:
# Block 2 — Prompt Builder (Reason → Verify → FINAL TRIPLES as JSONL)

from textwrap import dedent
import json

# Helpers (strict access; fail fast if ontology keys are missing)
def _concept_label(ontology_json, qid):
    return next((c["label"] for c in ontology_json["concepts"] if c["qid"] == qid), "")

def format_ontology_concepts(ontology_json):
    return ", ".join(c["label"] for c in ontology_json["concepts"])

def format_ontology_relations(ontology_json):
    lines = []
    for r in ontology_json["relations"]:
        dom = _concept_label(ontology_json, r["domain"])
        rng = _concept_label(ontology_json, r["range"])
        lines.append(f'- {r["label"]}({dom},{rng})')
    return "\n".join(lines)


def build_reason_then_extract_prompt(
    ontology_json,
    test_sentence,
    worked_example=None,
    allow_light_norm=True,
    relation_cues_text=None,   # optional Step 0 paraphrase cues
    strict_format=True,        # enforce exact output shape
):
    """
    Build a prompt that ends with the header `### FINAL TRIPLES` and *no trailing guidance*,
    so generations appear directly under the header in the required JSONL format.
    """

    concepts_line   = format_ontology_concepts(ontology_json)
    relations_block = format_ontology_relations(ontology_json)
    norm_note = (
        "normalize only trivial cases (e.g., Japanese→Japan)"
        if allow_light_norm else
        "copy spans verbatim; no normalization"
    )

    # Optional Step 0 cues
    step0_block = ""
    if relation_cues_text:
        step0_block = dedent(f"""
        Step 0 (Paraphrase expansion for this sentence):
        {relation_cues_text.strip()}
        """).strip()

    # Optional worked example — now rendered as JSONL beneath "Example FINAL TRIPLES"
    example_block = ""
    if worked_example:
        ex_sent = worked_example.get("example_sentence")
        ex_out  = worked_example.get("example_triples_output")
        ex_sent_json = json.dumps(ex_sent, ensure_ascii=False)  # quoted sentence
        # Convert example triples (often list[dict]) into JSONL text
        if isinstance(ex_out, list) and all(isinstance(x, dict) for x in ex_out):
            ex_out_jsonl = "\n".join(json.dumps(x, ensure_ascii=False) for x in ex_out)
        else:
            # If already a string block, keep as-is (user may have supplied JSONL text)
            ex_out_jsonl = str(ex_out).strip() if ex_out is not None else ""

        example_block = dedent(f'''
        "Example sentence": {ex_sent_json}

        "Example FINAL TRIPLES":
        {ex_out_jsonl}
        ''').strip()

    # Always include a tiny multi-line FORMAT DEMO to bias the model to output >1 line.
    # (This is format-only; not semantic guidance.)
    format_demo = dedent("""
    OUTPUT FORMAT DEMO (JSON Lines, not part of reasoning):
    {"sub":"Subject 1","rel":"<predicate label 1>","obj":"<Object 1>"}
    {"sub":"Subject 2","rel":"<predicate label 2>","obj":"<Object 2>"}
    {"sub":"Subject 3","rel":" <predicate label 3>","obj":"<Object 3>"}
    """).strip()

    # Strict output notes merged into RULES
    strict_notes = ""
    if strict_format:
        strict_notes = dedent("""
        OUTPUT SHAPE (strict):
        - After Step 2, output the header exactly:
        ### FINAL TRIPLES
        - Then output one JSON object per line (JSON Lines), each with exactly these keys:
          {"sub": "<Subject>", "rel": "<predicate label>", "obj": "<Object>"}
        - Output ALL supported facts from the TEST SENTENCE; do NOT stop after the first triple.
        - Do not output brackets, arrays, bullets, or code fences. Only raw JSON lines.
        - No extra commentary after the JSON lines. If no triples, just output the header.
        """).strip()

    rules = dedent(f"""
    RULES:
    - Prefer relation labels from ONTOLOGY RELATIONS when they match the evidence.
    - If the sentence clearly states another SPO relation that is NOT in the ontology, still include it using a concise predicate phrase derived from the trigger words in the sentence (do NOT invent facts).
    - Resolve simple coreference (e.g., "the film", "this movie", "it" → the film title in this sentence).
    - Quote/mention evidence in Step 2 for each emitted triple.
    - Base decisions ONLY on the TEST SENTENCE.
    - Avoid duplicates; {norm_note}.
    - For each distinct supported fact, write a separate JSON object on its own line.
    {strict_notes}
    """).strip()

    # IMPORTANT: end the prompt with the header and NOTHING after it.
    prompt = dedent(f"""
    TASK: Extract all SPO triples that the TEST SENTENCE clearly supports.
    - Use the ontology to guide relation labeling when possible.
    - If a valid SPO fact has no matching ontology label, still output it in the same JSONL under FINAL TRIPLES (single section, multiple lines).

    ONTOLOGY CONCEPTS:
    {concepts_line}

    ONTOLOGY RELATIONS (argument types):
    {relations_block}

    {rules}

    {step0_block}

    {example_block if example_block else ""}

    {format_demo}

    ### TEST SENTENCE
    "{test_sentence}"

    Step 1 (Entities & types):
    Step 2 (Verify relations): For each candidate triple, quote or mention the trigger phrase and spans.

    ### FINAL TRIPLES
    """).strip()

    return prompt

In [7]:
# Block 2 — Prompt Builder (Reason → Verify → FINAL TRIPLES as JSONL; domain-agnostic)

from textwrap import dedent
import json

# ---------- Helpers (strict access; fail fast if ontology keys are missing) ----------

def _concept_label(ontology_json, qid):
    return next((c["label"] for c in ontology_json["concepts"] if c["qid"] == qid), "")

def format_ontology_concepts(ontology_json):
    return ", ".join(c["label"] for c in ontology_json["concepts"])

def format_ontology_relations(ontology_json):
    lines = []
    for r in ontology_json["relations"]:
        dom = _concept_label(ontology_json, r["domain"])
        rng = _concept_label(ontology_json, r["range"])
        lines.append(f'- {r["label"]}({dom},{rng})')
    return "\n".join(lines)

def _jsonl_lines_from_list(triples):
    if isinstance(triples, list) and all(isinstance(x, dict) for x in triples):
        return "\n".join(json.dumps(x, ensure_ascii=False) for x in triples)
    return None


# ---------- Prompt Builder ----------

def build_reason_then_extract_prompt(
    ontology_json,
    test_sentence,
    worked_example=None,
    allow_light_norm=True,
    relation_cues_text=None,   # optional Step 0 paraphrase cues
    strict_format=True,        # enforce exact output shape
):
    """
    Build a prompt that ends with the header `### FINAL TRIPLES` and *no trailing guidance*,
    so generations appear directly under the header in JSON Lines (JSONL).
    Domain-agnostic: uses whatever concepts/relations are provided by `ontology_json`.
    """

    concepts_line   = format_ontology_concepts(ontology_json)
    relations_block = format_ontology_relations(ontology_json)
    norm_note = (
        "apply only light normalization (e.g., demonyms→country names; unambiguous years → 'YYYY')"
        if allow_light_norm else
        "copy spans verbatim; no normalization"
    )

    # Optional Step 0 cues
    step0_block = ""
    if relation_cues_text:
        step0_block = dedent(f"""
        Step 0 (Paraphrase expansion for this sentence):
        {relation_cues_text.strip()}
        """).strip()

    # Worked example — render as JSONL beneath "Example FINAL TRIPLES" (whatever domain the user provides)
    example_block = ""
    if worked_example:
        ex_sent = worked_example.get("example_sentence")
        ex_out  = worked_example.get("example_triples_output")
        ex_sent_json = json.dumps(ex_sent, ensure_ascii=False)

        ex_out_jsonl = _jsonl_lines_from_list(ex_out)
        if ex_out_jsonl is None:
            ex_out_jsonl = (str(ex_out).strip() if ex_out is not None else "")

        example_block = dedent(f'''
        "Example sentence": {ex_sent_json}

        "Example FINAL TRIPLES":
        {ex_out_jsonl}
        ''').strip()

    # Domain-agnostic FORMAT DEMO (no ontology-specific labels; shows out-of-ontology case too)
    format_demo = dedent("""
    OUTPUT FORMAT DEMO (JSON Lines, not part of reasoning):
    {"sub":"Entity A","rel":"<ontology relation label>","obj":"Entity B"}
    {"sub":"Entity A","rel":"<ontology relation label>","obj":"2020"}
    {"sub":"Entity A","rel":"<phrase from sentence>","obj":"Entity C"}  # out-of-ontology relation allowed
    """).strip()

    # Strict output notes merged into RULES
    strict_notes = ""
    if strict_format:
        strict_notes = dedent("""
        OUTPUT SHAPE (strict):
        - After Step 2, output the header exactly:
        ### FINAL TRIPLES
        - Then output one JSON object per line (JSON Lines), each with exactly these keys:
          {"sub":"<Subject>","rel":"<predicate label>","obj":"<Object>"}
        - Output ALL supported facts from the TEST SENTENCE; do NOT stop after the first triple.
        - Do not output brackets, arrays, bullets, or code fences. Only raw JSON lines.
        - No extra commentary after the JSON lines. If no triples, just output the header.
        """).strip()

    rules = dedent(f"""
    RULES:
    - Extract ALL SPO facts explicitly supported by the TEST SENTENCE.
    - Prefer relation labels from ONTOLOGY RELATIONS when they match the evidence and argument types.
    - If a supported SPO fact has NO matching ontology relation, STILL INCLUDE IT:
      use a concise, lowercase predicate phrase taken directly from the trigger wording in the sentence
      (e.g., "announced by", "located near", "compatible with", "launched from"). Do NOT invent facts.
    - Use the provided ONTOLOGY CONCEPTS for typing when possible; if none fits, type as "other" internally (do not print types).
    - {norm_note}; keep names/titles/IDs verbatim unless trivial normalization applies.
    - Avoid duplicates (treat case/spacing variants as identical).
    - Quote/mention evidence in Step 2 for each emitted triple (internal reasoning only; do not output the steps).
    - Base decisions ONLY on the TEST SENTENCE.
    {strict_notes}
    """).strip()

    # IMPORTANT: end the prompt with the header and NOTHING after it.
    prompt = dedent(f"""
    TASK: Extract all SPO triples that the TEST SENTENCE clearly supports.
    - Use the ontology to guide relation labeling when possible.
    - If a valid SPO fact has no matching ontology label, still output it in the same JSONL under FINAL TRIPLES (single section, multiple lines).

    ONTOLOGY CONCEPTS:
    {concepts_line}

    ONTOLOGY RELATIONS (argument types):
    {relations_block}

    {rules}

    {step0_block}

    {example_block if example_block else ""}

    {format_demo}

    ### TEST SENTENCE
    "{test_sentence}"

    Step 1 (Entities & types — internal scratchpad, do not output):
    - Identify candidate SUBJECT and OBJECT spans in the sentence.
    - When possible, map each entity to a concept from ONTOLOGY CONCEPTS (via surface form and context).
    - Keep spans verbatim except for light normalization noted above.
    - Resolve simple coreference (e.g., pronouns or descriptors → the correct entity mention).
    - Discard uncertain or implied facts.

    Step 2 (Verify relations — internal scratchpad, do not output):
    - For each (SUBJECT, OBJECT) pair, check for explicit trigger wording in the sentence.
    - If a matching ONTOLOGY RELATION label fits the wording and argument types, use that label.
    - Otherwise, if the relation is explicitly stated but not in ontology, emit a concise, lowercase phrase
      copied from the trigger wording (e.g., "compatible with", "announced by", "launched from").
    - Only include relations explicitly supported by the text; no inference.
    - One supported fact → one JSON object line in FINAL TRIPLES.

    ### FINAL TRIPLES
    """).strip()

    return prompt


In [8]:
# Block 3 — Single Inference (chat template; continuation-only)

def generate_triples_text(generator, tokenizer, prompt_text: str,
                          max_new_tokens: int = 768, temperature: float = 0.25) -> str:
    """
    Calls the model once. Returns the generated continuation.
    """
    chat = [
        {"role": "system", "content": "You are a precise information-extraction model. Follow instructions carefully."},
        {"role": "user", "content": prompt_text}
    ]
    formatted = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    out = generator(
        formatted,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=0.9,
        do_sample=True,
        return_full_text=False,
        truncation=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    return out[0]["generated_text"] if isinstance(out[0], dict) else out[0]


In [9]:
# Block 4 — Extract & Parse `FINAL TRIPLES` (JSONL-first; legacy fallback)

FINAL_HEADER_REGEX = r'^\s*#{2,}\s*FINAL\s+TRIPLES\s*$'  # matches ## or ### FINAL TRIPLES


def extract_final_triples_block(model_output_text: str) -> str:
    """
    Grab everything after the '##/### FINAL TRIPLES' header.
    """
    header = re.search(FINAL_HEADER_REGEX, model_output_text, re.IGNORECASE | re.MULTILINE)
    if header:
        start = header.end()
        tail = model_output_text[start:].strip()
        nxt = re.search(r'^\s*#{2,}\s+[A-Z].*$', tail, re.MULTILINE)
        return tail[:nxt.start()].strip() if nxt else tail

    # fallback: take everything and let the parser fish valid lines out
    return model_output_text


def _parse_triples_block_jsonl(block_text: str):
    """
    NEW: Parse JSONL lines: each line is a JSON object with keys {sub, rel, obj}.
    Ignore invalid lines. Return list[dict].
    """
    triples = []
    for raw in block_text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        # tolerate bullets before JSON (robustness)
        if line[:1] in "-*•":
            line = line[1:].strip()
        try:
            obj = json.loads(line)
        except Exception:
            continue
        if not isinstance(obj, dict):
            continue
        sub = obj.get("sub")
        rel = obj.get("rel")
        ob  = obj.get("obj")
        if isinstance(sub, str) and isinstance(rel, str) and isinstance(ob, (str, int, float)):
            # stringify obj to keep consistent typing downstream
            triples.append({"sub": sub, "rel": rel, "obj": str(ob)})
    return triples


def _parse_triples_block_legacy(block_text: str):
    """
    Legacy fallback: parse lines like predicate("Subject","Object").
    """
    triples = []

    line_re = re.compile(r"""
    ^\s*(?:[-*•]\s*)?
    (?P<rel>[A-Za-z][A-Za-z0-9_ ]*?)
    \s*\(\s*
        (?:
            ["'](?P<sub_q>[^"']+?)["']
            | (?P<sub_u>[^,)\n]+?)
        )
    \s*,\s*
        (?:
            ["'](?P<obj_q>[^"']+?)["']
            | (?P<obj_u>[^)\n]+?)
        )
    \s*\)\s*\.?\s*$
    """, re.VERBOSE)

    for raw in block_text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        m = line_re.match(line)
        if not m:
            continue

        rel = m.group("rel").strip()
        sub = (m.group("sub_q") or m.group("sub_u") or "").strip()
        obj = (m.group("obj_q") or m.group("obj_u") or "").strip()
        if not rel or not sub or not obj:
            continue
        triples.append({"sub": sub, "rel": rel, "obj": obj})
    return triples


def parse_triples_block(block_text: str):
    """
    Main entry: prefer JSONL (new format). If none parsed, fall back to legacy.
    Returns list[dict] with keys "sub","rel","obj" (all strings).
    """
    triples = _parse_triples_block_jsonl(block_text)
    if triples:
        return triples
    return _parse_triples_block_legacy(block_text)


In [12]:
# Block 6 — Orchestrator (loop inputs → Steps 1–4 → output)

def run_pipeline(
    input_jsonl_path: str,
    ontology_json_path: str,
    output_jsonl_path: str,
    max_items: int = 4,
    max_new_tokens: int = 768,
    temperature: float = 0.25,
    verbose: bool = True,
    few_shot_jsonl_path: str | None = None,
):
    """
    Runs the extraction pipeline.
    Required: input_jsonl_path, ontology_json_path, output_jsonl_path.
    Optional: max_items, max_new_tokens, temperature, verbose, few_shot_jsonl_path.
    """
    # Load ontology once
    with open(ontology_json_path, "r", encoding="utf-8") as f:
        ontology = json.load(f)

    # Load one-shot examples (if provided)
    if few_shot_jsonl_path:
        one_shot_by_id = load_one_shot_examples(few_shot_jsonl_path)
    else:
        one_shot_by_id = {}
        print("[FEW-SHOT] No few_shot_jsonl_path provided — proceeding without examples.")

    # Setup model once
    generator, tokenizer = setup_model("mistralai/Mistral-7B-Instruct-v0.3")

    out_records = []
    t0 = time.time()

    # Iterate inputs
    for idx, item in enumerate(read_jsonl(input_jsonl_path, max_items=max_items), start=1):
        sent_id = item.get("id")
        sent    = item.get("sent", "")

        worked_example = one_shot_by_id.get(sent_id)

        # Build prompt (ends at ### FINAL TRIPLES; expects JSONL after)
        prompt_text = build_reason_then_extract_prompt(
            ontology_json=ontology,
            test_sentence=sent,
            worked_example=worked_example,
            allow_light_norm=True,
            relation_cues_text=None
        )

        if verbose:
            print("\n" + "="*80)
            print(f"[{idx}] ID={sent_id}")
            print("prompt_text:\n", prompt_text)
            pass

        # Generate
        t_gen0 = time.time()
        model_output = generate_triples_text(
            generator, tokenizer, prompt_text,
            max_new_tokens=max_new_tokens, temperature=temperature
        )
        gen_time = time.time() - t_gen0
        if verbose:
            print("[GEN OUTPUT]\n", model_output)
            print(f"[GEN TIME] {gen_time:.2f}s")
            pass

        # Extract & Parse (JSONL-first, legacy fallback)
        final_block = extract_final_triples_block(model_output)
        triples = parse_triples_block(final_block)

        out_records.append({"id": sent_id, "sentence": sent, "triples": triples})

    # Write output
    write_jsonl(output_jsonl_path, out_records)
    print(f"\n✅ Done. Wrote {len(out_records)} lines to: {output_jsonl_path} | Total time: {time.time()-t0:.1f}s")

    return out_records


In [14]:
# Block 7 — Example Run (paths unchanged)

INPUT_JSONL    = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl"
ONTOLOGY_JSON  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json"
OUTPUT_JSONL   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_1_movie_output_test.jsonl"
FEW_SHOT_JSONL = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/wikidata/ont_1_movie_few_shot.jsonl"
MAX_ITEMS = 4  # or None

out = run_pipeline(
    input_jsonl_path=INPUT_JSONL,
    ontology_json_path=ONTOLOGY_JSON,
    output_jsonl_path=OUTPUT_JSONL,
    max_items=MAX_ITEMS,
    max_new_tokens=768,
    temperature=0.25,
    verbose=True,
    few_shot_jsonl_path=FEW_SHOT_JSONL,
)


[FEW-SHOT] Loaded 840 examples from: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/wikidata/ont_1_movie_few_shot.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0



[1] ID=ont_1_movie_test_1
prompt_text:
 TASK: Extract all SPO triples that the TEST SENTENCE clearly supports.
    - Use the ontology to guide relation labeling when possible.
    - If a valid SPO fact has no matching ontology label, still output it in the same JSONL under FINAL TRIPLES (single section, multiple lines).

    ONTOLOGY CONCEPTS:
    human, city, country, film, film genre, genre, film production company, film award, award, written work, film character, film organization

    ONTOLOGY RELATIONS (argument types):
    - director(film,human)
- screenwriter(film,human)
- genre(film,genre)
- based on(film,written work)
- cast member(film,human)
- award received(film,award)
- production company(film,film production company)
- country of origin(film,country)
- publication date(film,)
- characters(film,film character)
- narrative location(film,city)
- filming location(film,city)
- main subject(film,)
- nominated for(film,award)
- cost(film,)

    RULES:
    - Extract ALL SPO fact

In [15]:
import os, re

# ---- Fixed base paths (unchanged) ----
BASE_INPUT = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/"
BASE_ONTO  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/"
BASE_OUT   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/"
BASE_FEW   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/"

# ---- The 19 filenames from your screenshot ----
FILENAMES = [
    "ont_12_monument_test.jsonl",
    "ont_1_university_test.jsonl",
    "ont_10_comicscharacter_test.jsonl",
    "ont_11_meanoftransportation_test.jsonl",
    "ont_13_food_test.jsonl",
    "ont_14_writtenwork_test.jsonl",
    "ont_15_sportsteam_test.jsonl",
    "ont_16_city_test.jsonl",
    "ont_17_artist_test.jsonl",
    "ont_18_scientist_test.jsonl",
    "ont_19_film_test.jsonl",
    "ont_2_musicalwork_test.jsonl",
    "ont_3_airport_test.jsonl",
    "ont_4_building_test.jsonl",
    "ont_5_athlete_test.jsonl",
    "ont_6_politician_test.jsonl",
    "ont_7_company_test.jsonl",
    "ont_8_celestialbody_test.jsonl",
    "ont_9_astronaut_test.jsonl",
]

# ont_{index}_{category}_test.jsonl  ->  {index}_{category}_ontology.json, ont_{index}_{category}_few_shot.jsonl, ont_{index}_{category}_output.jsonl
PATTERN = re.compile(r"^ont_(\d+)_([a-z]+)_test\.jsonl$")

def make_paths(filename: str):
    m = PATTERN.match(filename)
    if not m:
        raise ValueError(f"Unexpected filename format: {filename}")
    idx, cat = m.groups()

    input_jsonl = os.path.join(BASE_INPUT, filename)
    ontology_json = os.path.join(BASE_ONTO, f"{idx}_{cat}_ontology.json")
    few_shot_jsonl = os.path.join(BASE_FEW, f"ont_{idx}_{cat}_few_shot.jsonl")

    # ont_{idx}_{cat}_test.jsonl -> ont_{idx}_{cat}_output.jsonl
    out_name = filename.replace("_test.jsonl", "_output.jsonl")
    output_jsonl = os.path.join(BASE_OUT, out_name)

    return input_jsonl, ontology_json, few_shot_jsonl, output_jsonl, f"ont_{idx}_{cat}"

# ---- Run all files ----
for fname in FILENAMES:
    try:
        INPUT_JSONL, ONTOLOGY_JSON, FEW_SHOT_JSONL, OUTPUT_JSONL, tag = make_paths(fname)

        print("\n" + "="*80)
        print(f"[RUN] {tag}")
        print("INPUT :", INPUT_JSONL)
        print("ONTO  :", ONTOLOGY_JSON)
        print("FEWS  :", FEW_SHOT_JSONL)
        print("OUTPUT:", OUTPUT_JSONL)

        out = run_pipeline(
            input_jsonl_path=INPUT_JSONL,
            ontology_json_path=ONTOLOGY_JSON,
            output_jsonl_path=OUTPUT_JSONL,
            max_items=None,          # keep as-is; change if you want to limit
            max_new_tokens=768,
            temperature=0.25,
            verbose=True,
            few_shot_jsonl_path=FEW_SHOT_JSONL,
        )

        print(f"[DONE] {tag}")
    except Exception as e:
        print(f"[ERROR] {fname}: {e}")



[RUN] ont_12_monument
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/ont_12_monument_test.jsonl
ONTO  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/12_monument_ontology.json
FEWS  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_12_monument_few_shot.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_12_monument_output.jsonl
[FEW-SHOT] Loaded 19 examples from: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_12_monument_few_shot.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



✅ Done. Wrote 19 lines to: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_12_monument_output.jsonl | Total time: 116.7s
[DONE] ont_12_monument

[RUN] ont_1_university
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/ont_1_university_test.jsonl
ONTO  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/1_university_ontology.json
FEWS  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_1_university_few_shot.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_1_university_output.jsonl
[FEW-SHOT] Loaded 71 examples from: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_1_university_few_shot.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0



✅ Done. Wrote 71 lines to: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_1_university_output.jsonl | Total time: 512.5s
[DONE] ont_1_university

[RUN] ont_10_comicscharacter
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/ont_10_comicscharacter_test.jsonl
ONTO  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/10_comicscharacter_ontology.json
FEWS  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_10_comicscharacter_few_shot.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_10_comicscharacter_output.jsonl
[FEW-SHOT] Loaded 36 examples from: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_10_comicscharacter_few_shot.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0



✅ Done. Wrote 36 lines to: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_10_comicscharacter_output.jsonl | Total time: 134.8s
[DONE] ont_10_comicscharacter

[RUN] ont_11_meanoftransportation
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/ont_11_meanoftransportation_test.jsonl
ONTO  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/11_meanoftransportation_ontology.json
FEWS  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_11_meanoftransportation_few_shot.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_11_meanoftransportation_output.jsonl
[FEW-SHOT] Loaded 92 examples from: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_11_meanoftransportation_few_shot.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0



✅ Done. Wrote 92 lines to: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_11_meanoftransportation_output.jsonl | Total time: 407.5s
[DONE] ont_11_meanoftransportation

[RUN] ont_13_food
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/ont_13_food_test.jsonl
ONTO  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/13_food_ontology.json
FEWS  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_13_food_few_shot.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_13_food_output.jsonl
[FEW-SHOT] Loaded 153 examples from: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_13_food_few_shot.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


KeyboardInterrupt: 

In [None]:
import os, re

# ---- Fixed base paths (unchanged) ----
BASE_INPUT = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/"
BASE_ONTO  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/"
BASE_OUT   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/"
BASE_FEW   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/wikidata/"

# ---- The 19 filenames from your screenshot ----
FILENAMES = [
    "ont_1_movie_test.jsonl",
    "ont_2_music_test.jsonl",
    "ont_3_sport_test.jsonl",
    "ont_4_book_test.jsonl",
    "ont_5_military_test.jsonl",
    "ont_6_computer_test.jsonl",
    "ont_7_space_test.jsonl",
    "ont_8_politics_test.jsonl",
    "ont_9_nature_test.jsonl",
    "ont_10_culture_test.jsonl",
]

# ont_{index}_{category}_test.jsonl  ->  {index}_{category}_ontology.json, ont_{index}_{category}_few_shot.jsonl, ont_{index}_{category}_output.jsonl
PATTERN = re.compile(r"^ont_(\d+)_([a-z]+)_test\.jsonl$")

def make_paths(filename: str):
    m = PATTERN.match(filename)
    if not m:
        raise ValueError(f"Unexpected filename format: {filename}")
    idx, cat = m.groups()

    input_jsonl = os.path.join(BASE_INPUT, filename)
    ontology_json = os.path.join(BASE_ONTO, f"{idx}_{cat}_ontology.json")
    few_shot_jsonl = os.path.join(BASE_FEW, f"ont_{idx}_{cat}_few_shot.jsonl")

    # ont_{idx}_{cat}_test.jsonl -> ont_{idx}_{cat}_output.jsonl
    out_name = filename.replace("_test.jsonl", "_output.jsonl")
    output_jsonl = os.path.join(BASE_OUT, out_name)

    return input_jsonl, ontology_json, few_shot_jsonl, output_jsonl, f"ont_{idx}_{cat}"

# ---- Run all files ----
for fname in FILENAMES:
    try:
        INPUT_JSONL, ONTOLOGY_JSON, FEW_SHOT_JSONL, OUTPUT_JSONL, tag = make_paths(fname)

        print("\n" + "="*80)
        print(f"[RUN] {tag}")
        print("INPUT :", INPUT_JSONL)
        print("ONTO  :", ONTOLOGY_JSON)
        print("FEWS  :", FEW_SHOT_JSONL)
        print("OUTPUT:", OUTPUT_JSONL)

        out = run_pipeline(
            input_jsonl_path=INPUT_JSONL,
            ontology_json_path=ONTOLOGY_JSON,
            output_jsonl_path=OUTPUT_JSONL,
            max_items=None,          # keep as-is; change if you want to limit
            max_new_tokens=768,
            temperature=0.25,
            verbose=True,
            few_shot_jsonl_path=FEW_SHOT_JSONL,
        )

        print(f"[DONE] {tag}")
    except Exception as e:
        print(f"[ERROR] {fname}: {e}")



[RUN] ont_1_movie
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl
ONTO  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json
FEWS  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/wikidata/ont_1_movie_few_shot.jsonl
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_1_movie_output.jsonl
[FEW-SHOT] Loaded 840 examples from: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/wikidata/ont_1_movie_few_shot.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0



[1] ID=ont_1_movie_test_1
prompt_text:
 TASK: Extract all SPO triples that the TEST SENTENCE clearly supports.
    - Use the ontology to guide relation labeling when possible.
    - If a valid SPO fact has no matching ontology label, still output it in the same JSONL under FINAL TRIPLES (single section, multiple lines).

    ONTOLOGY CONCEPTS:
    human, city, country, film, film genre, genre, film production company, film award, award, written work, film character, film organization

    ONTOLOGY RELATIONS (argument types):
    - director(film,human)
- screenwriter(film,human)
- genre(film,genre)
- based on(film,written work)
- cast member(film,human)
- award received(film,award)
- production company(film,film production company)
- country of origin(film,country)
- publication date(film,)
- characters(film,film character)
- narrative location(film,city)
- filming location(film,city)
- main subject(film,)
- nominated for(film,award)
- cost(film,)

    RULES:
    - Prefer relation labe