In [6]:
# Show GPU status (driver, memory usage, utilisation) before the model is loaded.
!nvidia-smi

Fri Oct 10 10:57:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:81:00.0 Off |                  N/A |
|  0%   41C    P8             32W /  370W |       4MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [7]:
# NOTE(review): one-off cleanup with a hard-coded PID. The recorded output reports
# "No such process", so this cell is stale and will not be meaningful on a fresh run.
!kill 824205

/bin/bash: line 1: kill: (824205) - No such process


In [8]:
import sys, torch

# Environment sanity check: which interpreter is running and whether CUDA is visible.
for label, value in (
    ("Python:", sys.version),
    ("Executable:", sys.executable),
    ("CUDA available:", torch.cuda.is_available()),
):
    print(label, value)


Python: 3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:45:41) [GCC 13.3.0]
Executable: /upb/users/b/balram/profiles/unix/cs/.conda/envs/kg_pipeline/bin/python3
CUDA available: True


In [9]:
# Block 0 — Imports & Config

import os, json, re, time
import torch
from textwrap import dedent
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


In [10]:
def setup_model(model_id="mistralai/Mistral-7B-Instruct-v0.3"):
    """
    Load a causal LM and its tokenizer and wrap them in a text-generation pipeline.

    Args:
        model_id: Hugging Face model identifier passed to `from_pretrained`.

    Returns:
        (generator, tokenizer): the `pipeline("text-generation", ...)` object and
        the tokenizer it was built with.
    """
    print("⏳ Loading model:", model_id)
    torch.backends.cudnn.benchmark = True

    # Half precision weights, with device placement delegated to `device_map="auto"`.
    load_kwargs = {"device_map": "auto", "torch_dtype": torch.float16}

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    model.config.use_cache = True  # KV caching is fine; doesn't cause "same output"

    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )
    return generator, tokenizer


In [11]:
# ---------- Utilities ----------

def _quote(v: object) -> str:
    """
    Convert any value to a safe, double-quoted string for the triple format.
    Escapes backslashes and double quotes.
    """
    s = str(v)
    s = s.replace("\\", "\\\\").replace('"', '\\"')
    return f'"{s}"'


def _coerce_triples_to_str(triples_val):
    """
    Normalize triples into newline-separated lines: rel("Subject","Object")

    Accepts:
      - str: a preformatted block
      - list[str]: each a preformatted line
      - list[dict]: each with keys {sub, rel, obj}
    """
    if triples_val is None:
        return ""

    # Already a full block string
    if isinstance(triples_val, str):
        return triples_val.strip()

    # List of preformatted strings
    if isinstance(triples_val, list) and all(isinstance(x, str) for x in triples_val):
        return "\n".join(x.strip() for x in triples_val if x and x.strip())

    # List of dicts -> format to rel("sub","obj")
    if isinstance(triples_val, list) and all(isinstance(x, dict) for x in triples_val):
        lines = []
        for t in triples_val:
            rel = (t.get("rel") or "").strip()
            sub = t.get("sub")
            obj = t.get("obj")
            if rel and (sub is not None) and (obj is not None):
                lines.append(f'{rel}({_quote(sub)},{_quote(obj)})')
        return "\n".join(lines)

    # Fallback
    return str(triples_val).strip()


# ---------- Few-shot loader for FIXED schema ----------

def load_one_shot_examples(few_shot_jsonl_path: str) -> dict:
    """
    Read the few-shot JSONL file and index its records by id.

    Each record contributes an entry
        {"example_sentence": <value>, "example_triples_output": <value>}
    keyed by `str(id).strip()`. The sentence is taken from the field
    "Example sentence"; the triples from "Example triples output", or from
    "Example triples" when the former key is absent. Values are stored as
    found (no validation or coercion). Records without an "id" are skipped.
    """
    examples_by_id = {}
    for record in read_jsonl(few_shot_jsonl_path, max_items=None):
        record_id = record.get("id")
        if record_id is not None:  # never use None as a dictionary key
            triples = record.get("Example triples output", record.get("Example triples"))
            examples_by_id[str(record_id).strip()] = {
                "example_sentence": record.get("Example sentence"),
                "example_triples_output": triples,
            }
    print(f"[FEW-SHOT] Loaded {len(examples_by_id)} examples from: {few_shot_jsonl_path}")
    return examples_by_id



In [12]:
# Block 2 — Prompt Builder (Reason → Verify → Final Triples)

from textwrap import dedent
import json

# ---------- Block 2 — Prompt Builder (Reason → Verify → Final Triples) ----------

# Helpers (strict access; fail fast if ontology keys are missing)
def _concept_label(ontology_json, qid):
    return next((c["label"] for c in ontology_json["concepts"] if c["qid"] == qid), "")

def format_ontology_concepts(ontology_json):
    """Return all concept labels of the ontology as one comma-separated line."""
    labels = [concept["label"] for concept in ontology_json["concepts"]]
    return ", ".join(labels)

def format_ontology_relations(ontology_json):
    """
    Return one line per ontology relation in the form `- label(DomainLabel,RangeLabel)`.

    Domain and range ids are resolved to concept labels with `_concept_label`;
    an id with no matching concept is rendered as an empty string.
    """
    def _render(relation):
        domain_label = _concept_label(ontology_json, relation["domain"])
        range_label = _concept_label(ontology_json, relation["range"])
        return f'- {relation["label"]}({domain_label},{range_label})'

    return "\n".join(_render(relation) for relation in ontology_json["relations"])


def build_reason_then_extract_prompt(
    ontology_json,
    test_sentence,
    worked_example=None,
    allow_light_norm=True,
    relation_cues_text=None,   # optional Step 0 paraphrase cues
    strict_format=True,        # enforce exact output shape
):
    """
    Build the extraction prompt. It ends with the header `### FINAL TRIPLES` and
    nothing after it, matching the downstream extractor that searches for
    `##/### FINAL TRIPLES`.

    The prompt is assembled from flush-left sections joined by blank lines.
    (Calling `dedent()` on an f-string *after* multi-line values were interpolated
    is a no-op, because the interpolated continuation lines start at column 0;
    building the sections explicitly avoids the resulting mixed indentation.)

    Args:
        ontology_json: ontology dict with "concepts" and "relations" entries, as
            consumed by `format_ontology_concepts` / `format_ontology_relations`.
        test_sentence: sentence to extract triples from; inserted between double quotes.
        worked_example: optional dict read via the keys "example_sentence" and
            "example_triples_output"; both values are serialized with `json.dumps`.
        allow_light_norm: if True the prompt allows trivial normalization
            (e.g., Japanese→Japan); if False it asks for verbatim spans in both
            the TASK section and the RULES section.
        relation_cues_text: optional text for a "Step 0" paraphrase section;
            ignored when empty or whitespace-only.
        strict_format: if True, append the strict OUTPUT SHAPE notes to the rules.

    Returns:
        The prompt as a single string.
    """
    concepts_line = format_ontology_concepts(ontology_json)
    relations_block = format_ontology_relations(ontology_json)

    # Keep the TASK note and the RULES line consistent with each other.
    if allow_light_norm:
        norm_note = "normalize only trivial cases (e.g., Japanese→Japan)"
        dedup_rule = "- Avoid duplicates; apply light normalization only (e.g., Japanese→Japan)."
    else:
        norm_note = "copy spans verbatim; no normalization"
        dedup_rule = "- Avoid duplicates; copy spans verbatim (no normalization)."

    task_lines = [
        "TASK: Extract all SPO triples that the TEST SENTENCE clearly supports.",
        "- Use the ontology to guide relation labeling when possible.",
        f"- {norm_note}.",
        "- If a valid SPO fact has no matching ontology label, still output it (single FINAL TRIPLES list).",
    ]

    rules_lines = [
        "RULES:",
        "- Prefer relation labels from ONTOLOGY RELATIONS when they match the evidence.",
        "- If the sentence clearly states another SPO relation that is NOT in the ontology, "
        "still include it using a concise predicate phrase derived from the trigger words "
        "in the sentence (do NOT invent facts).",
        '- Resolve simple coreference (e.g., "the film", "this movie", "it" → the film title in this sentence).',
        "- Quote/mention evidence in Step 2 for each emitted triple.",
        '- Output one triple per line, format: predicate("Subject","Object").',
        "- Base decisions ONLY on the TEST SENTENCE.",
        dedup_rule,
    ]
    if strict_format:
        # Strict output notes are part of RULES so that no guidance trails the final header.
        rules_lines += [
            "OUTPUT SHAPE (strict):",
            "- After Step 2, output the header exactly:",
            "### FINAL TRIPLES",
            '- Then output one triple per line as: predicate("Subject","Object")',
            "- No extra commentary after the triples. If no triples, just output the header.",
            "- Do NOT output JSON, code fences, or any other sections.",
        ]

    sections = [
        "\n".join(task_lines),
        f"ONTOLOGY CONCEPTS:\n{concepts_line}",
        f"ONTOLOGY RELATIONS (argument types):\n{relations_block}",
        "\n".join(rules_lines),
    ]

    # Optional Step 0 cues
    cues = relation_cues_text.strip() if relation_cues_text else ""
    if cues:
        sections.append(f"Step 0 (Paraphrase expansion for this sentence):\n{cues}")

    # Optional worked example
    if worked_example:
        ex_sent = worked_example.get("example_sentence")
        ex_out = worked_example.get("example_triples_output")
        # json.dumps preserves quotes/escapes in the sentence and serializes
        # lists/dicts as-is for the output block.
        # NOTE(review): the example output is shown as JSON while the strict notes
        # say "Do NOT output JSON" — consider rendering it as predicate("S","O") lines.
        ex_sent_json = json.dumps(ex_sent, ensure_ascii=False)
        ex_out_json = json.dumps(ex_out, ensure_ascii=False)
        sections.append(
            f'"Example sentence": {ex_sent_json}\n\n"Example FINAL TRIPLES": {ex_out_json}'
        )

    sections.append(f'### TEST SENTENCE\n"{test_sentence}"')
    sections.append(
        "Step 1 (Entities & types):\n"
        "Step 2 (Verify relations): For each candidate triple, quote or mention the trigger phrase and spans."
    )

    # IMPORTANT: end the prompt with the header and NOTHING after it.
    sections.append("### FINAL TRIPLES")

    return "\n\n".join(sections)


In [13]:
# Block 3 — Single Inference (chat template; continuation-only)

def generate_triples_text(generator, tokenizer, prompt_text: str,
                          max_new_tokens: int = 768, temperature: float = 0.3) -> str:
    """
    Call the model once and return only the generated continuation
    (reasoning + FINAL TRIPLES), without the prompt.

    Args:
        generator: callable text-generation pipeline; called as
            `generator(text, **kwargs)` and expected to return a list whose first
            item is either a dict with "generated_text" or the text itself.
        tokenizer: tokenizer providing `apply_chat_template` and `eos_token_id`.
        prompt_text: user prompt to wrap in the chat template.
        max_new_tokens: generation length limit.
        temperature: sampling temperature. Values <= 0 (or None) switch to
            greedy decoding instead of sampling.

    Returns:
        The generated text of the first returned sequence.
    """
    chat = [
        {"role": "system", "content": "You are a precise information-extraction model. Follow instructions carefully."},
        {"role": "user", "content": prompt_text}
    ]
    formatted = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        truncation=False,
        # The chat template already rendered the special tokens into `formatted`;
        # tokenizing it again with special tokens enabled would duplicate them (e.g. BOS).
        add_special_tokens=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        # Sampling requires a strictly positive temperature; fall back to greedy decoding.
        gen_kwargs["do_sample"] = False

    out = generator(formatted, **gen_kwargs)
    first = out[0]
    return first["generated_text"] if isinstance(first, dict) else first


In [14]:
# Block 4 — Extract & Parse `FINAL TRIPLES`

FINAL_HEADER_REGEX = r'^\s*#{2,}\s*FINAL\s+TRIPLES\s*$'  # matches ## or ### FINAL TRIPLES

def extract_final_triples_block(model_output_text: str) -> str:
    """
    Return the text that follows the first `## FINAL TRIPLES` / `### FINAL TRIPLES`
    header (case-insensitive), cut off at the next markdown header whose title
    starts with an uppercase letter. Surrounding whitespace is stripped.

    When no such header is present, the input is returned unchanged so the
    parser can still pick valid triple lines out of it.
    """
    header_match = re.search(
        FINAL_HEADER_REGEX, model_output_text, flags=re.IGNORECASE | re.MULTILINE
    )
    if header_match is None:
        # fallback: hand over everything
        return model_output_text

    remainder = model_output_text[header_match.end():].strip()
    next_header = re.search(r'^\s*#{2,}\s+[A-Z].*$', remainder, flags=re.MULTILINE)
    if next_header is None:
        return remainder
    return remainder[:next_header.start()].strip()

# One triple per line: optional bullet/number, predicate, then two arguments that
# are each double-quoted, single-quoted, or bare.
_TRIPLE_LINE_RE = re.compile(r"""
    ^\s*(?:(?:[-*•]|\d+[.)])\s*)?          # optional markdown bullet or list number
    (?P<rel>[A-Za-z][A-Za-z0-9_ ]*?)       # predicate name
    \s*\(\s*
    (?:                                    # SUBJECT
        "(?P<sub_dq>(?:[^"\\\n]|\\.)*)"    #   double-quoted (may contain ' and backslash escapes)
      | '(?P<sub_sq>(?:[^'\\\n]|\\.)*)'    #   single-quoted (may contain ")
      | (?P<sub_u>[^,)\n]+?)               #   bare
    )
    \s*,\s*
    (?:                                    # OBJECT
        "(?P<obj_dq>(?:[^"\\\n]|\\.)*)"    #   double-quoted
      | '(?P<obj_sq>(?:[^'\\\n]|\\.)*)'    #   single-quoted
      | (?P<obj_u>[^)\n]+?)                #   bare
    )
    \s*\)\s*\.?\s*$                        # optional trailing period
""", re.VERBOSE)

# Undo the escapes written by the triple format: \" \' and \\ .
_TRIPLE_UNESCAPE_RE = re.compile(r"""\\(["'\\])""")


def _triple_arg(match, name):
    """Return the cleaned value of argument `name` ("sub" or "obj") from a line match."""
    for key in (name + "_dq", name + "_sq"):
        quoted = match.group(key)
        if quoted is not None:
            return _TRIPLE_UNESCAPE_RE.sub(r"\1", quoted).strip()

    bare = (match.group(name + "_u") or "").strip()
    # A value that only matched the bare branch may still carry its outer quotes
    # (e.g. 'St. John's' or mismatched quote characters); drop that outer pair.
    if len(bare) >= 2 and bare[0] in "\"'" and bare[-1] in "\"'":
        bare = bare[1:-1].strip()
    return bare


def parse_triples_block(block_text: str):
    """
    Parse lines of the form:
        predicate("Subject","Object")
        predicate('Subject','Object')
        predicate(Subject,Object)
        predicate(Subject, 250)
    optionally preceded by a markdown bullet (-, *, •) or a list number (1. / 1)).

    Quoted arguments may contain the other quote character (e.g. "St. John's College")
    and the escapes \\" \\' \\\\ , which are unescaped in the result.
    Keeps both ontology and non-ontology predicates (open-set allowed).
    Blank lines, lines starting with "#", and lines that do not match are skipped,
    as are matches with an empty predicate, subject or object.

    Returns a list of dicts: {"sub": ..., "rel": ..., "obj": ...}
    """
    triples = []

    for raw in block_text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue

        m = _TRIPLE_LINE_RE.match(line)
        if not m:
            # skip non-matching lines quietly
            continue

        rel = m.group("rel").strip()
        sub = _triple_arg(m, "sub")
        obj = _triple_arg(m, "obj")

        # guard against empties
        if not rel or not sub or not obj:
            continue

        triples.append({"sub": sub, "rel": rel, "obj": obj})

    return triples


In [15]:
# Block 5 — JSONL I/O Helpers

def read_jsonl(path, max_items: int | None = None):
    """
    Yields JSON objects from a .jsonl file.
    If max_items is set, stops after that many records (for debugging).
    """
    count = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            yield json.loads(line)
            count += 1
            if max_items is not None and count >= max_items:
                break

def write_jsonl(path, records):
    """
    Write `records` to `path` as JSON Lines (UTF-8, one JSON object per line,
    non-ASCII characters kept as-is). An existing file is overwritten.

    Missing parent directories are created first, so a long run does not fail
    at the final write just because the output folder does not exist yet.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")


In [16]:
# Block 6 — Orchestrator (loop inputs → Steps 1–4 → output)

def run_pipeline(
    input_jsonl_path: str,
    ontology_json_path: str,
    output_jsonl_path: str,
    max_items: int | None = 4,
    max_new_tokens: int = 768,
    temperature: float = 0.25,
    verbose: bool = True,
    few_shot_jsonl_path: str | None = None,   # pass as keyword in the call below
    model_id: str = "mistralai/Mistral-7B-Instruct-v0.3",
):
    """
    Run the extraction pipeline: for every input sentence build the prompt,
    generate once, extract the FINAL TRIPLES block, parse it, and finally write
    all results to `output_jsonl_path`.

    Args:
        input_jsonl_path: JSONL file whose records are read via the keys "id" and "sent".
        ontology_json_path: ontology JSON file passed to the prompt builder.
        output_jsonl_path: JSONL file to write; one {"id", "triples"} record per input.
        max_items: maximum number of input records to process (None = all).
        max_new_tokens: generation length limit per sentence.
        temperature: sampling temperature per generation.
        verbose: if True, print one progress line per processed sentence.
        few_shot_jsonl_path: optional few-shot file; an example is attached to a
            sentence when both share the same id.
        model_id: Hugging Face model identifier handed to `setup_model`.

    Returns:
        The list of output records (same content as written to disk).
    """
    # Load ontology once
    with open(ontology_json_path, "r", encoding="utf-8") as f:
        ontology = json.load(f)

    # Load one-shot examples (if provided)
    if few_shot_jsonl_path:
        one_shot_by_id = load_one_shot_examples(few_shot_jsonl_path)
    else:
        one_shot_by_id = {}
        print("[FEW-SHOT] No few_shot_jsonl_path provided — proceeding without examples.")

    # Setup model once
    generator, tokenizer = setup_model(model_id)

    out_records = []
    t0 = time.time()

    for idx, item in enumerate(read_jsonl(input_jsonl_path, max_items=max_items), start=1):
        sent_id = item.get("id")
        sent = item.get("sent", "")

        # Few-shot keys are stored as str(id).strip(); normalize the lookup key the
        # same way so numeric or padded ids still find their example.
        lookup_key = str(sent_id).strip() if sent_id is not None else None
        worked_example = one_shot_by_id.get(lookup_key)

        prompt_text = build_reason_then_extract_prompt(
            ontology_json=ontology,
            test_sentence=sent,
            worked_example=worked_example,
            allow_light_norm=True,
            relation_cues_text=None
        )

        # Generate
        t_gen0 = time.time()
        model_output = generate_triples_text(
            generator, tokenizer, prompt_text,
            max_new_tokens=max_new_tokens, temperature=temperature
        )
        gen_time = time.time() - t_gen0

        # Extract & Parse
        final_block = extract_final_triples_block(model_output)
        triples = parse_triples_block(final_block)

        out_records.append({"id": sent_id, "triples": triples})

        if verbose:
            example_flag = "yes" if worked_example else "no"
            print(f"[{idx}] id={sent_id} | example={example_flag} | "
                  f"triples={len(triples)} | gen={gen_time:.1f}s")

    # Write output
    write_jsonl(output_jsonl_path, out_records)
    print(f"\n✅ Done. Wrote {len(out_records)} lines to: {output_jsonl_path} | Total time: {time.time()-t0:.1f}s")

    return out_records


In [17]:
# Block 7 — Example Run (edit these paths)

INPUT_JSONL   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/ont_1_university_test.jsonl"     # input sentences as JSONL (records are read via "id" and "sent")
ONTOLOGY_JSON = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/1_university_ontology.json"         # ontology JSON ("concepts" and "relations")
OUTPUT_JSONL  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_1_university.jsonl"             # where to write results
FEW_SHOT_JSONL = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_1_university_few_shot.jsonl"  # one-shot examples, matched to inputs by "id"
MAX_ITEMS =None   # None = process the whole file; set a small integer (e.g. 20) for a quick debugging run

# NOTE(review): these are absolute, user-specific paths — edit them before running elsewhere.
# The pipeline itself is launched in the next cell.


In [None]:
# Run the pipeline with the paths and MAX_ITEMS configured in the previous cell.
# `out` receives the list of records returned by run_pipeline.
out = run_pipeline(
    input_jsonl_path=INPUT_JSONL,
    ontology_json_path=ONTOLOGY_JSON,
    output_jsonl_path=OUTPUT_JSONL,
    max_items=MAX_ITEMS,
    max_new_tokens=768,
    temperature=0.25,
    verbose=True,
    few_shot_jsonl_path=FEW_SHOT_JSONL,  # <-- pass by keyword
)


[FEW-SHOT] Loaded 71 examples from: /upb/users/b/balram/profiles/unix/cs/promptKG/data/fewshots_example/dbpedia/ont_1_university_few_shot.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
