In [1]:
!nvidia-smi

Wed Oct 29 20:04:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:81:00.0 Off |                  N/A |
| 87%   74C    P0            308W /  370W |   14658MiB /  24576MiB |     53%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!kill 1023795

/bin/bash: line 1: kill: (1023795) - No such process


In [4]:
!hostname
!which python
import torch
print("CUDA available:", torch.cuda.is_available())

limbo
/opt/miniforge3/envs/jupyterhub/bin/python
CUDA available: True


In [5]:
########################################
# BLOCK 1: IMPORTS / GLOBAL CONFIG / MODEL SETUP
########################################

import os
import re
import json
from textwrap import dedent
from typing import Dict, Any, List, Tuple, Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# -------- Runtime config (edit these before running the Block 9 manual test) --------
# ONTOLOGY_JSON / INPUT_JSONL / OUTPUT_JSONL are the paths for the single-file manual run.
# NOTE(review): these are absolute, user-specific paths; adjust them for your environment.
ONTOLOGY_JSON = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json"
INPUT_JSONL   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl"
OUTPUT_JSONL  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt2/wikidata/ont_1_movie_output_test.jsonl"

# MODEL_ID and K_TRIPLES are also read by the batch runners further down the notebook.
MODEL_ID   = "mistralai/Mistral-7B-Instruct-v0.3"
K_TRIPLES  = 1        # upper bound inserted into the prompt: "extract up to {k} triples"
MAX_ITEMS  = 2     # row cap for the manual run; None = use all rows from INPUT_JSONL
VERBOSE    = True     # default True for manual testing first


def setup_model(model_id: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """
    Build a text-generation pipeline for the given causal chat model.

    The weights are loaded in float16 with device_map='auto', and the
    model's KV cache is switched on before it is wrapped in the pipeline.

    Args:
        model_id: Model identifier handed to `from_pretrained`.

    Returns:
        Tuple `(generator, tokenizer)`: the text-generation pipeline and
        the tokenizer that was loaded for the same model id.
    """
    print(f"[LOAD] model={model_id}")
    torch.backends.cudnn.benchmark = True

    # Tokenizer first, then the (much larger) model weights.
    tok = AutoTokenizer.from_pretrained(model_id)

    load_options = {"device_map": "auto", "torch_dtype": torch.float16}
    causal_lm = AutoModelForCausalLM.from_pretrained(model_id, **load_options)
    causal_lm.config.use_cache = True

    text_gen = pipeline(
        "text-generation",
        model=causal_lm,
        tokenizer=tok,
        device_map="auto",
    )
    return text_gen, tok


In [7]:
########################################
# BLOCK 2: DATA I/O HELPERS
########################################

# Record keys that extract_text_field() checks, in this order, to locate the text field.
_TEXT_KEYS_PRIORITY = ("sent", "text", "Text", "sentence", "Sentence")


def read_jsonl(path: str, max_items: Optional[int] = None):
    """
    Stream records from a .jsonl file, one parsed JSON value per non-blank line.

    Args:
        path: Path of the UTF-8 encoded JSON-lines file.
        max_items: Maximum number of records to yield. None means no limit;
            zero or a negative value yields nothing.

    Yields:
        The object decoded from each non-blank line (dicts for the
        datasets used in this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Check the limit up front: previously the first record was yielded
        # before the limit was ever compared, so max_items=0 returned one row.
        if max_items is not None and max_items <= 0:
            return

        emitted = 0
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank / whitespace-only lines
            yield json.loads(line)
            emitted += 1
            if max_items is not None and emitted >= max_items:
                return


def write_jsonl(path: str, records: List[Dict[str, Any]]):
    """
    Write records to `path` as JSON lines (one JSON document per line).

    The parent directory is created when the path has one. A bare filename
    (no directory component) is written into the current working directory;
    previously this case crashed because os.makedirs("") raises.

    Args:
        path: Destination file; it is overwritten if it already exists.
        records: Iterable of JSON-serialisable dicts.

    Raises:
        OSError: If the directory or file cannot be created.
        TypeError: If a record is not JSON-serialisable.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            # ensure_ascii=False keeps non-ASCII text readable in the output file.
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")


def extract_text_field(rec: Dict[str, Any]) -> Tuple[str, str]:
    """
    Heuristically pick the text field from an input record.

    The keys in `_TEXT_KEYS_PRIORITY` are tried in order; the first one that
    holds a non-blank string wins. If none matches, the longest non-blank
    string value in the record is used instead.

    Args:
        rec: One input record.

    Returns:
        `(text_value, key_used)` with the text stripped of surrounding
        whitespace, or `("", "")` when the record has no non-blank string.
    """
    for k in _TEXT_KEYS_PRIORITY:
        v = rec.get(k)
        if isinstance(v, str) and v.strip():
            return v.strip(), k

    # Fallback: choose the longest string in the record. Whitespace-only
    # values are skipped so that padding can never beat real text.
    best_key, best_val = "", ""
    for k, v in rec.items():
        if isinstance(v, str) and v.strip() and len(v) > len(best_val):
            best_key, best_val = k, v
    return best_val.strip(), best_key


def _escape_multiline(s: str) -> str:
    """
    Escape backslashes and quotes so we can safely embed text
    inside quoted blocks in the USER prompt.
    """
    return s.replace("\\", "\\\\").replace('"', '\\"')


In [8]:
########################################
# BLOCK 3: ONTOLOGY HELPERS + PROMPT 2 MESSAGE BUILDERS
########################################

def load_ontology_json(path: str) -> Dict[str, Any]:
    """
    Read an ontology description from a UTF-8 JSON file and return it as parsed.

    The file is expected to look like:
      {
        "concepts": [
          {"id": "...", "qid": "...", "label": "SomeClass"},
          ...
        ],
        "relations": [
          {
            "id": "...",
            "label": "location",
            "domain": "SomeConceptID",
            "range": "SomeConceptID"
          },
          ...
        ]
      }

    No validation of that structure is performed here.
    """
    with open(path, encoding="utf-8") as handle:
        ontology = json.load(handle)
    return ontology


def _build_concept_index(ontology_json: Dict[str, Any]) -> Dict[str, str]:
    """
    Map any known identifier (qid/id/label) -> canonical label string.
    This lets us convert domain/range IDs into human-readable names.
    """
    idx: Dict[str, str] = {}
    for concept in ontology_json.get("concepts", []):
        label = str(concept.get("label", "")).strip()
        if not label:
            continue

        for keyname in ("qid", "id", "label"):
            raw_val = concept.get(keyname)
            if raw_val is None:
                continue

            sval = str(raw_val).strip()
            if sval:
                idx[sval] = label
    return idx


def _label_for(raw_val: Any, cindex: Dict[str, str]) -> str:
    """
    Convert domain/range IDs to readable labels.
    Fallback to string form of raw_val.
    """
    if raw_val is None:
        return ""
    rval = str(raw_val).strip()
    return cindex.get(rval, rval)


def format_ontology_concepts(ontology_json: Dict[str, Any]) -> str:
    """
    Render the ontology's concept labels as a "- label" bullet list.

    Labels are stripped; concepts without a non-blank label are left out.
    The bullets are joined with newlines (empty string when there are none).
    """
    stripped_labels = (
        str(concept.get("label", "")).strip()
        for concept in ontology_json.get("concepts", [])
    )
    return "\n".join(f"- {name}" for name in stripped_labels if name)


def format_ontology_relations(ontology_json: Dict[str, Any]) -> str:
    """
    Render the ontology's relations as a bullet list with readable types.

    Each relation with a non-blank label becomes one line of the form
        - relationLabel(domainLabel,rangeLabel)
    where domain and range identifiers are resolved to concept labels via
    the concept index. Lines are joined with newlines.
    """
    concept_lookup = _build_concept_index(ontology_json)

    bullets: List[str] = []
    for relation in ontology_json.get("relations", []):
        name = str(relation.get("label", "")).strip()
        domain_name = _label_for(relation.get("domain"), concept_lookup)
        range_name = _label_for(relation.get("range"), concept_lookup)
        if not name:
            continue
        bullets.append(f"- {name}({domain_name},{range_name})")
    return "\n".join(bullets)


def build_p2_system() -> str:
    """
    Return the fixed system message used for Prompt 2.
    """
    sentences = [
        "You are a KG triple extractor.",
        "Match relation cues in the text and return only triples that satisfy the ontology’s domain→range.",
        "Cite exact evidence.",
        "Output JSON only.",
    ]
    return " ".join(sentences)

def build_p2_user(text: str, ontology_json: Dict[str, Any], k: int) -> str:
    """
    Build the Prompt 2 user message.

    - No few-shot examples
    - No per-domain lists
    - Includes a universal explanation of how to recognize and record lexical cues

    Args:
        text: Source text to extract from; backslashes and double quotes are
            escaped before it is embedded between quotes.
        ontology_json: Parsed ontology (see `load_ontology_json`).
        k: Maximum number of triples the model is asked to return.

    Returns:
        The complete user prompt, with the template's source indentation
        removed.
    """
    # The template is dedented BEFORE the dynamic parts are inserted.
    # Dedenting after interpolation did nothing: the ontology bullet lists
    # (and any multi-line text) contain lines that start at column 0, so the
    # common indent was empty and the model received the prompt still
    # indented by four spaces.
    template = dedent("""\
    Task: Using explicit lexical cues found in the text, extract up to {k} triples [subject, relation, object].
    Enforce ontology domain→range strictly; if a triple is invalid, omit it.

    CUE GENERATION GUIDANCE:
    - A lexical cue is a word or short phrase in the text that directly connects the subject and object
      and signals the relation between them.
    - It can be a **verb**, **verb phrase**, or **prepositional phrase** such as:
        "directed by", "founded in", "built by", "located in", "written by", "headed by".
      These are only illustrative examples — you must infer the appropriate cue for each relation label and sentence.
    - The cue MUST appear literally in the text and should sit between, or very near, the subject and object mentions.
    - The cue expresses the natural-language realization of the ontology relation.
    - If no such linking phrase appears in the text for a given relation, skip that relation.
    - Do NOT invent cues or use world knowledge; work only from the surface text.

    PROCEDURE:
    1. Identify candidate subjects and objects that match ontology domain and range types.
    2. Locate a surface phrase that connects them and expresses the ontology relation label.
    3. Record that phrase as the "cue" and quote the full supporting span that contains it.
    4. If none is found, do not emit a triple for that relation.
    5. Resolve simple pronouns only if doing so maintains correct domain→range typing.

    Text:
    "{text}"

    Ontology concepts:
    {concepts}

    Ontology relations (domain → range):
    {relations}

    Output JSON only in this exact schema:
    {{
      "triples": [
        {{
          "triple": ["subject","relation","object"],
          "confidence": 0.0,  // confidence 0–1
          "cue": "matched cue phrase from text",
          "support": "exact quoted span(s)",
          "notes": "domain/range check; or pronoun resolution note if applied"
        }}
      ]
    }}

    Rules:
    - Work only from visible text evidence.
    - If no relation cue appears, return an empty list of triples.
    - Do NOT invent or assume facts not in text.
    - Always quote exact spans for support and cue.
    """)

    # str.format does not re-scan substituted values, so braces inside the
    # source text or ontology labels are inserted verbatim.
    return template.format(
        k=k,
        text=_escape_multiline(text),
        concepts=format_ontology_concepts(ontology_json),
        relations=format_ontology_relations(ontology_json),
    )


In [9]:
########################################
# BLOCK 4: GENERATION + PARSER HELPERS
########################################

def generate_raw_json(
    generator,
    tokenizer,
    system_prompt: str,
    user_prompt: str,
    max_new_tokens: int = 768,
    temperature: float = 0.25,
    top_p: float = 0.9,
) -> str:
    """
    Render a system+user chat prompt and return the model's raw completion.

    The output is expected to be JSON-only, but it is returned as plain text
    and parsed later.

    Args:
        generator: Text-generation pipeline (callable) as returned by
            `setup_model`.
        tokenizer: Tokenizer providing `apply_chat_template` and
            `eos_token_id`.
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        max_new_tokens: Generation length limit.
        temperature: Sampling temperature. A value of 0 (or None / negative)
            switches to greedy decoding instead of raising inside generate().
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The generated continuation with surrounding whitespace stripped, or
        an empty string if the pipeline returned no results.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": user_prompt},
    ]

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    sampling = temperature is not None and temperature > 0

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=sampling,
        return_full_text=False,
        truncation=False,
        # The rendered chat template already contains the special tokens
        # (e.g. BOS); tokenising it again with special tokens enabled would
        # prepend a second BOS to every prompt.
        add_special_tokens=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    if sampling:
        # Sampling-only knobs are omitted for greedy decoding.
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = top_p

    out = generator(prompt_text, **gen_kwargs)

    if not out:
        return ""

    # HF pipeline returns list[{"generated_text": "..."}]
    first = out[0]
    if isinstance(first, dict) and "generated_text" in first:
        return first["generated_text"].strip()
    # fallback for unexpected result shapes
    return str(first).strip()


def try_parse_json(raw: str) -> Optional[Dict[str, Any]]:
    """
    Best-effort parse of model output (which should be JSON).

    1. Try `json.loads` on the whole stripped text.
    2. Otherwise return the first JSON object that can be decoded starting
       at some '{' in the text. Unlike a greedy first-'{'-to-last-'}' regex,
       this survives markdown fences, leading/trailing commentary that
       itself contains braces, and multiple objects in one response.

    Args:
        raw: Raw model output.

    Returns:
        The decoded value (a dict whenever the fallback path is used), or
        None if `raw` is not a string, is blank, or contains no decodable
        JSON object.
    """
    if not isinstance(raw, str):
        return None

    text = raw.strip()
    if not text:
        return None

    # direct attempt
    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        pass

    # fallback: first position at which a complete JSON object decodes
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            start = text.find("{", start + 1)
            continue
        return obj

    return None


In [10]:
########################################
# BLOCK 5: SINGLE-FILE PIPELINE (Prompt 2)
########################################


def run_pipeline_prompt2(
    ontology_path: str,
    input_jsonl_path: str,
    output_jsonl_path: str,
    k_triples: int = 5,
    max_items: Optional[int] = None,
    verbose: bool = True,
    model_id: str = "mistralai/Mistral-7B-Instruct-v0.3",
    generator=None,
    tokenizer=None,
):
    """
    Run Prompt 2 over a single dataset file:
      - load ontology + rows
      - build prompts
      - generate model output
      - parse JSON
      - write trace jsonl

    Behavior:
      • If `generator` and `tokenizer` are both provided, reuse them (no new model load).
      • Otherwise, load the model from `model_id` internally (backward compatible).

    Args:
        ontology_path: Ontology JSON file for this dataset.
        input_jsonl_path: Input records, one JSON object per line.
        output_jsonl_path: Destination for the trace records; written once,
            after all rows have been processed.
        k_triples: Maximum number of triples requested per text.
        max_items: Optional cap on the number of input rows.
        verbose: Print prompts, raw responses and parsed JSON for every row.
        model_id: Model to load when no generator/tokenizer is passed in.
        generator: Optional pre-loaded text-generation pipeline.
        tokenizer: Optional tokenizer matching `generator`.

    Returns:
        None. One record per input row is written with the keys
        "id", "input text", "prompts" and "response".
    """

    # 1. load ontology
    ontology_json = load_ontology_json(ontology_path)

    # 2. init / reuse model
    if generator is None or tokenizer is None:
        generator, tokenizer = setup_model(model_id=model_id)

    # The system prompt does not depend on the row, so build it once.
    sys_prompt = build_p2_system()

    results: List[Dict[str, Any]] = []

    # 3. iterate input rows
    for idx, rec in enumerate(read_jsonl(input_jsonl_path, max_items=max_items)):
        # Only a missing/empty id falls back to a positional one; a falsy
        # but real id such as 0 is kept.
        raw_id = rec.get("id")
        rec_id = str(raw_id) if raw_id not in (None, "") else f"item_{idx}"
        text_val, text_key = extract_text_field(rec)

        usr_prompt = build_p2_user(text_val, ontology_json, k_triples)

        if verbose:
            print("======================================")
            print(f"[ID] {rec_id}")
            print(f"[TEXT_KEY] {text_key}")
            print("[SYSTEM PROMPT]\n", sys_prompt)
            print("[USER PROMPT]\n", usr_prompt)
            print("[SOURCE TEXT]\n", text_val)

        raw_response = generate_raw_json(
            generator=generator,
            tokenizer=tokenizer,
            system_prompt=sys_prompt,
            user_prompt=usr_prompt,
            max_new_tokens=768,
            temperature=0.25,
            top_p=0.9,
        )

        # None when the response contains no parseable JSON; the raw text is
        # kept alongside so nothing is lost.
        parsed_json = try_parse_json(raw_response)

        if verbose:
            print("[RAW RESPONSE]\n", raw_response)
            print("[PARSED JSON]\n", parsed_json)

        out_record = {
            "id": rec_id,
            "input text": text_val,
            "prompts": {
                "system_prompt": sys_prompt,
                "user_prompt": usr_prompt,
            },
            "response": {
                "LLM_output": raw_response,
                "json": parsed_json,
            },
        }

        results.append(out_record)

    # 4. write collected results
    write_jsonl(output_jsonl_path, results)

    if verbose:
        print(f"\n[P2 WRITE] {len(results)} rows -> {output_jsonl_path}")

    # A model loaded inside this function is not explicitly released here;
    # it is usually kept around for interactive use.




In [9]:




# ########################################
# # BLOCK 9: MANUAL TEST RUN (single file Prompt 2)
# ########################################

# run_pipeline_prompt2(
#     ontology_path=ONTOLOGY_JSON,
#     input_jsonl_path=INPUT_JSONL,
#     output_jsonl_path=OUTPUT_JSONL,
#     k_triples=K_TRIPLES,
#     max_items=MAX_ITEMS,
#     verbose=VERBOSE,   #True for first check
#     model_id=MODEL_ID,
#     generator=None,
#     tokenizer=None,
# )



In [10]:
import os
import re

PATTERN = re.compile(r"^ont_(\d+)_([a-zA-Z0-9]+)_test\.jsonl$")


def make_paths(filename: str, base_input: str, base_onto: str, base_out: str):
    """
    Derive every path needed to process one wikidata test file.

    For a filename such as 'ont_8_politician_test.jsonl' the result is the
    4-tuple:
      - input_jsonl_path   : <base_input>/ont_8_politician_test.jsonl
      - ontology_json_path : <base_onto>/8_politician_ontology.json
      - output_jsonl_path  : <base_out>/ont_8_politician_output.jsonl
      - tag                : 'ont_8_politician'

    Raises:
        ValueError: If `filename` does not match `PATTERN`.
    """
    parsed = PATTERN.match(filename)
    if parsed is None:
        raise ValueError(f"Unexpected filename format: {filename}")

    number, category = parsed.group(1), parsed.group(2)

    return (
        os.path.join(base_input, filename),
        os.path.join(base_onto, f"{number}_{category}_ontology.json"),
        os.path.join(base_out, filename.replace("_test.jsonl", "_output.jsonl")),
        f"ont_{number}_{category}",
    )


def run_wikidata_batch_p2(verbose: bool = True, generator=None, tokenizer=None):
    """
    Run Prompt 2 over every wikidata test file, loading the model at most once.

    A failure in one file is reported (message plus traceback) and the batch
    continues with the next file; a summary of failed files is printed at
    the end.

    Args:
        verbose: Print the per-file progress banners ([RUN P2] / [DONE P2]).
            Per-record prompt/response tracing stays disabled for batch runs
            so that full datasets do not flood the notebook output. Errors
            are always printed. (This flag was previously accepted but had
            no effect.)
        generator: Optional pre-loaded text-generation pipeline to reuse.
        tokenizer: Optional tokenizer matching `generator`. When either of
            the two is missing, the model is loaded from MODEL_ID.

    Returns:
        None. One output .jsonl file is written per successfully processed
        input file.
    """
    import traceback  # local import: only needed on the failure path

    BASE_INPUT = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/"
    BASE_ONTO  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/"
    BASE_OUT   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt2/wikidata/"

    FILENAMES = [
        "ont_1_movie_test.jsonl",
        "ont_2_music_test.jsonl",
        "ont_3_sport_test.jsonl",
        "ont_4_book_test.jsonl",
        "ont_5_military_test.jsonl",
        "ont_6_building_test.jsonl",
        "ont_7_tv_test.jsonl",
        "ont_8_politician_test.jsonl",
        "ont_9_organization_test.jsonl",
        "ont_10_airport_test.jsonl",
    ]

    os.makedirs(BASE_OUT, exist_ok=True)

    # 1. load the model ONCE (unless the caller supplied one) and reuse it for all files
    if generator is None or tokenizer is None:
        generator, tokenizer = setup_model(model_id=MODEL_ID)

    failed: list = []

    # 2. loop over all files
    for fname in FILENAMES:
        try:
            # build all required paths for this file using the shared logic
            input_jsonl_path, ontology_json_path, output_jsonl_path, tag = make_paths(
                filename=fname,
                base_input=BASE_INPUT,
                base_onto=BASE_ONTO,
                base_out=BASE_OUT,
            )

            if verbose:
                print("\n" + "=" * 80)
                print(f"[RUN P2] wikidata {tag}")
                print(f"[INPUT ] {input_jsonl_path}")
                print(f"[ONTO  ] {ontology_json_path}")
                print(f"[OUTPUT] {output_jsonl_path}")

            # 3. run the single-file pipeline, reusing the loaded model
            run_pipeline_prompt2(
                ontology_path=ontology_json_path,
                input_jsonl_path=input_jsonl_path,
                output_jsonl_path=output_jsonl_path,
                k_triples=K_TRIPLES,
                max_items=None,
                verbose=False,
                model_id=MODEL_ID,
                generator=generator,   # reuse
                tokenizer=tokenizer,   # reuse
            )

            if verbose:
                print(f"[DONE P2] wikidata {tag}")

        except Exception as exc:
            # Best-effort batch: keep going, but leave enough detail to debug.
            failed.append(fname)
            print(f"[ERROR P2] wikidata {fname}: {exc}")
            traceback.print_exc()

    if failed:
        print(f"[SUMMARY P2] wikidata: {len(failed)}/{len(FILENAMES)} files failed: {failed}")


In [None]:
# Full wikidata batch run: the model is loaded once inside run_wikidata_batch_p2(),
# and per-file errors are printed rather than raised.
run_wikidata_batch_p2()

[LOAD] model=mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0



[RUN P2] wikidata ont_1_movie
[INPUT ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl
[ONTO  ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json
[OUTPUT] /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt2/wikidata/ont_1_movie_output.jsonl
[ID] ont_1_movie_test_1
[TEXT_KEY] sent
[SYSTEM PROMPT]
 You are a KG triple extractor. Match relation cues in the text and return only triples that satisfy the ontology’s domain→range. Cite exact evidence. Output JSON only.
[USER PROMPT]
     Task: Using explicit lexical cues found in the text, extract up to 1 triples [subject, relation, object].
    Enforce ontology domain→range strictly; if a triple is invalid, omit it.

    CUE GENERATION GUIDANCE:
    - A lexical cue is a word or short phrase in the text that directly connects the subject and object
      and signals the relation between them.
    - It can be a **verb**, **ver

In [11]:
import os
import re

DBPEDIA_PATTERN = re.compile(r"^ont_(\d+)_([a-zA-Z0-9]+)_test\.jsonl$")


def make_dbpedia_paths(filename: str, base_input: str, base_onto: str, base_out: str):
    """
    Map a dbpedia test filename to the paths used for processing it.

    Example for 'ont_14_writtenwork_test.jsonl':
      - input_jsonl_path   : <base_input>/ont_14_writtenwork_test.jsonl
      - ontology_json_path : <base_onto>/14_writtenwork_ontology.json
      - output_jsonl_path  : <base_out>/ont_14_writtenwork_output.jsonl
      - tag                : 'ont_14_writtenwork'

    Returns the four values above as a tuple, in that order.

    Raises:
        ValueError: If `filename` does not match `DBPEDIA_PATTERN`.
    """
    # NOTE(review): same pattern and logic as make_paths(); the two could be merged.
    hit = DBPEDIA_PATTERN.match(filename)
    if hit is None:
        raise ValueError(f"Unexpected filename format: {filename}")

    onto_no, topic = hit.groups()
    label = "ont_" + onto_no + "_" + topic

    source_path = os.path.join(base_input, filename)
    onto_path = os.path.join(base_onto, onto_no + "_" + topic + "_ontology.json")
    target_path = os.path.join(base_out, filename.replace("_test.jsonl", "_output.jsonl"))

    return source_path, onto_path, target_path, label


def run_dbpedia_batch_p2(verbose: bool = True, generator=None, tokenizer=None):
    """
    Run Prompt 2 over every listed dbpedia test file, loading the model at most once.

    A failure in one file is reported (message plus traceback) and the batch
    continues with the next file; a summary of failed files is printed at
    the end.

    Args:
        verbose: Print the per-file progress banners ([RUN P2] / [DONE P2]).
            Per-record prompt/response tracing stays disabled for batch runs
            so that full datasets do not flood the notebook output. Errors
            are always printed. (This flag was previously accepted but had
            no effect.)
        generator: Optional pre-loaded text-generation pipeline to reuse.
        tokenizer: Optional tokenizer matching `generator`. When either of
            the two is missing, the model is loaded from MODEL_ID.

    Returns:
        None. One output .jsonl file is written per successfully processed
        input file.
    """
    import traceback  # local import: only needed on the failure path

    BASE_INPUT = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/"
    BASE_ONTO  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/"
    BASE_OUT   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt2/dbpedia/"

    FILENAMES = [
        "ont_12_monument_test.jsonl",
        "ont_1_university_test.jsonl",
        "ont_10_comicscharacter_test.jsonl",
        "ont_11_meanoftransportation_test.jsonl",
        "ont_13_food_test.jsonl",
        "ont_14_writtenwork_test.jsonl",
        "ont_15_software_test.jsonl",
        "ont_16_person_test.jsonl",
        "ont_17_athlete_test.jsonl",
        "ont_18_organization_test.jsonl",
        "ont_19_film_test.jsonl",
    ]

    os.makedirs(BASE_OUT, exist_ok=True)

    # 1. load the model ONCE (unless the caller supplied one) and reuse it for all dbpedia files
    if generator is None or tokenizer is None:
        generator, tokenizer = setup_model(model_id=MODEL_ID)

    failed: list = []

    # 2. iterate over each file
    for fname in FILENAMES:
        try:
            # construct all paths for this file
            input_jsonl_path, ontology_json_path, output_jsonl_path, tag = make_dbpedia_paths(
                filename=fname,
                base_input=BASE_INPUT,
                base_onto=BASE_ONTO,
                base_out=BASE_OUT,
            )

            if verbose:
                print("\n" + "=" * 80)
                print(f"[RUN P2] dbpedia {tag}")
                print(f"[INPUT ] {input_jsonl_path}")
                print(f"[ONTO  ] {ontology_json_path}")
                print(f"[OUTPUT] {output_jsonl_path}")

            # 3. call the single-file pipeline, reusing the SAME model
            run_pipeline_prompt2(
                ontology_path=ontology_json_path,
                input_jsonl_path=input_jsonl_path,
                output_jsonl_path=output_jsonl_path,
                k_triples=K_TRIPLES,
                max_items=None,
                verbose=False,
                model_id=MODEL_ID,
                generator=generator,   # reuse the already-loaded model
                tokenizer=tokenizer,   # reuse tokenizer
            )

            if verbose:
                print(f"[DONE P2] dbpedia {tag}")

        except Exception as exc:
            # Best-effort batch: keep going, but leave enough detail to debug.
            failed.append(fname)
            print(f"[ERROR P2] dbpedia {fname}: {exc}")
            traceback.print_exc()

    if failed:
        print(f"[SUMMARY P2] dbpedia: {len(failed)}/{len(FILENAMES)} files failed: {failed}")


In [None]:
########################################
# FULL DATASET BATCH RUNS
########################################
# Full dbpedia batch run: the model is loaded once inside run_dbpedia_batch_p2(),
# and per-file errors are printed rather than raised.
run_dbpedia_batch_p2()


[LOAD] model=mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0



[RUN P2] dbpedia ont_12_monument
[INPUT ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/ont_12_monument_test.jsonl
[ONTO  ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/12_monument_ontology.json
[OUTPUT] /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt2/dbpedia/ont_12_monument_output.jsonl
