In [1]:
!nvidia-smi

Sun Nov  2 12:54:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:81:00.0 Off |                  N/A |
| 84%   62C    P3            123W /  370W |       2MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!kill 1377479

/bin/bash: line 1: kill: (1377479) - No such process


In [3]:
# === Prompt 3 Notebook (Open-IE under ontology) ===

import os
import json
import re
from textwrap import dedent
from typing import Dict, Any, List, Tuple, Optional, Iterable

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


In [4]:
def setup_model(model_id: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """
    Build the text-generation stack for `model_id`.

    Steps:
      1. load the tokenizer,
      2. load the causal-LM weights with device_map="auto" and float16,
      3. switch `use_cache` on in the model config,
      4. wrap model + tokenizer in a "text-generation" pipeline.

    Returns:
        (generator, tokenizer) -- the pipeline object and the tokenizer it uses.
    """
    print("⏳ Loading model:", model_id)
    torch.backends.cudnn.benchmark = True

    tok = AutoTokenizer.from_pretrained(model_id)

    # Same placement / precision settings as before, just collected in one place.
    weight_options = {
        "device_map": "auto",
        "torch_dtype": torch.float16,
    }
    lm = AutoModelForCausalLM.from_pretrained(model_id, **weight_options)
    lm.config.use_cache = True

    text_gen = pipeline(
        "text-generation",
        model=lm,
        tokenizer=tok,
        device_map="auto",
    )
    return text_gen, tok


In [5]:
def read_jsonl(path: str, max_items: Optional[int] = None) -> Iterable[Dict[str, Any]]:
    """
    Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    NOTE: a later cell of this notebook defines `read_jsonl` again; once that
    cell has run, the later definition is the one in effect.

    Args:
        path: file to read (opened as UTF-8).
        max_items: when an int, stop after that many records; zero or a
            negative int yields nothing. Any non-int value (including None)
            means "no limit".

    Yields:
        The result of json.loads() for each non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    limit = max_items if isinstance(max_items, int) else None
    emitted = 0
    with open(path, "r", encoding="utf-8") as f:
        # The limit used to be checked only after a yield, so max_items=0
        # still leaked the first record. Reject non-positive limits up front.
        if limit is not None and limit <= 0:
            return
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank separator lines
            yield json.loads(line)
            emitted += 1
            if limit is not None and emitted >= limit:
                break

def write_jsonl(path: str, records: List[Dict[str, Any]]):
    """
    Write `records` to `path`, one JSON document per line (UTF-8).

    NOTE: a later cell of this notebook defines `write_jsonl` again; once that
    cell has run, the later definition is the one in effect.

    Args:
        path: destination file; it is overwritten if it exists. Missing parent
            directories are created. A bare file name (no directory part) is
            written into the current working directory.
        records: JSON-serialisable objects, written in order with
            ensure_ascii=False so non-ASCII text stays readable.
    """
    parent = os.path.dirname(path)
    # os.path.dirname("out.jsonl") is "", and os.makedirs("") raises
    # FileNotFoundError -- only create directories when there is one to create.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


In [6]:
_TEXT_KEYS_PRIORITY = ("sent", "text", "Text", "sentence", "Sentence")

def _extract_text(rec: Dict[str, Any]) -> Tuple[str, str]:
    """
    Return (text, key_used) from an input record.
    Mirrors Prompt 1/2 behavior: prefer common keys; else longest string field.
    """
    for k in _TEXT_KEYS_PRIORITY:
        v = rec.get(k)
        if isinstance(v, str) and v.strip():
            return v.strip(), k
    # Fallback: choose the longest string-valued field
    best_k, best_v = "", ""
    for k, v in rec.items():
        if isinstance(v, str) and len(v) > len(best_v):
            best_k, best_v = k, v
    return best_v, best_k


In [7]:
########################################
# BLOCK 2: DATA I/O HELPERS
########################################

# Keys probed, in this order, by extract_text_field when locating the text of
# an input record. (Same value as the definition in the earlier cell.)
_TEXT_KEYS_PRIORITY = ("sent", "text", "Text", "sentence", "Sentence")


def read_jsonl(path: str, max_items: Optional[int] = None):
    """
    Stream records from a .jsonl file.

    Args:
        path: file to read (opened as UTF-8).
        max_items: maximum number of records to yield; None means no limit.
            Zero or a negative value yields nothing.

    Yields:
        The json.loads() result of each non-blank line, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    count = 0
    with open(path, "r", encoding="utf-8") as f:
        # The limit was previously only tested after a yield, which let
        # max_items=0 emit one record. Handle the empty budget before reading.
        if max_items is not None and max_items <= 0:
            return
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines between records
            yield json.loads(line)
            count += 1
            if max_items is not None and count >= max_items:
                break


def write_jsonl(path: str, records: List[Dict[str, Any]]):
    """
    Write a list of dicts as JSON lines (UTF-8, one document per line).

    Args:
        path: destination file, overwritten if present. Its parent directory
            is created when missing; a path without a directory component is
            written relative to the current working directory.
        records: JSON-serialisable objects, emitted in order with
            ensure_ascii=False.
    """
    target_dir = os.path.dirname(path)
    if target_dir:
        # Guarded because os.makedirs("") raises FileNotFoundError for a
        # bare file name such as "out.jsonl".
        os.makedirs(target_dir, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False))
            f.write("\n")


def extract_text_field(rec: Dict[str, Any]) -> Tuple[str, str]:
    """
    Heuristic to pick the text field from an input record.

    Returns:
        (text_value, key_used). `text_value` is whitespace-stripped.
        The first key in _TEXT_KEYS_PRIORITY holding a non-blank string wins;
        if none does, the string field with the longest stripped content is
        used. ("", "") is returned when the record has no non-blank string.
    """
    for k in _TEXT_KEYS_PRIORITY:
        v = rec.get(k)
        if isinstance(v, str) and v.strip():
            return v.strip(), k

    # Fallback: compare candidates after stripping. Ranking by raw length let
    # a long whitespace-only field win and produced an empty text.
    stripped_fields = (
        (key, value.strip()) for key, value in rec.items() if isinstance(value, str)
    )
    best_key, best_val = "", ""
    for key, value in stripped_fields:
        if len(value) > len(best_val):
            best_key, best_val = key, value
    return best_val, best_key


def _escape_multiline(s: str) -> str:
    """
    Escape backslashes and quotes so we can safely embed text
    inside quoted blocks in the USER prompt.
    """
    return s.replace("\\", "\\\\").replace('"', '\\"')


In [8]:
########################################
# BLOCK 3: ONTOLOGY HELPERS + PROMPT 3 MESSAGE BUILDERS
########################################

def load_ontology_json(path: str) -> Dict[str, Any]:
    """
    Read an ontology description from a UTF-8 JSON file and return it parsed.

    The helpers in this notebook look for two top-level lists:

      "concepts":  entries carrying "id", "qid" and "label"
      "relations": entries carrying "id", "label", "domain" and "range",
                   where domain/range refer to a concept identifier

    No validation is done here; whatever the file contains is returned.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = handle.read()
    return json.loads(payload)


def _build_concept_index(ontology_json: Dict[str, Any]) -> Dict[str, str]:
    """
    Map any known identifier to its human-readable label.
    Keys include 'qid', 'id', 'label' (all as strings).
    """
    idx: Dict[str, str] = {}
    for c in ontology_json.get("concepts", []):
        label = str(c.get("label", "")).strip()
        if not label:
            continue
        for keyname in ("qid", "id", "label"):
            val = c.get(keyname)
            if val is None:
                continue
            sval = str(val).strip()
            if sval:
                idx[sval] = label
    return idx

def _label_for(value: Any, cindex: Dict[str, str]) -> str:
    """Return the human-readable label for a given value, if known."""
    if value is None:
        return ""
    sval = str(value).strip()
    return cindex.get(sval, sval)

def format_ontology_concepts(ontology_json: Dict[str, Any]) -> str:
    """
    Join the labels of all ontology concepts into one ", "-separated string.

    Labels are stringified and stripped; concepts whose label is missing or
    blank are left out. File order is preserved.
    """
    cleaned = (
        str(concept.get("label", "")).strip()
        for concept in ontology_json.get("concepts", [])
    )
    return ", ".join(name for name in cleaned if name)

def format_ontology_relations(ontology_json: Dict[str, Any]) -> str:
    """
    Render the ontology relations as a bullet list, one relation per line.

    Each line has the form "- <relation>(<domain>,<range>)" (no space after
    the comma), where domain and range are shown by label when the concept
    index knows them and verbatim otherwise, e.g.:
      - director(Film,Human)
      - country of origin(Film,Country)

    Relations without a label are omitted.
    """
    lookup = _build_concept_index(ontology_json)
    rendered: List[str] = []
    for relation in ontology_json.get("relations", []):
        name = str(relation.get("label", "")).strip()
        if not name:
            continue
        domain = _label_for(relation.get("domain"), lookup)
        range_ = _label_for(relation.get("range"), lookup)
        rendered.append("- " + name + "(" + domain + "," + range_ + ")")
    return "\n".join(rendered)

def _escape_multiline(s: str) -> str:
    """
    Escape backslashes and quotes for safe embedding in the user prompt.
    """
    return s.replace("\\", "\\\\").replace('"', '\\"')


In [9]:
def build_p3_system() -> str:
    """
    Return the fixed system message for Prompt 3 (open IE under an ontology).

    The message is a constant: four sentences joined by single spaces.
    """
    sentences = [
        "You are an open IE extractor operating under a fixed ontology.",
        "From the text, propose triples [subject, relation, object] that satisfy the ontology’s domain→range.",
        "For every triple, cite exact supporting span(s) and give a 0–1 confidence.",
        "Output JSON only.",
    ]
    return " ".join(sentences)

def build_p3_user(TEXT: str, ONTO: Dict[str, Any], k: int) -> str:
    """
    Build the user message for Prompt 3 (open IE under ontology).

    Args:
        TEXT: source passage; backslashes and double quotes are escaped with
            _escape_multiline before it is placed between quotes.
        ONTO: parsed ontology; rendered with format_ontology_concepts and
            format_ontology_relations.
        k: triple budget, inserted into the "Extract up to ... triples" line.

    Returns:
        The complete prompt, left-aligned and without surrounding whitespace.

    Two fixes relative to the earlier version:
      * `k` is now actually substituted (the prompt used to contain the
        literal words "up to k triples").
      * The template is dedented *before* the dynamic pieces are inserted.
        Dedenting afterwards was a no-op whenever an inserted value spanned
        several lines (e.g. more than one relation), because those extra
        lines start at column 0 and reduce the common margin to nothing.
    """
    # Plain str.format template: literal braces of the JSON schema are doubled.
    template = dedent("""\
    Task: Extract up to {k} triples that are directly supported by the text. You may paraphrase, but you must quote the evidence substrings. Enforce domain→range strictly; if a triple is invalid, omit it.

    Requirements
    - Only produce triples whose subject type matches the relation’s domain and whose object type matches the relation’s range.
    - Return JSON only, with this schema
        {{
          "triples": [
            {{
              "triple": ["subject","relation","object"],
              "subject_type": "Concept",
              "object_type": "Concept",
              "confidence": 0.0,
              "support": [
                {{"quote": "exact substring", "char_span": [start,end]}}
              ],
              "notes": "brief justification, including any pronoun/coref used"
            }}
          ]
        }}

    Text
    "{text}"

    Ontology concepts
    {concepts}

    Ontology relations (domain → range)
    {relations}

    Constraints
    - Extract ALL clearly stated factual triples in the text.
    - If a triple matches an ontology relation, enforce domain→range consistency.
    - If a triple does NOT match any ontology relation, you MUST STILL include it (do not discard it).
    - Always extract any explicit date, time, or year mentioned in the text as part of a factual triple.
    - Resolve pronouns to the nearest valid antecedent and describe that in notes.
    - Do not invent entities that are not mentioned in the text.
    - Output MUST be valid JSON and nothing else.
    """).strip()

    # Substituted values are inserted verbatim by str.format (braces inside
    # them are not re-interpreted), so arbitrary text is safe here.
    return template.format(
        k=k,
        text=_escape_multiline(TEXT),
        concepts=format_ontology_concepts(ONTO),
        relations=format_ontology_relations(ONTO),
    )


In [10]:
def generate_raw_json(
    generator,
    tokenizer,
    system_text: str,
    user_text: str,
    max_new_tokens: int = 900,
    temperature: float = 0.25,
) -> str:
    """
    Generate the RAW model output (string). No parsing happens here.

    Args:
        generator: callable text-generation pipeline; called with the
            formatted prompt plus generation keyword arguments.
        tokenizer: provides apply_chat_template() and eos_token_id.
        system_text: content of the "system" chat message.
        user_text: content of the "user" chat message.
        max_new_tokens: generation budget passed through to the pipeline.
        temperature: sampling temperature. A value > 0 samples with
            top_p=0.9 (unchanged behaviour). A value <= 0 or None now selects
            greedy decoding instead of being handed to the sampler, which
            does not accept a non-positive temperature.

    Returns:
        The "generated_text" of the first result when results are dicts,
        otherwise the first result itself.
    """
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user",   "content": user_text},
    ]
    formatted = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    sampling = temperature is not None and temperature > 0

    gen_kwargs: Dict[str, Any] = dict(
        max_new_tokens=max_new_tokens,
        do_sample=sampling,
        return_full_text=False,   # only the completion, not the echoed prompt
        truncation=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # the EOS id doubles as padding id
    )
    if sampling:
        # Sampling-only knobs; omitted for greedy decoding.
        gen_kwargs.update(temperature=temperature, top_p=0.9)

    out = generator(formatted, **gen_kwargs)
    first = out[0]
    return first["generated_text"] if isinstance(first, dict) else first


In [11]:
# def run_pipeline_prompt3(
#     input_jsonl_path: str,
#     ontology_json_path: str,
#     output_jsonl_path: str,
#     max_items: Optional[int] = None,
#     max_new_tokens: int = 900,
#     temperature: float = 0.25,
#     verbose: bool = False,  # True to inspect prompts & raw output
#     k: int = 6,
# ):
#     """
#     RAW-only pipeline for Prompt 3 (no filtering / no validation).
#     Writes a JSONL where each line contains:
#       - id
#       - input text
#       - prompts (system/user)
#       - response: raw + parsed json
#     """
#     # --- load ontology ---
#     with open(ontology_json_path, "r", encoding="utf-8") as f:
#         ontology_json = json.load(f)

#     # --- load inputs (with max_items) ---
#     items = list(read_jsonl(input_jsonl_path, max_items=max_items))
#     if verbose:
#         print(f"[RUN] Loaded {len(items)} input items from {input_jsonl_path}")

#     # --- model (same style as Prompt 1/2) ---
#     generator, tokenizer = setup_model()

#     outputs: List[Dict[str, Any]] = []

#     for i, rec in enumerate(items, start=1):
#         rid = str(rec.get("id") or f"item_{i}")
#         text, key_used = _extract_text(rec)

#         if verbose:
#             print(f"\n[RUN] === ID={rid} ===")
#             print(f"[INFO] text key: {key_used!r}")

#         # --- build Prompt 3 (open IE under ontology) ---
#         sys_prompt = build_p3_system()
#         usr_prompt = build_p3_user(text, ontology_json, k)

#         if verbose:
#             print("\n==== [DEBUG] SYSTEM PROMPT ====\n", sys_prompt)
#             print("\n==== [DEBUG] USER PROMPT ====\n", usr_prompt)

#         # --- generate RAW output (same signature as Prompt 1/2) ---
#         try:
#             raw = generate_raw_json(
#                 generator=generator,
#                 tokenizer=tokenizer,
#                 system_text=sys_prompt,
#                 user_text=usr_prompt,
#                 max_new_tokens=max_new_tokens,
#                 temperature=temperature,
#             )
#         except Exception as e:
#             print(f"[ERROR] Generation failed for {rid}: {e}")
#             raw = ""

#         if verbose:
#             print("\n==== [DEBUG] RAW MODEL OUTPUT ====\n", raw)

#         # --- parse JSON (best-effort, same as Prompt 1/2) ---
#         parsed = None
#         if isinstance(raw, str) and raw.strip():
#             try:
#                 parsed = json.loads(raw)
#             except Exception:
#                 m = re.search(r"\{[\s\S]*\}", raw)
#                 if m:
#                     try:
#                         parsed = json.loads(m.group(0))
#                     except Exception:
#                         parsed = None

#         # --- record output (Prompt 1/2 schema) ---
#         out_rec = {
#             "id": rid,
#             "input text": text,
#             "prompts": {
#                 "system_prompt": sys_prompt,
#                 "user_prompt": usr_prompt,
#             },
#             "response": {
#                 "LLM_output": raw,
#                 "json": parsed,
#             },
#         }
#         outputs.append(out_rec)

#     # --- write JSONL (one record per line) ---
#     write_jsonl(output_jsonl_path, outputs)
#     if verbose:
#         print(f"\n[RUN] Wrote {len(outputs)} records to {output_jsonl_path}")
#     return outputs


In [12]:
# # === Paths (adjust to your filesystem) ===
# ONTOLOGY_JSON = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json"
# INPUT_JSONL   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl"
# OUTPUT_JSONL  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt3/wikidata/ont_1_movie_output_test.jsonl"

# # === Run knobs (mirror Prompt 1/2 defaults) ===
# MAX_ITEMS      = 1            # set to None to process all
# MAX_NEW_TOKENS = 900
# TEMPERATURE    = 0.25
# VERBOSE        = True         # True to inspect prompts + raw outputs
# K_CANDIDATES   = 6

# # --- Quick sanity peek (first record) ---
# first = next(read_jsonl(INPUT_JSONL, max_items=1), None)
# if first is None:
#     print(f"[ERROR] No records found in: {INPUT_JSONL}")
# else:
#     print("[DEBUG] First record keys:", list(first.keys()))
#     print(" id:", first.get("id"))
#     sent = first.get("sent") or first.get("text") or ""
#     print(" sent:", sent[:160] + ("..." if len(sent) > 160 else ""))

# # --- Run Prompt 3 pipeline (RAW only) ---
# _ = run_pipeline_prompt3(
#     input_jsonl_path=INPUT_JSONL,
#     ontology_json_path=ONTOLOGY_JSON,
#     output_jsonl_path=OUTPUT_JSONL,
#     max_items=MAX_ITEMS,
#     max_new_tokens=MAX_NEW_TOKENS,
#     temperature=TEMPERATURE,
#     verbose=VERBOSE,
#     k=K_CANDIDATES,
# )


In [13]:
def try_parse_json(raw: str) -> Optional[Dict[str, Any]]:
    """
    Best-effort extraction of a JSON document from a model response.

    Strategy:
      1. json.loads() on the whole (stripped) string; whatever it parses to
         is returned as-is.
      2. Otherwise decode the first complete JSON object that begins at the
         first "{" in the text, ignoring anything that follows it.

    Step 2 replaces the former greedy "first '{' to last '}'" regex, which
    failed whenever the text after the JSON contained another brace. Every
    input the regex handled still yields the same object.

    Args:
        raw: model output. Non-string or blank input yields None.

    Returns:
        The parsed value, or None when nothing could be decoded (including
        truncated JSON whose outer object never closes).
    """
    if not isinstance(raw, str):
        return None
    raw = raw.strip()
    if not raw:
        return None

    try:
        return json.loads(raw)
    except (ValueError, RecursionError):
        pass  # fall through to the embedded-object search

    start = raw.find("{")
    if start == -1:
        return None
    try:
        # raw_decode parses exactly one value starting at `start` and reports
        # where it ended, so trailing prose / code fences are harmless.
        obj, _end = json.JSONDecoder().raw_decode(raw, start)
    except (ValueError, RecursionError):
        return None
    return obj

def run_pipeline_prompt3(
    ontology_path: str,
    input_jsonl_path: str,
    output_jsonl_path: str,
    k_triples: int = 5,
    max_items: Optional[int] = None,
    verbose: bool = True,
    model_id: str = "mistralai/Mistral-7B-Instruct-v0.3",
    generator=None,
    tokenizer=None,
    max_new_tokens: int = 1500,
    temperature: float = 0.25,
):
    """
    Run Prompt 3 over a single dataset file:
      - load ontology + rows
      - build prompts (Prompt 3 style)
      - generate model output
      - parse JSON
      - write trace jsonl

    Args:
        ontology_path: ontology JSON file (see load_ontology_json).
        input_jsonl_path: input rows, one JSON object per line.
        output_jsonl_path: trace file; written once, after all rows are done.
        k_triples: passed to build_p3_user as `k`.
        max_items: optional cap on the number of rows read.
        verbose: print prompts, raw responses and parsed JSON per row.
        model_id: model to load when no generator/tokenizer is supplied.
        generator, tokenizer: if BOTH are given they are reused (no model
            load); otherwise a model is loaded from `model_id`.
        max_new_tokens: generation budget per row (was a hard-coded 1500).
        temperature: sampling temperature per row (was a hard-coded 0.25).

    Each output row has the keys "id", "input text", "prompts"
    (system_prompt / user_prompt) and "response" (LLM_output / json).

    Returns:
        None. Any exception from generation propagates to the caller, in
        which case nothing is written.
    """

    # 1. load ontology
    ontology_json = load_ontology_json(ontology_path)

    # 2. init / reuse model
    if generator is None or tokenizer is None:
        generator, tokenizer = setup_model(model_id=model_id)

    # The system prompt does not depend on the row, so build it once.
    sys_prompt = build_p3_system()

    results: List[Dict[str, Any]] = []

    # 3. iterate input rows
    for idx, rec in enumerate(read_jsonl(input_jsonl_path, max_items=max_items)):
        # Only a missing / empty id falls back to a synthetic one; a plain
        # truthiness test would also discard legitimate ids such as 0.
        raw_id = rec.get("id")
        rec_id = f"item_{idx}" if raw_id is None or raw_id == "" else str(raw_id)
        text_val, text_key = extract_text_field(rec)

        usr_prompt = build_p3_user(text_val, ontology_json, k_triples)

        if verbose:
            print("======================================")
            print(f"[ID] {rec_id}")
            print(f"[TEXT_KEY] {text_key}")
            print("[SYSTEM PROMPT]\n", sys_prompt)
            print("[USER PROMPT]\n", usr_prompt)
            print("[SOURCE TEXT]\n", text_val)

        raw_response = generate_raw_json(
            generator=generator,
            tokenizer=tokenizer,
            system_text=sys_prompt,
            user_text=usr_prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
        )

        parsed_json = try_parse_json(raw_response)

        if verbose:
            print("[RAW RESPONSE]\n", raw_response)
            print("[PARSED JSON]\n", parsed_json)

        out_record = {
            "id": rec_id,
            "input text": text_val,
            "prompts": {
                "system_prompt": sys_prompt,
                "user_prompt": usr_prompt,
            },
            "response": {
                "LLM_output": raw_response,
                "json": parsed_json,  # None when the response was not parseable
            },
        }

        results.append(out_record)

    # 4. write collected results
    write_jsonl(output_jsonl_path, results)

    if verbose:
        print(f"\n[P3 WRITE] {len(results)} rows -> {output_jsonl_path}")

    # A model loaded inside this function is not released explicitly here.


In [14]:
# # === Paths (adjust to your filesystem) ===
# ONTOLOGY_JSON = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json"
# INPUT_JSONL   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl"
# OUTPUT_JSONL  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt3/wikidata/ont_1_movie_output_test.jsonl"

# # === Debug / run knobs ===
# MAX_ITEMS      = 1        # only process first 1 row for debugging
# VERBOSE        = True     # print prompts, raw model output, parsed json
# MODEL_ID       = "mistralai/Mistral-7B-Instruct-v0.3"
# K_TRIPLES_P3   = 6        # how many ontology triples we include in the user prompt

# # --- Quick sanity peek (first record) ---
# first = next(read_jsonl(INPUT_JSONL, max_items=1), None)
# if first is None:
#     print(f"[ERROR] No records found in: {INPUT_JSONL}")
# else:
#     print("[DEBUG] First record keys:", list(first.keys()))
#     print(" id:", first.get("id"))
#     sent = first.get("sent") or first.get("text") or ""
#     preview = sent[:160] + ("..." if len(sent) > 160 else "")
#     print(" sent:", preview)

# # --- Run Prompt 3 pipeline on a small slice (MAX_ITEMS=1) ---
# _ = run_pipeline_prompt3(
#     ontology_path=ONTOLOGY_JSON,
#     input_jsonl_path=INPUT_JSONL,
#     output_jsonl_path=OUTPUT_JSONL,
#     k_triples=K_TRIPLES_P3,
#     max_items=MAX_ITEMS,
#     verbose=VERBOSE,
#     model_id=MODEL_ID,
#     # For debug we let run_pipeline_prompt3 load the model internally,
#     # so we do NOT pass generator/tokenizer here.
#     generator=None,
#     tokenizer=None,
# )


In [15]:
import os
import re

# Accepted input file names: "ont_<digits>_<alphanumeric>_test.jsonl".
# Group 1 captures the number, group 2 the category.
WIKIDATA_PATTERN_P3 = re.compile(r"^ont_(\d+)_([a-zA-Z0-9]+)_test\.jsonl$")

# === Debug / run knobs ===
MAX_ITEMS      = None        # forwarded as max_items by run_wikidata_batch_p3; None = no row limit (use 1 for a quick debug run)
VERBOSE        = True     # NOTE(review): no active (non-commented) code visible in this notebook reads this name
MODEL_ID       = "mistralai/Mistral-7B-Instruct-v0.3"
K_TRIPLES_P3   = 6        # forwarded as k_triples to run_pipeline_prompt3

def make_wikidata_paths_p3(filename: str, base_input: str, base_onto: str, base_out: str):
    """
    Derive every path needed to process one Wikidata input file with Prompt 3.

    For filename = "ont_8_politician_test.jsonl" the result is:

      input_jsonl_path   = <base_input>/ont_8_politician_test.jsonl
      ontology_json_path = <base_onto>/8_politician_ontology.json
      output_jsonl_path  = <base_out>/ont_8_politician_output.jsonl
      tag                = "ont_8_politician"

    Returns:
        (input_jsonl_path, ontology_json_path, output_jsonl_path, tag)

    Raises:
        ValueError: if `filename` does not match WIKIDATA_PATTERN_P3.
    """
    match = WIKIDATA_PATTERN_P3.match(filename)
    if match is None:
        raise ValueError(f"Unexpected filename format: {filename}")

    number, category = match.group(1), match.group(2)
    stem = f"{number}_{category}"

    return (
        os.path.join(base_input, filename),
        os.path.join(base_onto, f"{stem}_ontology.json"),
        os.path.join(base_out, filename.replace("_test.jsonl", "_output.jsonl")),
        f"ont_{stem}",
    )


def run_wikidata_batch_p3(verbose: bool = True):
    """
    Run Prompt 3 over the listed Wikidata test files, loading the model once.

    For every file name in FILENAMES the input / ontology / output paths are
    derived with make_wikidata_paths_p3 and the file is processed with
    run_pipeline_prompt3. A failure in one file is reported and the batch
    continues with the next file.

    Args:
        verbose: accepted but not forwarded -- per-row tracing in
            run_pipeline_prompt3 is always switched off here so that a full
            batch run does not flood the cell output.

    Reads the module-level names MODEL_ID, K_TRIPLES_P3 and MAX_ITEMS at call
    time, so whichever cell assigned them last decides their value.
    """
    BASE_INPUT = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/"
    BASE_ONTO  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/"
    BASE_OUT   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt3/wikidata/"

    FILENAMES = [
        #"ont_1_movie_test.jsonl",
        #"ont_2_music_test.jsonl",
        #"ont_3_sport_test.jsonl",
        "ont_4_book_test.jsonl",
        "ont_5_military_test.jsonl",
        "ont_6_computer_test.jsonl",
        "ont_7_space_test.jsonl",
        "ont_8_politics_test.jsonl",
        "ont_9_nature_test.jsonl",
        "ont_10_culture_test.jsonl",
    ]

    # ensure output dir exists
    os.makedirs(BASE_OUT, exist_ok=True)

    # load model once, reuse for all files
    generator, tokenizer = setup_model(model_id=MODEL_ID)

    for fname in FILENAMES:
        try:
            input_jsonl_path, ontology_json_path, output_jsonl_path, tag = make_wikidata_paths_p3(
                filename=fname,
                base_input=BASE_INPUT,
                base_onto=BASE_ONTO,
                base_out=BASE_OUT,
            )

            print("\n" + "=" * 80)
            print(f"[RUN P3] wikidata {tag}")
            print(f"[INPUT ] {input_jsonl_path}")
            print(f"[ONTO  ] {ontology_json_path}")
            print(f"[OUTPUT] {output_jsonl_path}")

            run_pipeline_prompt3(
                ontology_path=ontology_json_path,
                input_jsonl_path=input_jsonl_path,
                output_jsonl_path=output_jsonl_path,
                k_triples=K_TRIPLES_P3,
                max_items=MAX_ITEMS,
                verbose=False,         # keep batch output compact
                model_id=MODEL_ID,
                generator=generator,   # reuse model
                tokenizer=tokenizer,   # reuse tokenizer
            )

            print(f"[DONE P3] wikidata {tag}")

        except Exception as exc:
            # Best-effort batch: report and move on. The message previously
            # ended with a stray "max token" glued onto the exception text.
            print(f"[ERROR P3] wikidata {fname}: {exc}")


In [16]:
run_wikidata_batch_p3()


⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0



[RUN P3] wikidata ont_4_book
[INPUT ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_4_book_test.jsonl
[ONTO  ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/4_book_ontology.json
[OUTPUT] /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt3/wikidata/ont_4_book_output.jsonl


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[DONE P3] wikidata ont_4_book

[RUN P3] wikidata ont_5_military
[INPUT ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_5_military_test.jsonl
[ONTO  ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/5_military_ontology.json
[OUTPUT] /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt3/wikidata/ont_5_military_output.jsonl
[DONE P3] wikidata ont_5_military

[RUN P3] wikidata ont_6_computer
[INPUT ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_6_computer_test.jsonl
[ONTO  ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/6_computer_ontology.json
[OUTPUT] /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt3/wikidata/ont_6_computer_output.jsonl
[DONE P3] wikidata ont_6_computer

[RUN P3] wikidata ont_7_space
[INPUT ] /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_7_space_test.jsonl
[ONTO  ] /upb/us

In [19]:
# Accepted input file names: "ont_<digits>_<alphanumeric>_test.jsonl".
# Group 1 captures the number, group 2 the category.
DBPEDIA_PATTERN_P3 = re.compile(r"^ont_(\d+)_([a-zA-Z0-9]+)_test\.jsonl$")
# === Debug / run knobs ===
# NOTE(review): run_dbpedia_batch_p3 passes max_items=None and never reads MAX_ITEMS;
# reassigning it here does change what run_wikidata_batch_p3 sees on its next call.
MAX_ITEMS      = 1        # row limit intended for debugging (see note above)
VERBOSE        = True     # NOTE(review): no active (non-commented) code visible in this notebook reads this name
MODEL_ID       = "mistralai/Mistral-7B-Instruct-v0.3"
K_TRIPLES_P3   = 6        # forwarded as k_triples to run_pipeline_prompt3

def make_dbpedia_paths_p3(filename: str, base_input: str, base_onto: str, base_out: str):
    """
    Work out the file locations for one DBpedia Prompt 3 run.

    Given filename = "ont_14_writtenwork_test.jsonl":

      input_jsonl_path   is <base_input>/ont_14_writtenwork_test.jsonl
      ontology_json_path is <base_onto>/14_writtenwork_ontology.json
      output_jsonl_path  is <base_out>/ont_14_writtenwork_output.jsonl
      tag                is "ont_14_writtenwork"

    Returns:
        (input_jsonl_path, ontology_json_path, output_jsonl_path, tag)

    Raises:
        ValueError: when `filename` does not match DBPEDIA_PATTERN_P3.
    """
    parsed = DBPEDIA_PATTERN_P3.match(filename)
    if parsed is None:
        raise ValueError(f"Unexpected filename format: {filename}")
    number, category = parsed.groups()

    source_path = os.path.join(base_input, filename)
    onto_path = os.path.join(base_onto, f"{number}_{category}_ontology.json")

    result_name = filename.replace("_test.jsonl", "_output.jsonl")
    result_path = os.path.join(base_out, result_name)

    label = f"ont_{number}_{category}"
    return source_path, onto_path, result_path, label


def run_dbpedia_batch_p3(verbose: bool = True):
    """
    Process the listed DBpedia test files with Prompt 3 using one shared model.

    Each file's paths come from make_dbpedia_paths_p3; the file is then run
    through run_pipeline_prompt3 with no row limit (max_items=None). Errors
    are caught per file, printed, and the loop moves on.

    Args:
        verbose: forwarded to run_pipeline_prompt3 (per-row tracing).

    Reads the module-level names MODEL_ID and K_TRIPLES_P3 at call time.
    """
    input_dir = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/"
    onto_dir = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/"
    output_dir = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt3/dbpedia/"

    dataset_files = (
        "ont_12_monument_test.jsonl",
        "ont_1_university_test.jsonl",
        "ont_2_musicalwork_test.jsonl",
        "ont_3_airport_test.jsonl",
        "ont_4_building_test.jsonl",
        "ont_5_athlete_test.jsonl",
        "ont_6_politician_test.jsonl",
        "ont_7_company_test.jsonl",
        "ont_8_celestialbody_test.jsonl",
        "ont_9_astronaut_test.jsonl",
        "ont_10_comicscharacter_test.jsonl",
        "ont_11_meanoftransportation_test.jsonl",
        "ont_13_food_test.jsonl",
        "ont_14_writtenwork_test.jsonl",
        "ont_15_sportsteam_test.jsonl",
        "ont_16_city_test.jsonl",
        "ont_17_artist_test.jsonl",
        "ont_18_scientist_test.jsonl",
        "ont_19_film_test.jsonl",
    )

    # The output directory must exist before any file is processed.
    os.makedirs(output_dir, exist_ok=True)

    # One model load for the whole batch.
    generator, tokenizer = setup_model(model_id=MODEL_ID)

    for current_file in dataset_files:
        try:
            paths = make_dbpedia_paths_p3(
                filename=current_file,
                base_input=input_dir,
                base_onto=onto_dir,
                base_out=output_dir,
            )
            source_path, onto_path, result_path, tag = paths

            print("\n" + "=" * 80)
            print(f"[RUN P3] dbpedia {tag}")
            print(f"[INPUT ] {source_path}")
            print(f"[ONTO  ] {onto_path}")
            print(f"[OUTPUT] {result_path}")

            run_pipeline_prompt3(
                ontology_path=onto_path,
                input_jsonl_path=source_path,
                output_jsonl_path=result_path,
                k_triples=K_TRIPLES_P3,
                max_items=None,
                verbose=verbose,
                model_id=MODEL_ID,
                generator=generator,
                tokenizer=tokenizer,
            )

            print(f"[DONE P3] dbpedia {tag}")

        except Exception as exc:
            # Keep going with the remaining files after reporting the failure.
            print(f"[ERROR P3] dbpedia {current_file}: {exc}")


In [20]:
# run_dbpedia_batch_p3()