In [1]:
# Environment check: print the GPU model, driver/CUDA versions and current memory use.
!nvidia-smi

Tue Oct 21 12:32:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:81:00.0 Off |                  N/A |
|  0%   40C    P8             31W /  370W |       4MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [15]:
# NOTE(review): leftover cleanup cell. It kills a hard-coded PID and was executed out of
# order (In [15]); the recorded output shows the process no longer exists, so it fails on re-run.
!kill 26205

/bin/bash: line 1: kill: (26205) - No such process


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
# Environment check: which machine and which Python interpreter the kernel runs on.
!hostname
!which python
# Confirm that PyTorch can see a CUDA device before any model is loaded.
import torch
print("CUDA available:", torch.cuda.is_available())

limbo
/opt/miniforge3/envs/jupyterhub/bin/python
CUDA available: True


In [3]:
# Block 0 — Imports & Config
import os, json, time
from textwrap import dedent
from typing import Dict, Any, List, Tuple, Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

def setup_model(model_id: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """
    Load a causal LM and its tokenizer and wrap them in a text-generation pipeline.

    Args:
        model_id: Identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(generator, tokenizer)`` tuple: the ``text-generation`` pipeline and
        the tokenizer it was built with.
    """
    print("⏳ Loading model:", model_id)
    torch.backends.cudnn.benchmark = True

    tok = AutoTokenizer.from_pretrained(model_id)

    # Weights are requested in float16 with device_map="auto".
    load_kwargs = {"device_map": "auto", "torch_dtype": torch.float16}
    lm = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    lm.config.use_cache = True

    text_gen = pipeline("text-generation", model=lm, tokenizer=tok, device_map="auto")
    return text_gen, tok


In [4]:
# Block 1 — IO Utilities
def read_jsonl(path: str, max_items: Optional[int] = None):
    """
    Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    Args:
        path: File to read; decoded as UTF-8.
        max_items: Maximum number of records to yield. ``None`` means no limit;
            zero or a negative value yields nothing.

    Yields:
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The loop only tests the limit *after* a yield, so a non-positive limit
    # must be rejected up front or the first record would still be emitted.
    if max_items is not None and max_items <= 0:
        return

    count = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank / whitespace-only lines
            yield json.loads(line)
            count += 1
            if max_items is not None and count >= max_items:
                break

def write_jsonl(path: str, records):
    """
    Write ``records`` to ``path`` as JSONL (one JSON document per line, UTF-8).

    Args:
        path: Destination file; overwritten if it exists. Missing parent
            directories are created.
        records: Iterable of JSON-serialisable objects.

    Raises:
        OSError: If the directory or file cannot be created.
        TypeError: If a record is not JSON-serialisable.
    """
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            # ensure_ascii=False keeps non-ASCII text readable in the output file.
            f.write(json.dumps(rec, ensure_ascii=False))
            f.write("\n")


In [5]:
# Block 2 — Ontology Helpers (label rendering without Q-IDs)

from typing import Dict, Any, List

def _build_concept_index(ontology_json: Dict[str, Any]) -> Dict[str, str]:
    """
    Build a lookup that maps any known identifier to its human-readable label.
    Keys include: concept['qid'], concept['id'], and concept['label'] (all as strings).
    """
    idx: Dict[str, str] = {}
    for c in ontology_json.get("concepts", []):
        label = str(c.get("label", "")).strip()
        if not label:
            continue
        # store by 'qid' (e.g., Q5), 'id' (if present), and the label itself
        for keyname in ("qid", "id", "label"):
            val = c.get(keyname)
            if val is None:
                continue
            sval = str(val).strip()
            if sval:
                idx[sval] = label
    return idx

def _label_for(value: Any, cindex: Dict[str, str]) -> str:
    """
    Given a domain/range value (could be 'Q5', 'human', etc.),
    return the human-readable label. Falls back to str(value) if unknown.
    """
    if value is None:
        return ""
    sval = str(value).strip()
    return cindex.get(sval, sval)

def format_ontology_concepts(ontology_json: Dict[str, Any]) -> str:
    """
    Return the non-empty concept labels joined by ", " (labels only, no Q-IDs).
    """
    stripped = (
        str(concept.get("label", "")).strip()
        for concept in ontology_json.get("concepts", [])
    )
    return ", ".join(name for name in stripped if name)

def format_ontology_relations(ontology_json: Dict[str, Any]) -> str:
    """
    Render each labelled relation as one ``- label(domain,range)`` line.

    Example output:
      - director(film,human)
      - country of origin(film,country)

    Domain and range are resolved through the concept index, so identifiers
    known to the ontology appear as labels rather than Q-IDs. Relations without
    a label are skipped.
    """
    concept_labels = _build_concept_index(ontology_json)

    def _render(relation: Dict[str, Any]):
        name = str(relation.get("label", "")).strip()
        domain = _label_for(relation.get("domain"), concept_labels)
        range_ = _label_for(relation.get("range"), concept_labels)
        return f"- {name}({domain},{range_})" if name else None

    rendered = map(_render, ontology_json.get("relations", []))
    return "\n".join(line for line in rendered if line is not None)

def _escape_multiline(s: str) -> str:
    return s.replace("\\", "\\\\").replace('"', '\\"')


In [6]:
# Block 3 — Prompt 1 builders
def build_p1_system() -> str:
    """Return the fixed system prompt for Prompt 1 (the triple proposer)."""
    sentences = [
        "You are a KG triple proposer in a Tree-of-Thoughts loop.",
        "First detect entity mentions and assign tentative ontology types.",
        "Then, using those mentions, propose candidate triples that are valid under the ontology (domain→range).",
        "Return only JSON.",
    ]
    return " ".join(sentences)

def build_p1_user(TEXT: str, ONTO: Dict[str, Any], k: int) -> str:
    """
    Build the user prompt for Prompt 1.

    Args:
        TEXT: Input text; backslashes and double quotes are escaped before it
            is placed inside the quoted "Text" section.
        ONTO: Ontology dict; its concepts and relations are rendered by
            ``format_ontology_concepts`` / ``format_ontology_relations``.
        k: Maximum number of candidate triples requested from the model.

    Returns:
        The prompt, with no leading indentation on any template line.
    """
    # Dedent the *template* before substituting values. Dedenting after
    # interpolation does nothing, because the multi-line relations block starts
    # at column 0 and removes the common indent, leaving most lines indented.
    template = dedent('''
    Task: From the text, 1) list detected mentions with tentative types, 2) propose up to k={k} candidate triples [subject, relation, object]. Use only relations whose domain/range match the types you inferred. For each triple, include confidence ∈ [0,1] and cite the exact supporting span(s).

    Text
    "{text}"

    Ontology concepts
    {concepts}

    Ontology relations (domain → range)
    {relations}

    Output format (JSON only)
    {{
      "mentions": [
        {{"surface": "...", "type_candidates": ["ConceptA","ConceptB"], "span": [start,end]}}
      ],
      "triples": [
        {{
          "triple": ["subject","relation","object"],
          "confidence": 0.0,
          "support": "exact quote from text",
          "notes": "why domain/range fits"
        }}
      ]
    }}

    Constraints
    Only output domain/range-valid triples.
    Normalize dates to YYYY-MM-DD when possible.
    If a pronoun is required, resolve it to the nearest valid antecedent and state that in notes.
    Do not invent entities not in the text.
    ''').strip()
    # str.format does not re-scan substituted values, so braces inside the
    # text or the ontology labels are inserted verbatim.
    return template.format(
        k=k,
        text=_escape_multiline(TEXT),
        concepts=format_ontology_concepts(ONTO),
        relations=format_ontology_relations(ONTO),
    )


In [7]:
# Block 4 — Generation
def generate_raw_json(generator, tokenizer, system_text: str, user_text: str,
                      max_new_tokens: int = 768, temperature: float = 0.25) -> str:
    """
    Run one system+user chat turn through ``generator`` and return the raw completion.

    Args:
        generator: Callable text-generation pipeline; called with the formatted
            prompt plus generation keyword arguments.
        tokenizer: Provides ``apply_chat_template`` and ``eos_token_id``.
        system_text: Content of the system message.
        user_text: Content of the user message.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature. A value > 0 enables sampling with
            ``top_p=0.9``; ``0``, a negative value or ``None`` selects greedy
            decoding instead.

    Returns:
        The generated text only (the prompt is not echoed back), or ``""`` if
        the generator returned no results.
    """
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "return_full_text": False,
        "truncation": False,
        "eos_token_id": tokenizer.eos_token_id,
        # The EOS token doubles as the padding token here.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        # Sampling with a non-positive temperature is rejected by the
        # generation code, so fall back to deterministic greedy decoding.
        gen_kwargs["do_sample"] = False

    out = generator(formatted, **gen_kwargs)
    if not out:
        return ""
    first = out[0]
    return first["generated_text"] if isinstance(first, dict) else first


In [8]:
    # Block 5 — Orchestrator (Prompt 1, RAW-only + readable JSONL)

    from typing import Dict, Any, List, Tuple, Optional
    import json, time

    # Prefer your input's 'sent' field, but fall back gracefully
    _TEXT_KEYS_PRIORITY = ("sent", "text", "Text", "sentence", "Sentence")

    def _extract_text(rec: Dict[str, Any]) -> Tuple[str, str]:
        """Return (text, key_used) from an input record."""
        for k in _TEXT_KEYS_PRIORITY:
            v = rec.get(k)
            if isinstance(v, str) and v.strip():
                return v.strip(), k
        # Fallback: choose the longest string-valued field
        best_k, best_v = "", ""
        for k, v in rec.items():
            if isinstance(v, str) and len(v) > len(best_v):
                best_k, best_v = k, v
        return best_v, best_k

    def _parse_model_json(raw: Any) -> Tuple[Any, Optional[str]]:
        """Parse ``raw`` as JSON, retrying on its outermost ``{...}`` span; return ``(parsed, error)``."""
        import re

        if not isinstance(raw, str):
            return None, f"model output is not a string ({type(raw).__name__})"
        if not raw.strip():
            return None, "empty model output"
        try:
            return json.loads(raw), None
        except Exception as e:
            # Minimal recovery: take everything from the first '{' to the last
            # '}' (greedy match), which drops prose or code fences around it.
            m = re.search(r"\{[\s\S]*\}", raw)
            if m is None:
                return None, str(e)
            try:
                return json.loads(m.group(0)), None
            except Exception as e2:
                return None, f"{e} | recovery: {e2}"

    def run_pipeline_prompt1(
        input_jsonl_path: str,
        ontology_json_path: str,
        output_jsonl_path: str,
        max_items: Optional[int] = None,
        max_new_tokens: int = 768,
        temperature: float = 0.25,
        verbose: bool = True,
        k: int = 6,
    ):
        """
        RAW-only pipeline for Prompt 1 (no filtering / no validation).

        Loads the ontology and the input records, loads the model through
        ``setup_model()`` (default model id) on every call, generates one
        response per record, and writes all records to ``output_jsonl_path``
        once the loop has finished.

        Each output line contains:
        - id         (record "id", or "item_<n>" when missing/falsy)
        - input      (the extracted text)
        - prompts    (system/user exactly as sent)
        - response:
            * text        (verbatim raw LLM output; "" if generation failed)
            * json        (parsed copy if valid; else null)
            * json_valid  (bool)
            * parse_error (null on success; otherwise why no JSON is available,
                           including failed or empty generations)
            * summary     (mentions/triples counts when json is an object)
        - timestamp  (Unix seconds)

        Args:
            input_jsonl_path: JSONL file with the input records.
            ontology_json_path: JSON file with the ontology.
            output_jsonl_path: Destination JSONL file.
            max_items: Optional cap on the number of input records.
            max_new_tokens: Generation length limit.
            temperature: Sampling temperature.
            verbose: Print prompts, raw outputs and progress when true.
            k: Maximum number of candidate triples requested in the prompt.

        Returns:
            The list of output records that was written.
        """
        # Load ontology
        with open(ontology_json_path, "r", encoding="utf-8") as f:
            ontology_json = json.load(f)

        # Load dataset
        items = list(read_jsonl(input_jsonl_path, max_items=max_items))
        if verbose:
            print(f"[RUN] Loaded {len(items)} input items from {input_jsonl_path}")

        # Model
        generator, tokenizer = setup_model()

        outputs: List[Dict[str, Any]] = []

        for i, rec in enumerate(items, start=1):
            rid = str(rec.get("id") or f"item_{i}")
            text, key_used = _extract_text(rec)

            if verbose:
                print(f"\n[RUN] === ID={rid} ===")
                print(f"[INFO] text key: {key_used!r}")

            # Build Prompt 1
            sys_prompt = build_p1_system()
            usr_prompt = build_p1_user(text, ontology_json, k)

            if verbose:
                print("\n==== [DEBUG] SYSTEM PROMPT ====\n", sys_prompt)
                print("\n==== [DEBUG] USER PROMPT ====\n", usr_prompt)

            # Generate (RAW text only). One failing item must not abort the
            # run, but the reason is kept so it ends up in the output record.
            generation_error = None
            try:
                raw = generate_raw_json(
                    generator, tokenizer, sys_prompt, usr_prompt,
                    max_new_tokens=max_new_tokens, temperature=temperature
                )
            except Exception as e:
                print(f"[ERROR] Generation failed for {rid}: {e}")
                raw = ""
                generation_error = f"generation failed: {e}"

            if verbose:
                print("\n==== [DEBUG] RAW MODEL OUTPUT ====\n", raw)

            # Try to parse JSON for convenience ONLY (no filtering). Keep raw text regardless.
            parsed, parse_error = _parse_model_json(raw)
            if generation_error is not None:
                parse_error = generation_error

            # Tiny convenience summary (just counts; doesn't modify content)
            summary = None
            if isinstance(parsed, dict):
                mentions = parsed.get("mentions")
                triples = parsed.get("triples")
                summary = {
                    "mentions": len(mentions) if isinstance(mentions, list) else None,
                    "triples": len(triples) if isinstance(triples, list) else None,
                }

            out_rec = {
                "id": rid,
                "input": text,
                "prompts": {
                    "system": sys_prompt,
                    "user": usr_prompt
                },
                "response": {
                    "text": raw,                 # verbatim raw string
                    "json": parsed,              # parsed copy if valid; else null
                    "json_valid": parsed is not None,
                    "parse_error": parse_error,  # why no JSON is available (if so)
                    "summary": summary           # mentions/triples counts
                },
                "timestamp": int(time.time())
            }

            outputs.append(out_rec)

        write_jsonl(output_jsonl_path, outputs)
        if verbose:
            print(f"\n[RUN] Wrote {len(outputs)} records to {output_jsonl_path}")
        return outputs


In [9]:
# Block 6 — Example Run

# Input / output locations for this run.
ONTOLOGY_JSON = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json"
INPUT_JSONL = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl"
OUTPUT_JSONL = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_1_movie_output_test.jsonl"

# Run settings.
MAX_ITEMS = 1  # None   # set 1 for quick smoke test
MAX_NEW_TOKENS = 768
TEMPERATURE = 0.25
VERBOSE = True
K_CANDIDATES = 6

# Peek at the first input record before the full run.
first = next(read_jsonl(INPUT_JSONL, max_items=1), None)
if first is not None:
    print("[DEBUG] First record keys:", list(first.keys()))
    print(" id:", first.get("id"))
    # Show at most 160 characters of the sentence, with an ellipsis if cut.
    sent_value = first.get("sent")
    sent_head = (sent_value or "")[:160]
    sent_tail = "..." if sent_value and len(sent_value) > 160 else ""
    print(" sent:", sent_head + sent_tail)
else:
    print(f"[ERROR] No records found in: {INPUT_JSONL}")

run_kwargs = {
    "input_jsonl_path": INPUT_JSONL,
    "ontology_json_path": ONTOLOGY_JSON,
    "output_jsonl_path": OUTPUT_JSONL,
    "max_items": MAX_ITEMS,
    "max_new_tokens": MAX_NEW_TOKENS,
    "temperature": TEMPERATURE,
    "verbose": VERBOSE,
    "k": K_CANDIDATES,
}
# Assign to `_` so the returned record list is not echoed as cell output.
_ = run_pipeline_prompt1(**run_kwargs)


[DEBUG] First record keys: ['id', 'sent']
 id: ont_1_movie_test_1
 sent: Bleach: Hell Verse (Japanese: BLEACH , Hepburn: BurÄ«chi Jigoku-Hen) is a 2010 Japanese animated film directed by Noriyuki Abe.
[RUN] Loaded 1 input items from /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0



[RUN] === ID=ont_1_movie_test_1 ===
[INFO] text key: 'sent'

==== [DEBUG] SYSTEM PROMPT ====
 You are a KG triple proposer in a Tree-of-Thoughts loop. First detect entity mentions and assign tentative ontology types. Then, using those mentions, propose candidate triples that are valid under the ontology (domain→range). Return only JSON.

==== [DEBUG] USER PROMPT ====
 Task: From the text, 1) list detected mentions with tentative types, 2) propose up to k=6 candidate triples [subject, relation, object]. Use only relations whose domain/range match the types you inferred. For each triple, include confidence ∈ [0,1] and cite the exact supporting span(s).

    Text
    "Bleach: Hell Verse (Japanese: BLEACH , Hepburn: BurÄ«chi Jigoku-Hen) is a 2010 Japanese animated film directed by Noriyuki Abe."

    Ontology concepts
    human, city, country, film, film genre, genre, film production company, film award, award, written work, film character, film organization

    Ontology relations (domai