In [1]:
!nvidia-smi

Thu Oct 30 03:12:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:81:00.0 Off |                  N/A |
| 74%   54C    P5            157W /  370W |       4MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
!kill 1050157

In [2]:
!hostname
!which python
import torch
print("CUDA available:", torch.cuda.is_available())

limbo
/opt/miniforge3/envs/jupyterhub/bin/python
CUDA available: True


In [2]:
# === Required imports ===
import os
import json
from textwrap import dedent
from typing import Dict, Any, List, Tuple, Optional
import itertools

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# --- FIXED setup_model with pad_token patch ---
def setup_model(model_id: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """
    Load the chat model + tokenizer and return a text-generation pipeline.
    Uses half precision + device_map='auto' for efficiency.
    """
    print(f"[LOAD] model={model_id}")
    torch.backends.cudnn.benchmark = True

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # ✅ FIX: set pad_token = eos_token if missing
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
        tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    model.config.use_cache = True

    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
        batch_size=16,       # ✅ enables batching
        truncation=True,     # ✅ prevents runaway context
    )

    return generator, tokenizer


In [3]:
from typing import Dict, Any, List, Tuple, Optional

# BLOCK 2
def read_jsonl(path: str, max_items: Optional[int] = None):
    """
    Stream records from a .jsonl file.
    Stops early if max_items is provided.
    """
    count = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            yield json.loads(line)
            count += 1
            if max_items is not None and count >= max_items:
                break


def write_jsonl(path: str, records: List[Dict[str, Any]]):
    """
    Write a list of dicts as JSON lines.
    Creates parent directory if needed.
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False))
            f.write("\n")


_TEXT_KEYS_PRIORITY = ("sent", "text", "Text", "sentence", "Sentence")


def extract_text_field(rec: Dict[str, Any]) -> Tuple[str, str]:
    """
    Heuristic to pick the text field from an input record.
    Returns (text_value, key_used).
    Falls back to the longest string field if none of the preferred keys exist.
    """
    for k in _TEXT_KEYS_PRIORITY:
        v = rec.get(k)
        if isinstance(v, str) and v.strip():
            return v.strip(), k

    # fallback: choose longest string in record
    best_key, best_val = "", ""
    for k, v in rec.items():
        if isinstance(v, str) and len(v) > len(best_val):
            best_key, best_val = k, v
    return best_val.strip(), best_key


In [4]:
# BLOCK 3
def _build_concept_index(ontology_json: Dict[str, Any]) -> Dict[str, str]:
    """
    Map any known identifier (qid/id/label) -> canonical label string.
    This lets us convert domain/range IDs into human-readable names.
    """
    idx: Dict[str, str] = {}
    for concept in ontology_json.get("concepts", []):
        label = str(concept.get("label", "")).strip()
        if not label:
            continue

        for keyname in ("qid", "id", "label"):
            raw_val = concept.get(keyname)
            if raw_val is None:
                continue

            sval = str(raw_val).strip()
            if sval:
                idx[sval] = label
    return idx


def _label_for(raw_val: Any, cindex: Dict[str, str]) -> str:
    """
    Convert domain/range IDs to readable labels.
    Fallback to string form of raw_val.
    """
    if raw_val is None:
        return ""
    rval = str(raw_val).strip()
    return cindex.get(rval, rval)


def render_concept_list(ontology_json: Dict[str, Any]) -> str:
    """
    Return a bullet list of ontology concepts by label.
    """
    lines: List[str] = []
    for c in ontology_json.get("concepts", []):
        label = str(c.get("label", "")).strip()
        if label:
            lines.append(f"- {label}")
    return "\n".join(lines)


def render_relation_list(ontology_json: Dict[str, Any]) -> str:
    """
    Return a bullet list of relations with (domain, range) in human-readable form.
    Format: - relationLabel(domainLabel,rangeLabel)
    """
    cindex = _build_concept_index(ontology_json)
    lines: List[str] = []
    for r in ontology_json.get("relations", []):
        rel_label = str(r.get("label", "")).strip()
        dom_label = _label_for(r.get("domain"), cindex)
        rng_label = _label_for(r.get("range"), cindex)
        if rel_label:
            lines.append(f"- {rel_label}({dom_label},{rng_label})")
    return "\n".join(lines)


def _escape_multiline(s: str) -> str:
    """
    Escape backslashes and quotes so we can safely embed text
    inside quoted blocks in the USER prompt.
    """
    return s.replace("\\", "\\\\").replace('"', '\\"')


In [5]:
# BLOCK 4
def build_prompt1_system() -> str:
    """
    SYSTEM message for Prompt 1.
    Allows both ontology-aligned and non-ontology triples,
    still returns strict JSON only.
    """
    return (
        "You are a KG triple proposer in a Tree-of-Thoughts loop. "
        "First detect entity mentions and assign tentative ontology types. "
        "Then propose candidate triples [subject, relation, object] that express factual statements in the text. "
        "You MUST include:\n"
        "1) triples whose relation/domain/range matches the ontology, AND\n"
        "2) any other clearly stated factual triples in the text even if the relation or types are not present in the ontology.\n"
        "Return only JSON. Do not include any natural language outside JSON."
    )


def build_prompt1_user(
    TEXT: str,
    ontology_json: Dict[str, Any],
    k: int,
) -> str:
    """
    USER message for Prompt 1.
    Updated so the model will not suppress non-ontology triples.
    We keep the same JSON schema (mentions[], triples[]),
    but adjust the task + constraints language.
    """
    concept_block = render_concept_list(ontology_json)
    relation_block = render_relation_list(ontology_json)

    return dedent(f"""
    Task:
    1) From the text, list detected entity mentions with tentative ontology types.
    2) Propose up to k={k} candidate triples [subject, relation, object].

    VERY IMPORTANT:
    - You MUST include all explicit factual triples stated in the text, even if the relation,
      subject type, or object type is not listed in the ontology.
    - ALSO include ontology-valid triples whose domain/range matches the ontology relations.

    For each triple, include confidence ∈ [0,1] and cite the exact supporting span(s).

    Text
    "{_escape_multiline(TEXT)}"

    Ontology concepts
    {concept_block}

    Ontology relations (domain → range)
    {relation_block}

    Output format (JSON only)
    {{
      "mentions": [
        {{"surface": "...", "type_candidates": ["ConceptA","ConceptB"], "span": [start,end]}}
      ],
      "triples": [
        {{
          "triple": ["subject","relation","object"],
          "confidence": 0.0,
          "support": "exact quote from text",
          "notes": "why this triple is supported; if ontology applies, explain domain/range fit. If not in ontology, say 'not in ontology but supported by text'."
        }}
      ]
    }}

    Constraints
    - Extract ALL clearly stated factual triples in the text.
    - If a triple matches an ontology relation, enforce domain→range consistency and mention that in notes.
    - If a triple does NOT match any ontology relation, you MUST STILL include it (do not discard it).
    - Always extract any explicit date, time, or year mentioned in the text as part of a factual triple.
    - Resolve pronouns to the nearest valid antecedent and describe that in notes.
    - Do not invent entities that are not mentioned in the text.
    - Output MUST be valid JSON and nothing else.
    """).strip()


In [6]:
# BLOCK 6
import json
import re
from typing import Any, Dict, List, Optional, Tuple

def robust_parse_model_output(raw_response: str) -> Dict[str, Any]:
    """
    Try very hard to get structured data out of the model response.
    Returns a dict with at least:
      {
        "triples": [ [head, rel, tail], ... ],
        "mentions": [...] or None,
        "raw_json_obj": ... or None
      }
    """

    # 1. Try direct parse
    try:
        obj = json.loads(raw_response)
        return {
            "triples": extract_triples_from_obj(obj),
            "mentions": obj.get("mentions"),
            "raw_json_obj": obj,
        }
    except Exception:
        pass

    # 2. Try slice from first { to last }
    try:
        start_i = raw_response.find("{")
        end_i = raw_response.rfind("}")
        if start_i != -1 and end_i != -1 and end_i > start_i:
            candidate = raw_response[start_i:end_i+1]
            obj = json.loads(candidate)
            return {
                "triples": extract_triples_from_obj(obj),
                "mentions": obj.get("mentions"),
                "raw_json_obj": obj,
            }
    except Exception:
        pass

    # 3. Fallback: regex mine triples from messy text
    triples = extract_triples_via_regex(raw_response)

    return {
        "triples": triples,
        "mentions": None,
        "raw_json_obj": None,
    }


def extract_triples_from_obj(obj: Any) -> List[List[str]]:
    """
    Safely pull triples out of a parsed JSON object,
    accounting for ["h","r","t"] or {"triple":[...]} formats.
    """
    results: List[List[str]] = []

    # if it's dict-like and has "triples"
    if isinstance(obj, dict) and "triples" in obj:
        for item in obj["triples"]:
            # item could be ["h","r","t"]
            if isinstance(item, list) and len(item) >= 3:
                results.append(item[:3])
            # item could be { "triple": ["h","r","t"], ... }
            elif isinstance(item, dict) and "triple" in item:
                tval = item["triple"]
                if isinstance(tval, list) and len(tval) >= 3:
                    results.append(tval[:3])

    return results


def extract_triples_via_regex(raw_text: str) -> List[List[str]]:
    """
    Ultra-forgiving fallback.
    Finds patterns like:
      "triple": ["Head", "Rel", "Tail"]
    even if the outer JSON is broken.
    """
    triples: List[List[str]] = []

    triple_pattern = re.compile(
        r'"triple"\s*:\s*\[\s*"([^"]+)"\s*,\s*"([^"]+)"\s*,\s*"([^"]+)"\s*\]'
    )

    for match in triple_pattern.finditer(raw_text):
        h, r, t = match.groups()
        triples.append([h, r, t])

    return triples


In [7]:
# BLOCK 5 — Safe + self-contained (adaptive batching)

# === Global-safe defaults (in case Block 1 not yet run) ===
BATCH_SIZE   = globals().get("BATCH_SIZE", 16)   # same as Mistral batch setup
ADAPT_FACTOR = globals().get("ADAPT_FACTOR", 2)  # dyn_max ≈ 2× input tokens
ADAPT_CAP    = globals().get("ADAPT_CAP", 3000)   # hard cap to prevent slow runs


def _build_prompt_text(tokenizer, system_msg: str, user_msg: str) -> str:
    """
    Build a chat-style prompt text using tokenizer’s chat template.
    """
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user",   "content": user_msg},
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )


def _batched(iterable, n):
    """
    Yield chunks of size n from iterable.
    """
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, n))
        if not chunk:
            break
        yield chunk


def _compute_dyn_max_new_tokens(
    prompts: List[str],
    tokenizer,
    cap: int = ADAPT_CAP,
    factor: int = ADAPT_FACTOR
) -> int:
    """
    Adaptive per-batch cap:
      dyn_max = max( min(factor * input_len_tokens, cap) for p in prompts )
    Ensures output length scales with input but never exceeds cap.
    """
    lens = [tokenizer(p, return_tensors="pt")["input_ids"].shape[1] for p in prompts]
    return max(min(factor * L, cap) for L in lens) if lens else cap


def generate_model_responses_batched(
    generator,
    tokenizer,
    prompt_texts: List[str],
    temperature: float = 0.25,
    cap_max_new_tokens: int = ADAPT_CAP,  # interpret caller’s max as a cap
    verbose: bool = False,
) -> List[str]:
    """
    Generate model outputs in true batches with:
      (A) truncation=True and adaptive max_new_tokens
      (B) batching via HF pipeline
    Returns a list of raw generated strings, one per prompt.
    """
    outputs: List[str] = []

    for batch_prompts in _batched(prompt_texts, BATCH_SIZE):
        # adaptive max per batch (respect caller’s cap if smaller)
        dyn_max = _compute_dyn_max_new_tokens(batch_prompts, tokenizer, cap=cap_max_new_tokens)

        if verbose:
            print(f"[BATCH] size={len(batch_prompts)}  dyn_max={dyn_max}")

        out_batch = generator(
            batch_prompts,
            max_new_tokens=dyn_max,   # (A) adaptive cap
            truncation=True,          # (A) enable truncation
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
            return_full_text=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            batch_size=BATCH_SIZE,    # (B) true batching
        )

        # HF pipeline may return nested lists; flatten if needed
        if out_batch and isinstance(out_batch[0], list):
            flat = [o for sub in out_batch for o in sub]
        else:
            flat = out_batch

        for o in flat:
            outputs.append(o["generated_text"] if isinstance(o, dict) else o)

    return outputs


In [9]:
# BLOCK 7
def run_prompt1_pipeline(
    generator,
    tokenizer,
    input_jsonl_path: str,
    ontology_json_path: str,
    output_jsonl_path: str,
    k: int = 6,
    max_items: Optional[int] = None,
    max_new_tokens: int = 900,     # we now treat this as a CAP (ADAPT_CAP by default)
    temperature: float = 0.25,
    verbose: bool = False,
) -> List[Dict[str, Any]]:
    """
    Core batch function for Prompt 1.
    Reads each row from input_jsonl_path,
    runs Prompt 1 on it (BATCHED + adaptive length),
    and writes a JSONL of model outputs with parsed JSON.
    """

    # load ontology once
    with open(ontology_json_path, "r", encoding="utf-8") as f:
        ontology_json = json.load(f)

    sys_text = build_prompt1_system()
    outputs: List[Dict[str, Any]] = []

    # 1) Build prompts for all selected records
    recs = list(read_jsonl(input_jsonl_path, max_items=max_items))
    prompt_texts: List[str] = []
    meta: List[Tuple[int, Dict[str, Any], str]] = []  # (idx, rec, text_val)

    for idx, rec in enumerate(recs):
        # extract text from record
        text_val, text_key = extract_text_field(rec)

        # build user prompt
        usr_text = build_prompt1_user(
            TEXT=text_val,
            ontology_json=ontology_json,
            k=k,
        )

        if verbose:
            print(f"\n[ITEM {idx}] text_key={text_key}")
            print(f"[PROMPT_USER] {usr_text[:320]} ...")

        # build chat prompt text for the model
        prompt_text = _build_prompt_text(tokenizer, sys_text, usr_text)
        prompt_texts.append(prompt_text)
        meta.append((idx, rec, text_val))

    # 2) Generate in batches (A: truncation+adaptive, B: batching)
    cap = min(max_new_tokens, ADAPT_CAP) if max_new_tokens else ADAPT_CAP
    raw_outputs = generate_model_responses_batched(
        generator=generator,
        tokenizer=tokenizer,
        prompt_texts=prompt_texts,
        temperature=temperature,
        cap_max_new_tokens=cap,
        verbose=verbose,
    )

    # 3) Parse + package
    for (idx, rec, text_val), raw_response in zip(meta, raw_outputs):
        parsed_bundle = robust_parse_model_output(raw_response)

        out_record = {
            "id": rec.get("id"),
            "input text": text_val,
            "prompts": {
                "system_prompt": sys_text,
                "user_prompt": None,  # omit full user prompt to reduce file size; add if you want
            },
            "response": {
                "LLM_output": raw_response,
                "json": parsed_bundle.get("raw_json_obj"),
            },
        }
        outputs.append(out_record)

    # 4) write all outputs
    write_jsonl(output_jsonl_path, outputs)

    if verbose:
        print(f"[DONE] wrote {len(outputs)} records -> {output_jsonl_path}")

    return outputs


In [10]:
# # ============================ DEBUG BLOCK (BATCHED) ============================

# import pprint

# # --- CONFIG ---
# ONTOLOGY_JSON = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json"
# INPUT_JSONL   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl"
# OUTPUT_JSONL  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_1_movie_output_test8.jsonl"

# MAX_ITEMS      = 1           # how many examples to actually test
# TEMPERATURE    = 0.25
# VERBOSE        = True
# K_CANDIDATES   = 6

# # batching + adaptive length knobs
# BATCH_SIZE     = 16          # match your Mistral batch setup
# ADAPT_FACTOR   = 2           # dyn_max ≈ 2x input length
# ADAPT_CAP      = 3000         # hard upper cap for speed

# def compute_dyn_max_new_tokens(prompts, tokenizer, cap=ADAPT_CAP, factor=ADAPT_FACTOR):
#     lens = [tokenizer(p, return_tensors="pt")["input_ids"].shape[1] for p in prompts]
#     return max(min(factor * L, cap) for L in lens) if lens else cap

# def generate_model_responses_batched(
#     generator,
#     tokenizer,
#     system_msg: str,
#     user_msgs: list[str],
#     temperature: float = 0.25,
#     batch_size: int = BATCH_SIZE,
# ) -> list[str]:
#     """
#     Generate for a list of USER messages in one or more batches using the HF pipeline.
#     Implements:
#       (A) truncation=True + adaptive max_new_tokens per batch
#       (B) true batching with batch_size
#     """
#     # Build chat-formatted prompts (system is shared)
#     prompts = []
#     for um in user_msgs:
#         messages = [
#             {"role": "system", "content": system_msg},
#             {"role": "user",   "content": um},
#         ]
#         prompt_text = tokenizer.apply_chat_template(
#             messages,
#             tokenize=False,
#             add_generation_prompt=True,
#         )
#         prompts.append(prompt_text)

#     outputs: list[str] = []

#     # Process in batches
#     for i in range(0, len(prompts), batch_size):
#         batch_prompts = prompts[i:i+batch_size]
#         dyn_max = compute_dyn_max_new_tokens(batch_prompts, tokenizer)

#         if VERBOSE:
#             print(f"\n[BATCH GEN] items={len(batch_prompts)} dyn_max={dyn_max}")

#         outs = generator(
#             batch_prompts,
#             max_new_tokens=dyn_max,       # (A) adaptive length
#             truncation=True,               # (A) truncation ON
#             do_sample=True,                # keep your sampling as-is
#             temperature=temperature,
#             top_p=0.9,
#             return_full_text=False,
#             eos_token_id=tokenizer.eos_token_id,
#             pad_token_id=tokenizer.eos_token_id,
#             batch_size=min(batch_size, len(batch_prompts)),  # (B) true batching
#         )

#         # HF pipeline can return nested lists; flatten if needed
#         if outs and isinstance(outs[0], list):
#             outs = [o for sub in outs for o in sub]

#         for o in outs:
#             outputs.append(o["generated_text"] if isinstance(o, dict) else o)

#     return outputs


# # --- 1) Quick peek at input file ---
# peek_items = list(read_jsonl(INPUT_JSONL, max_items=3))

# if not peek_items:
#     print(f"[ERROR] No records found in: {INPUT_JSONL}")
# else:
#     print(f"[DEBUG] Loaded {len(peek_items)} sample record(s) from {INPUT_JSONL}")
#     for i, rec in enumerate(peek_items[:MAX_ITEMS or 1]):
#         text_val, text_key = extract_text_field(rec)
#         print(f"\n--- SAMPLE {i} ---")
#         print("[keys]:", list(rec.keys()))
#         print(" id:", rec.get("id"))
#         print(f" chosen_text_key: {text_key}")
#         preview = text_val[:200] + ("..." if len(text_val) > 200 else "")
#         print(" text preview:", preview)

# # --- 2) Load ontology for prompt inspection ---
# with open(ONTOLOGY_JSON, "r", encoding="utf-8") as f:
#     ontology_data = json.load(f)

# concept_block_dbg  = render_concept_list(ontology_data)
# relation_block_dbg = render_relation_list(ontology_data)

# print("\n[DEBUG] ONTOLOGY CONCEPT LIST (truncated):")
# print(concept_block_dbg[:500] + ("..." if len(concept_block_dbg) > 500 else ""))

# print("\n[DEBUG] ONTOLOGY RELATION LIST (truncated):")
# print(relation_block_dbg[:500] + ("..." if len(relation_block_dbg) > 500 else ""))

# # --- 3) Show SYSTEM and USER prompt for first sample ---
# if peek_items:
#     sample_text, _ = extract_text_field(peek_items[0])

#     system_prompt_dbg = build_prompt1_system()
#     user_prompt_dbg   = build_prompt1_user(
#         TEXT=sample_text,
#         ontology_json=ontology_data,
#         k=K_CANDIDATES,
#     )

#     print("\n================ [SYSTEM PROMPT] ================")
#     print(system_prompt_dbg)

#     print("\n================ [USER PROMPT - FIRST SAMPLE] ================")
#     up_prev = user_prompt_dbg[:12000]
#     print(up_prev)
#     if len(user_prompt_dbg) > 12000:
#         print("... [USER PROMPT TRUNCATED FOR DISPLAY] ...")

# # --- 4) Spin up the model ---
# generator, tokenizer = setup_model()

# # --- 5) Dry-run: batched generation for the first MAX_ITEMS records ---
# if peek_items:
#     # Prepare USER prompts for the first MAX_ITEMS items
#     sel_items = peek_items[:MAX_ITEMS or 1]
#     user_msgs = []
#     for rec in sel_items:
#         text_val, _ = extract_text_field(rec)
#         user_msgs.append(
#             build_prompt1_user(
#                 TEXT=text_val,
#                 ontology_json=ontology_data,
#                 k=K_CANDIDATES,
#             )
#         )

#     raw_list = generate_model_responses_batched(
#         generator=generator,
#         tokenizer=tokenizer,
#         system_msg=build_prompt1_system(),
#         user_msgs=user_msgs,
#         temperature=TEMPERATURE,
#         batch_size=BATCH_SIZE,
#     )

#     # Show the first raw output and a parsed preview
#     first_raw = raw_list[0]
#     print("\n================ [RAW MODEL OUTPUT - FIRST SAMPLE] ================")
#     print(first_raw[:150000] + ("..." if len(first_raw) > 150000 else ""))

#     parsed_single = None
#     try:
#         parsed_single = json.loads(first_raw)
#     except Exception:
#         try:
#             start_i = first_raw.find("{")
#             end_i   = first_raw.rfind("}")
#             if start_i != -1 and end_i != -1 and end_i > start_i:
#                 candidate = first_raw[start_i:end_i+1]
#                 parsed_single = json.loads(candidate)
#         except Exception:
#             parsed_single = None

#     print("\n================ [PARSED MODEL JSON - FIRST SAMPLE] ================")
#     pprint.pprint(parsed_single, width=120)

#     # Sanity check for evaluator compatibility:
#     if parsed_single and "triples" in parsed_single:
#         print("\n[CHECK] triples[0] example for evaluator compatibility:")
#         if parsed_single["triples"]:
#             pprint.pprint(parsed_single["triples"][0], width=100)
#         else:
#             print("No triples returned.")
#     else:
#         print("[WARN] Model output did not parse into expected {'mentions':..., 'triples':...} shape.")

# # --- 6) Full mini-pipeline run (writes OUTPUT_JSONL) on first MAX_ITEMS records ---
# print("\n================ [BATCH PIPELINE RUN] ================")
# batch_outputs = run_prompt1_pipeline(
#     generator=generator,
#     tokenizer=tokenizer,
#     input_jsonl_path=INPUT_JSONL,
#     ontology_json_path=ONTOLOGY_JSON,
#     output_jsonl_path=OUTPUT_JSONL,
#     max_items=MAX_ITEMS,
#     max_new_tokens=900,      # this is ignored in the batched path if you've applied A/B there too
#     temperature=TEMPERATURE,
#     verbose=VERBOSE,
#     k=K_CANDIDATES,
# )

# print(f"\n[SUCCESS] Wrote {len(batch_outputs)} items to {OUTPUT_JSONL}")

# # --- 7) Peek at what was written (first 1-2 records) ---
# print("\n================ [WRITTEN OUTPUT PREVIEW] ================")
# for i, rec in enumerate(batch_outputs[:2]):
#     print(f"\n--- OUTPUT ITEM {i} ---")
#     print("[output keys]:", list(rec.keys()))
#     resp = rec.get("response", {})
#     parsed_json = resp.get("json")
#     if parsed_json:
#         print("[triples exists? ]", "triples" in parsed_json)
#         if "triples" in parsed_json and parsed_json["triples"]:
#             print("[first triple]:")
#             pprint.pprint(parsed_json["triples"][0], width=100)
#     else:
#         print("[WARN] No parsed JSON for this item.")
# # ==============================================================================


In [11]:
########################################
# WIKIDATA BATCH RUN (same style as DBpedia)
########################################

import os
import re
import json
from typing import List

WIKI_PATTERN = re.compile(r"^ont_(\d+)_([a-z]+)_test\.jsonl$")

def make_wikidata_paths(filename: str, base_input: str, base_onto: str, base_out: str):
    """
    Parse filenames like ont_8_politician_test.jsonl
    and return input_jsonl, ontology_json, output_jsonl, tag
    """
    m = WIKI_PATTERN.match(filename)
    if not m:
        raise ValueError(f"Unexpected filename format: {filename}")
    idx, cat = m.groups()

    input_jsonl  = os.path.join(base_input, filename)
    ontology_json = os.path.join(base_onto, f"{idx}_{cat}_ontology.json")
    output_jsonl  = os.path.join(base_out, filename.replace("_test.jsonl", "_output.jsonl"))
    tag = f"ont_{idx}_{cat}"
    return input_jsonl, ontology_json, output_jsonl, tag


def run_wikidata_batch():
    """
    Run Prompt 1 pipeline in batched mode for Wikidata datasets.
    Mirrors the DBpedia version: explicit read → build prompts → batched generate → parse → write.
    """
    BASE_INPUT = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/"
    BASE_ONTO  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/"
    BASE_OUT   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/"

    FILENAMES = [
        "ont_10_culture_test.jsonl",
        "ont_1_movie_test.jsonl",
        "ont_2_music_test.jsonl",
        "ont_3_sport_test.jsonl",
        "ont_4_book_test.jsonl",
        "ont_5_military_test.jsonl",
        "ont_6_computer_test.jsonl",
        "ont_7_space_test.jsonl",
        "ont_8_politics_test.jsonl",
        "ont_9_nature_test.jsonl",
    ]

    # 1) load model ONCE (same as dbpedia code)
    generator, tokenizer = setup_model()

    for fname in FILENAMES:
        try:
            print("\n" + "=" * 80)
            print(f"[RUN] wikidata {fname}")

            # 2) build all paths
            input_jsonl, ontology_json, output_jsonl, tag = make_wikidata_paths(
                fname,
                BASE_INPUT,
                BASE_ONTO,
                BASE_OUT,
            )

            # 3) load ontology
            with open(ontology_json, "r", encoding="utf-8") as f:
                ontology_data = json.load(f)

            # 4) load input records
            recs = list(read_jsonl(input_jsonl))
            if not recs:
                print(f"[SKIP] No records found in {input_jsonl}")
                continue

            # 5) build system + all chat-formatted prompts (same as dbpedia)
            system_prompt = build_prompt1_system()
            prompt_texts: List[str] = []
            for rec in recs:
                text_val, _ = extract_text_field(rec)
                user_prompt = build_prompt1_user(
                    TEXT=text_val,
                    ontology_json=ontology_data,
                    k=6,   # same as your original wikidata call
                )
                prompt_text = _build_prompt_text(tokenizer, system_prompt, user_prompt)
                prompt_texts.append(prompt_text)

            # 6) generate (BATCHED) exactly like dbpedia
            raw_outputs = generate_model_responses_batched(
                generator=generator,
                tokenizer=tokenizer,
                prompt_texts=prompt_texts,
                temperature=0.25,
                cap_max_new_tokens=3000,   # treated as CAP; actual dyn max inside helper
                verbose=False,
            )

            # 7) parse + package + write
            outputs: List[dict] = []
            for rec, raw_text in zip(recs, raw_outputs):
                parsed = robust_parse_model_output(raw_text)
                outputs.append({
                    "id": rec.get("id"),
                    "input text": rec,   # keep same style as dbpedia code 2
                    "response": {
                        "LLM_output": raw_text,
                        "json": parsed.get("raw_json_obj"),
                    },
                })

            os.makedirs(os.path.dirname(output_jsonl), exist_ok=True)
            write_jsonl(output_jsonl, outputs)
            print(f"[DONE] Wrote {len(outputs)} records → {output_jsonl}")

        except Exception as exc:
            print(f"[ERROR] wikidata {fname}: {exc}")


In [12]:
run_wikidata_batch()

[LOAD] model=mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



[RUN] wikidata ont_10_culture_test.jsonl
[DONE] Wrote 159 records → /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_10_culture_output.jsonl

[RUN] wikidata ont_1_movie_test.jsonl


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[DONE] Wrote 840 records → /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_1_movie_output.jsonl

[RUN] wikidata ont_2_music_test.jsonl
[DONE] Wrote 675 records → /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_2_music_output.jsonl

[RUN] wikidata ont_3_sport_test.jsonl
[DONE] Wrote 487 records → /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_3_sport_output.jsonl

[RUN] wikidata ont_4_book_test.jsonl
[DONE] Wrote 550 records → /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_4_book_output.jsonl

[RUN] wikidata ont_5_military_test.jsonl
[DONE] Wrote 230 records → /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_5_military_output.jsonl

[RUN] wikidata ont_6_computer_test.jsonl
[DONE] Wrote 230 records → /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_6_computer_output.jsonl

[RUN] wikidata ont_7_space_test.

In [10]:
# BLOCK 10 — DBPEDIA BATCH RUN (compatible with adaptive batching)
import os
import re
import json
from typing import List

PATTERN = re.compile(r"^ont_(\d+)_([a-z]+)_test\.jsonl$")

def make_paths(filename: str, BASE_INPUT: str, BASE_ONTO: str, BASE_OUT: str):
    """
    Parse filenames like ont_18_scientist_test.jsonl
    and return input, ontology, and output paths.
    """
    m = PATTERN.match(filename)
    if not m:
        raise ValueError(f"Unexpected filename format: {filename}")
    idx, cat = m.groups()
    input_jsonl  = os.path.join(BASE_INPUT, filename)
    ontology_json = os.path.join(BASE_ONTO, f"{idx}_{cat}_ontology.json")
    output_jsonl  = os.path.join(BASE_OUT, filename.replace("_test.jsonl", "_output.jsonl"))
    tag = f"ont_{idx}_{cat}"
    return input_jsonl, ontology_json, output_jsonl, tag


def run_dbpedia_batch():
    """
    Run Prompt 1 pipeline in batched mode for DBpedia datasets.
    Uses adaptive truncation and dynamic max token length.
    """
    BASE_INPUT = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/"
    BASE_ONTO  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/"
    BASE_OUT   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/"

    FILENAMES = [
        # "ont_12_monument_test.jsonl",
        # "ont_1_university_test.jsonl",
        # "ont_2_musicalwork_test.jsonl",
        # "ont_3_airport_test.jsonl",
        # "ont_4_building_test.jsonl",
        # "ont_5_athlete_test.jsonl",
        # "ont_6_politician_test.jsonl",
        # "ont_7_company_test.jsonl",
        # "ont_8_celestialbody_test.jsonl",
        # "ont_9_astronaut_test.jsonl",
        # "ont_10_comicscharacter_test.jsonl",
        # "ont_11_meanoftransportation_test.jsonl",
        # "ont_13_food_test.jsonl",
        # "ont_14_writtenwork_test.jsonl",
        # "ont_15_sportsteam_test.jsonl",
        # "ont_16_city_test.jsonl",
        # "ont_17_artist_test.jsonl",
        "ont_18_scientist_test.jsonl",
        "ont_19_film_test.jsonl",
    ]

    # === Load model once (for efficiency) ===
    generator, tokenizer = setup_model()

    for fname in FILENAMES:
        try:
            print("\n" + "=" * 80)
            print(f"[RUN] dbpedia {fname}")

            input_jsonl, ontology_json, output_jsonl, tag = make_paths(
                fname, BASE_INPUT, BASE_ONTO, BASE_OUT
            )

            # --- Load ontology ---
            with open(ontology_json, "r", encoding="utf-8") as f:
                ontology_data = json.load(f)

            # --- Load records ---
            recs = list(read_jsonl(input_jsonl))
            if not recs:
                print(f"[SKIP] No records found in {input_jsonl}")
                continue

            # --- Build full chat-formatted prompts ---
            system_prompt = build_prompt1_system()
            prompt_texts: List[str] = []
            for rec in recs:
                text_val, _ = extract_text_field(rec)
                user_prompt = build_prompt1_user(
                    TEXT=text_val,
                    ontology_json=ontology_data,
                    k=6,
                )
                prompt_text = _build_prompt_text(tokenizer, system_prompt, user_prompt)
                prompt_texts.append(prompt_text)

            # --- Generate model responses (batched) ---
            raw_outputs = generate_model_responses_batched(
                generator=generator,
                tokenizer=tokenizer,
                prompt_texts=prompt_texts,
                temperature=0.25,
                cap_max_new_tokens=3000,   # treated as CAP; per-batch dyn_max ≤ 512 by default
                verbose=False,
            )

            # --- Parse and save outputs ---
            outputs: List[dict] = []
            for rec, raw_text in zip(recs, raw_outputs):
                parsed = robust_parse_model_output(raw_text)
                outputs.append({
                    "id": rec.get("id"),
                    "input text": rec,
                    "response": {
                        "LLM_output": raw_text,
                        "json": parsed.get("raw_json_obj"),
                    },
                })

            os.makedirs(os.path.dirname(output_jsonl), exist_ok=True)
            write_jsonl(output_jsonl, outputs)
            print(f"[DONE] Wrote {len(outputs)} records → {output_jsonl}")

        except Exception as exc:
            print(f"[ERROR] dbpedia {fname}: {exc}")


In [11]:
run_dbpedia_batch()

[LOAD] model=mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



[RUN] dbpedia ont_18_scientist_test.jsonl


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

[DONE] Wrote 149 records → /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_18_scientist_output.jsonl

[RUN] dbpedia ont_19_film_test.jsonl


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

[DONE] Wrote 127 records → /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_19_film_output.jsonl
