In [1]:
# Show the GPU model, driver/CUDA versions and current memory usage before running anything.
!nvidia-smi

Tue Oct 21 16:52:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  |   00000000:81:00.0 Off |                  N/A |
| 79%   57C    P8             36W /  370W |       4MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# NOTE(review): hard-coded PID, presumably left over from an earlier session; the recorded
# output below shows that no such process existed when this cell was last run.
!kill 29385

/bin/bash: line 1: kill: (29385) - No such process


In [4]:
# Sanity-check the execution environment: host name, which Python interpreter is on PATH,
# and whether PyTorch can see a CUDA device.
!hostname
!which python
import torch
print("CUDA available:", torch.cuda.is_available())

limbo
/opt/miniforge3/envs/jupyterhub/bin/python
CUDA available: True


In [5]:
import os
import json
from textwrap import dedent
from typing import Dict, Any, List, Tuple, Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Already-loaded (generator, tokenizer) pairs keyed by model id. run_pipeline_prompt1 calls
# setup_model() on every invocation and the driver cells invoke it once per dataset file, so
# without this cache the full checkpoint would be reloaded for every file.
_MODEL_CACHE: Dict[str, Tuple[Any, Any]] = {}


def setup_model(model_id: str = "mistralai/Mistral-7B-Instruct-v0.3"):
    """
    Load a causal LM and its tokenizer and wrap them in a text-generation pipeline.

    The model is loaded in float16 with ``device_map="auto"``. The result is cached per
    ``model_id`` for the lifetime of the kernel, so repeated calls return the same
    objects instead of loading the weights again.

    Args:
        model_id: Hugging Face model identifier to load.

    Returns:
        A ``(generator, tokenizer)`` tuple.
    """
    cached = _MODEL_CACHE.get(model_id)
    if cached is not None:
        return cached

    print("⏳ Loading model:", model_id)
    torch.backends.cudnn.benchmark = True
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    model.config.use_cache = True
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )
    _MODEL_CACHE[model_id] = (generator, tokenizer)
    return generator, tokenizer


In [6]:
def read_jsonl(path: str, max_items: Optional[int] = None):
    """
    Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    Args:
        path: Path of the UTF-8 encoded JSONL file.
        max_items: Maximum number of records to yield. ``None`` means no limit;
            zero or a negative value yields nothing.

    Yields:
        The decoded JSON value of each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message is prefixed
            with the file path and 1-based line number.
    """
    # Check the limit up front: testing it only after a yield would still emit
    # one record for max_items=0.
    if max_items is not None and max_items <= 0:
        return

    count = 0
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                # Same exception type, but pointing at the offending line of the file.
                raise json.JSONDecodeError(f"{path}:{lineno}: {e.msg}", e.doc, e.pos) from e
            yield record
            count += 1
            if max_items is not None and count >= max_items:
                break


def write_jsonl(path: str, records):
    """
    Write an iterable of JSON-serialisable records to ``path``, one per line.

    The parent directory is created if needed. Non-ASCII characters are written
    as-is (UTF-8), not as ``\\uXXXX`` escapes. An existing file is overwritten.

    Args:
        path: Destination file path; may be a bare file name.
        records: Iterable of objects accepted by ``json.dumps``.
    """
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False))
            f.write("\n")


In [7]:
def _build_concept_index(ontology_json: Dict[str, Any]) -> Dict[str, str]:
    """
    Build a lookup that maps any known identifier to its human-readable label.
    Keys include: concept['qid'], concept['id'], and concept['label'] (all as strings).
    """
    idx: Dict[str, str] = {}
    for c in ontology_json.get("concepts", []):
        label = str(c.get("label", "")).strip()
        if not label:
            continue
        for keyname in ("qid", "id", "label"):
            val = c.get(keyname)
            if val is None:
                continue
            sval = str(val).strip()
            if sval:
                idx[sval] = label
    return idx


def _label_for(value: Any, cindex: Dict[str, str]) -> str:
    """Return the human-readable label for a given value."""
    if value is None:
        return ""
    sval = str(value).strip()
    return cindex.get(sval, sval)


def format_ontology_concepts(ontology_json: Dict[str, Any]) -> str:
    """
    Return the non-blank concept labels of the ontology as one ``", "``-separated string.

    Labels are stringified and stripped; concepts whose label is missing or blank are
    left out. Order follows ``ontology_json["concepts"]``.
    """
    stripped = (
        str(concept.get("label", "")).strip()
        for concept in ontology_json.get("concepts", [])
    )
    return ", ".join(name for name in stripped if name)


def format_ontology_relations(ontology_json: Dict[str, Any]) -> str:
    """
    Render the ontology relations as a newline-separated bullet list, e.g.::

      - director(film,human)
      - country of origin(film,country)

    Domain and range identifiers are resolved to concept labels through the concept
    index; relations without a non-blank label are omitted.
    """
    lookup = _build_concept_index(ontology_json)
    rendered: List[str] = []
    for relation in ontology_json.get("relations", []):
        name = str(relation.get("label", "")).strip()
        if not name:
            continue
        domain = _label_for(relation.get("domain"), lookup)
        codomain = _label_for(relation.get("range"), lookup)
        rendered.append(f"- {name}({domain},{codomain})")
    return "\n".join(rendered)


def _escape_multiline(s: str) -> str:
    return s.replace("\\", "\\\\").replace('"', '\\"')


In [8]:
def build_p1_system() -> str:
    """Return the fixed system prompt for the Prompt 1 triple-proposer step."""
    sentences = (
        "You are a KG triple proposer in a Tree-of-Thoughts loop.",
        "First detect entity mentions and assign tentative ontology types.",
        "Then, using those mentions, propose candidate triples that are valid under the ontology (domain→range).",
        "Return only JSON.",
    )
    # Sentences are separated by exactly one space.
    return " ".join(sentences)


def build_p1_user(TEXT: str, ONTO: Dict[str, Any], k: int) -> str:
    """
    Build the user prompt for Prompt 1.

    The prompt contains the task statement, the quoted (escaped) input text, the
    ontology concepts, the ontology relations and the expected JSON output format.

    Args:
        TEXT: Input text to extract triples from.
        ONTO: Ontology dictionary (read via ``format_ontology_concepts`` and
            ``format_ontology_relations``).
        k: Maximum number of candidate triples to request.

    Returns:
        The prompt string with no leading indentation on the template lines.
    """
    # Dedent the template BEFORE substituting values. The relations list (and possibly
    # the text) spans several lines that start at column 0; dedenting after
    # interpolation would see an empty common margin and leave every template line
    # indented by four spaces.
    template = dedent('''
    Task: From the text, 1) list detected mentions with tentative types, 2) propose up to k={k} candidate triples [subject, relation, object]. Use only relations whose domain/range match the types you inferred. For each triple, include confidence ∈ [0,1] and cite the exact supporting span(s).

    Text
    "{text}"

    Ontology concepts
    {concepts}

    Ontology relations (domain → range)
    {relations}

    Output format (JSON only)
    {{
      "mentions": [
        {{"surface": "...", "type_candidates": ["ConceptA","ConceptB"], "span": [start,end]}}
      ],
      "triples": [
        {{
          "triple": ["subject","relation","object"],
          "confidence": 0.0,
          "support": "exact quote from text",
          "notes": "why domain/range fits"
        }}
      ]
    }}

    Constraints
    Only output domain/range-valid triples.
    Normalize dates to YYYY-MM-DD when possible.
    If a pronoun is required, resolve it to the nearest valid antecedent and state that in notes.
    Do not invent entities not in the text.
    ''').strip()

    # str.format does not re-parse substituted values, so braces inside the text or
    # the ontology labels are inserted literally.
    return template.format(
        k=k,
        text=_escape_multiline(TEXT),
        concepts=format_ontology_concepts(ONTO),
        relations=format_ontology_relations(ONTO),
    )


In [9]:
def generate_raw_json(
    generator,
    tokenizer,
    system_text: str,
    user_text: str,
    max_new_tokens: int = 768,
    temperature: float = 0.25,
) -> str:
    """
    Run one system+user chat turn through the generation pipeline and return the raw text.

    Args:
        generator: Callable text-generation pipeline.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``eos_token_id``.
        system_text: Content of the system message.
        user_text: Content of the user message.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature. A value of 0 (or below) switches to greedy
            decoding instead of sampling.

    Returns:
        The generated continuation only (the prompt is not included), or an empty
        string if the pipeline returned no candidates.
    """
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    formatted = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    gen_kwargs = dict(
        max_new_tokens=max_new_tokens,
        return_full_text=False,
        truncation=False,
        # The chat template has already inserted the special tokens (e.g. BOS);
        # letting the pipeline's tokenizer add them again would duplicate them.
        add_special_tokens=False,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        # Sampling with a non-positive temperature is invalid; decode greedily.
        gen_kwargs["do_sample"] = False

    out = generator(formatted, **gen_kwargs)
    if not out:
        return ""
    first = out[0]
    return first["generated_text"] if isinstance(first, dict) else first


In [10]:
_TEXT_KEYS_PRIORITY = ("sent", "text", "Text", "sentence", "Sentence")


def _extract_text(rec: Dict[str, Any]) -> Tuple[str, str]:
    """Return (text, key_used) from an input record."""
    for k in _TEXT_KEYS_PRIORITY:
        v = rec.get(k)
        if isinstance(v, str) and v.strip():
            return v.strip(), k
    # Fallback: choose the longest string-valued field
    best_k, best_v = "", ""
    for k, v in rec.items():
        if isinstance(v, str) and len(v) > len(best_v):
            best_k, best_v = k, v
    return best_v, best_k


def _parse_model_json(raw: Any) -> Optional[Any]:
    """Best-effort decode of a model reply: the whole string, else the JSON object starting at the first '{'."""
    if not isinstance(raw, str) or not raw.strip():
        return None
    try:
        return json.loads(raw)
    except (ValueError, RecursionError):
        pass

    start = raw.find("{")
    if start == -1:
        return None
    try:
        # raw_decode stops at the end of the first complete JSON value, so any text
        # after the object (even text containing braces) does not break parsing.
        parsed, _end = json.JSONDecoder().raw_decode(raw, start)
    except (ValueError, RecursionError):
        return None
    return parsed


def run_pipeline_prompt1(
    input_jsonl_path: str,
    ontology_json_path: str,
    output_jsonl_path: str,
    max_items: Optional[int] = None,
    max_new_tokens: int = 900,
    temperature: float = 0.25,
    verbose: bool = False,
    k: int = 6,
):
    """
    RAW-only pipeline for Prompt 1 (no filtering / no validation).

    Writes a JSONL where each line contains:
      - id
      - input text
      - prompts (system/user)
      - response: raw + parsed json

    Each record is written and flushed as soon as it is produced, so interrupting a
    long run keeps the records finished so far. The output file is truncated when the
    run starts.

    Args:
        input_jsonl_path: JSONL file with the input records.
        ontology_json_path: JSON file with the ontology used to build the prompt.
        output_jsonl_path: Destination JSONL file (parent directory is created).
        max_items: Optional cap on the number of input records.
        max_new_tokens: Generation length limit passed to ``generate_raw_json``.
        temperature: Sampling temperature passed to ``generate_raw_json``.
        verbose: Print prompts, raw outputs and progress information.
        k: Maximum number of candidate triples requested in the prompt.

    Returns:
        The list of output records, in input order.
    """
    with open(ontology_json_path, "r", encoding="utf-8") as f:
        ontology_json = json.load(f)

    items = list(read_jsonl(input_jsonl_path, max_items=max_items))
    if verbose:
        print(f"[RUN] Loaded {len(items)} input items from {input_jsonl_path}")

    generator, tokenizer = setup_model()
    # The system prompt does not depend on the record; build it once.
    sys_prompt = build_p1_system()
    outputs: List[Dict[str, Any]] = []

    out_dir = os.path.dirname(output_jsonl_path)
    if out_dir:  # "" for a bare file name; os.makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)

    with open(output_jsonl_path, "w", encoding="utf-8") as out_f:
        for i, rec in enumerate(items, start=1):
            # Only fall back to a positional id when the record has none; a falsy
            # but real id such as 0 is kept.
            raw_id = rec.get("id")
            rid = f"item_{i}" if raw_id is None or raw_id == "" else str(raw_id)
            text, key_used = _extract_text(rec)

            if verbose:
                print(f"\n[RUN] === ID={rid} ===")
                print(f"[INFO] text key: {key_used!r}")

            usr_prompt = build_p1_user(text, ontology_json, k)

            if verbose:
                print("\n==== [DEBUG] SYSTEM PROMPT ====\n", sys_prompt)
                print("\n==== [DEBUG] USER PROMPT ====\n", usr_prompt)

            try:
                raw = generate_raw_json(
                    generator,
                    tokenizer,
                    sys_prompt,
                    usr_prompt,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                )
            except Exception as e:
                # Best effort: a failed generation is recorded as an empty response
                # so that the remaining records are still processed.
                print(f"[ERROR] Generation failed for {rid}: {e}")
                raw = ""

            if verbose:
                print("\n==== [DEBUG] RAW MODEL OUTPUT ====\n", raw)

            out_rec = {
                "id": rid,
                "input": text,
                "prompts": {
                    "system": sys_prompt,
                    "user": usr_prompt,
                },
                "response": {
                    "text": raw,
                    "json": _parse_model_json(raw),
                },
            }

            out_f.write(json.dumps(out_rec, ensure_ascii=False))
            out_f.write("\n")
            out_f.flush()
            outputs.append(out_rec)

    if verbose:
        print(f"\n[RUN] Wrote {len(outputs)} records to {output_jsonl_path}")
    return outputs


In [10]:
# ONTOLOGY_JSON = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json"
# INPUT_JSONL   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl"
# OUTPUT_JSONL  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_1_movie_output_test.jsonl"

# MAX_ITEMS      = 3
# MAX_NEW_TOKENS = 900
# TEMPERATURE    = 0.25
# VERBOSE        = True #True
# K_CANDIDATES   = 6

# first = next(read_jsonl(INPUT_JSONL, max_items=1), None)
# if first is None:
#     print(f"[ERROR] No records found in: {INPUT_JSONL}")
# else:
#     print("[DEBUG] First record keys:", list(first.keys()))
#     print(" id:", first.get("id"))
#     print(
#         " sent:",
#         (first.get("sent") or "")[:160]
#         + ("..." if first.get("sent") and len(first["sent"]) > 160 else ""),
#     )

# _ = run_pipeline_prompt1(
#     input_jsonl_path=INPUT_JSONL,
#     ontology_json_path=ONTOLOGY_JSON,
#     output_jsonl_path=OUTPUT_JSONL,
#     max_items=MAX_ITEMS,
#     max_new_tokens=MAX_NEW_TOKENS,
#     temperature=TEMPERATURE,
#     verbose=VERBOSE,
#     k=K_CANDIDATES,
# )


In [None]:
# Run Prompt 1 over the Wikidata datasets.

import os, re

# ---- Fixed base paths (unchanged) ----
# NOTE(review): these are absolute, user-specific paths, so this cell only runs on a
# machine where this directory tree exists.
BASE_INPUT = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/"
BASE_ONTO  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/"
BASE_OUT   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/"

# ---- The 10 filenames ----
# Input files are processed in list order; each name must match PATTERN below.
FILENAMES = [
    "ont_1_movie_test.jsonl",
    "ont_2_music_test.jsonl",
    "ont_3_sport_test.jsonl",
    "ont_4_book_test.jsonl",
    "ont_5_military_test.jsonl",
    "ont_6_computer_test.jsonl",
    "ont_7_space_test.jsonl",
    "ont_8_politics_test.jsonl",
    "ont_9_nature_test.jsonl",
    "ont_10_culture_test.jsonl",
]

# ont_{index}_{category}_test.jsonl -> {index}_{category}_ontology.json, ont_{index}_{category}_output.jsonl
# Group 1 captures the numeric index, group 2 the lowercase category name.
PATTERN = re.compile(r"^ont_(\d+)_([a-z]+)_test\.jsonl$")


def make_paths(filename: str):
    """
    Derive all paths for one Wikidata input file name.

    Args:
        filename: Name of the form ``ont_{index}_{category}_test.jsonl``.

    Returns:
        ``(input_jsonl, ontology_json, None, output_jsonl, tag)`` where the third slot
        is the unused few-shot path and ``tag`` is ``ont_{index}_{category}``.

    Raises:
        ValueError: If ``filename`` does not match ``PATTERN``.
    """
    parsed = PATTERN.match(filename)
    if parsed is None:
        raise ValueError(f"Unexpected filename format: {filename}")
    index, category = parsed.group(1), parsed.group(2)

    # ont_{idx}_{cat}_test.jsonl -> ont_{idx}_{cat}_output.jsonl
    output_name = filename.replace("_test.jsonl", "_output.jsonl")

    return (
        os.path.join(BASE_INPUT, filename),
        os.path.join(BASE_ONTO, f"{index}_{category}_ontology.json"),
        None,  # few-shot slot, kept so callers can unpack five values
        os.path.join(BASE_OUT, output_name),
        f"ont_{index}_{category}",
    )


# ---- Run all files ----
import traceback

for fname in FILENAMES:
    try:
        INPUT_JSONL, ONTOLOGY_JSON, FEW_SHOT_JSONL, OUTPUT_JSONL, tag = make_paths(fname)

        print("\n" + "=" * 80)
        print(f"[RUN] {tag}")
        print("INPUT :", INPUT_JSONL)
        print("ONTO  :", ONTOLOGY_JSON)
        print("FEWS  :", FEW_SHOT_JSONL)
        print("OUTPUT:", OUTPUT_JSONL)

        out = run_pipeline_prompt1(
            input_jsonl_path=INPUT_JSONL,
            ontology_json_path=ONTOLOGY_JSON,
            output_jsonl_path=OUTPUT_JSONL,
            max_items=None,          # process full dataset
            max_new_tokens=900,
            temperature=0.25,
            verbose=False,
            k=6,
        )

        print(f"[DONE] {tag}")
    except Exception as e:
        # Best-effort batch: a failing file must not stop the remaining ones, but the
        # exception type and traceback are needed to diagnose it afterwards.
        print(f"[ERROR] {fname}: {e}")
        traceback.print_exc()



[RUN] ont_1_movie
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_text/ont_1_movie_test.jsonl
ONTO  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/wikidata/input_ontology/1_movie_ontology.json
FEWS  : None
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/wikidata/ont_1_movie_output.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [11]:
# Run Prompt 1 over the DBpedia datasets.
import os, re

# ---- Fixed base paths (for DBpedia) ----
# NOTE(review): these rebind the BASE_* / FILENAMES / PATTERN names used by the Wikidata
# cell, so whichever of the two cells ran last determines their values.
BASE_INPUT = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/"
BASE_ONTO  = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/"
BASE_OUT   = "/upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/"

# ---- The DBpedia filenames (19 in total; only the 14 uncommented entries are run) ----
# NOTE(review): the 5 commented-out files were presumably processed in an earlier run — confirm.
FILENAMES = [
    #"ont_12_monument_test.jsonl",
    #"ont_1_university_test.jsonl",
    #"ont_10_comicscharacter_test.jsonl",
    #"ont_11_meanoftransportation_test.jsonl",
    #"ont_13_food_test.jsonl",
    "ont_14_writtenwork_test.jsonl",
    "ont_15_sportsteam_test.jsonl",
    "ont_16_city_test.jsonl",
    "ont_17_artist_test.jsonl",
    "ont_18_scientist_test.jsonl",
    "ont_19_film_test.jsonl",
    "ont_2_musicalwork_test.jsonl",
    "ont_3_airport_test.jsonl",
    "ont_4_building_test.jsonl",
    "ont_5_athlete_test.jsonl",
    "ont_6_politician_test.jsonl",
    "ont_7_company_test.jsonl",
    "ont_8_celestialbody_test.jsonl",
    "ont_9_astronaut_test.jsonl",
]

# ---- Regex pattern for filename parsing ----
# Group 1 captures the numeric index, group 2 the lowercase category name.
PATTERN = re.compile(r"^ont_(\d+)_([a-z]+)_test\.jsonl$")


def make_paths(filename: str):
    """
    Derive all paths for one DBpedia input file name.

    Args:
        filename: Name of the form ``ont_{index}_{category}_test.jsonl``.

    Returns:
        ``(input_jsonl, ontology_json, output_jsonl, tag)`` where ``tag`` is
        ``ont_{index}_{category}``.

    Raises:
        ValueError: If ``filename`` does not match ``PATTERN``.
    """
    parsed = PATTERN.match(filename)
    if parsed is None:
        raise ValueError(f"Unexpected filename format: {filename}")
    index, category = parsed.group(1), parsed.group(2)

    # ont_{idx}_{cat}_test.jsonl → ont_{idx}_{cat}_output.jsonl
    output_name = filename.replace("_test.jsonl", "_output.jsonl")

    return (
        os.path.join(BASE_INPUT, filename),
        os.path.join(BASE_ONTO, f"{index}_{category}_ontology.json"),
        os.path.join(BASE_OUT, output_name),
        f"ont_{index}_{category}",
    )


# ---- Run all DBpedia files ----
import traceback

for fname in FILENAMES:
    try:
        INPUT_JSONL, ONTOLOGY_JSON, OUTPUT_JSONL, tag = make_paths(fname)

        print("\n" + "=" * 80)
        print(f"[RUN] {tag}")
        print("INPUT :", INPUT_JSONL)
        print("ONTO  :", ONTOLOGY_JSON)
        print("OUTPUT:", OUTPUT_JSONL)

        out = run_pipeline_prompt1(
            input_jsonl_path=INPUT_JSONL,
            ontology_json_path=ONTOLOGY_JSON,
            output_jsonl_path=OUTPUT_JSONL,
            max_items=None,          # process all items
            max_new_tokens=900,      # fixed value
            temperature=0.25,
            verbose=False,
            k=6
        )

        print(f"[DONE] {tag}")

    except Exception as e:
        # Best-effort batch: a failing file must not stop the remaining ones, but the
        # exception type and traceback are needed to diagnose it afterwards.
        print(f"[ERROR] {fname}: {e}")
        traceback.print_exc()



[RUN] ont_14_writtenwork
INPUT : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_text/ont_14_writtenwork_test.jsonl
ONTO  : /upb/users/b/balram/profiles/unix/cs/promptKG/data/input/dbpedia/input_ontology/14_writtenwork_ontology.json
OUTPUT: /upb/users/b/balram/profiles/unix/cs/promptKG/data/output/prompt1/dbpedia/ont_14_writtenwork_output.jsonl
⏳ Loading model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


KeyboardInterrupt: 