# IWTC Graph Indexing (v0)

Builds graph artifacts from canonical index tables by translating index grammar into graph grammar.

Index tables express statements in tabular form.
Graph artifacts express the same statements as triples (subject–predicate–object).

Outputs:
- graph_nodes_v0.csv
- graph_edges_v0.csv

Artifacts are written to `working_drafts` for review before promotion to canonical indexes.

# Pre-Build: Validate environment and load canonical indexes

Validates repository paths and loads canonical index tables into DataFrames.

No graph artifacts are built here.

You may collapse this section after it runs successfully.

## Phase P0: Parameters

Define which world repository this notebook operates on and which index version it expects.

This notebook builds graph artifacts from existing canonical indexes.
It does not modify source material.

**IMPORTANT:** This notebook assumes index artifacts already exist and will fail
if required CSV files are missing.

In [None]:
# Phase 0: Parameters
# Everything a user is expected to edit lives in this cell.
LAST_PHASE_RUN = "P0"

# Where the world_repository.yml descriptor lives (absolute path).
WORLD_REPOSITORY_DESCRIPTOR = "/Users/charissophia/obsidian/Iron Wolf Trading Company/_meta/descriptors/world_repository.yml"

# Version tag of the index artifacts to load; it has to match the
# artifacts that were generated earlier.
INDEX_VERSION = "V0"

# Internal run metadata (do not edit): print a start timestamp, then drop
# the temporary names so they do not linger in the notebook namespace.
from datetime import datetime
run_started_at = datetime.now()
print(f"Notebook run initialized at: {run_started_at:%Y-%m-%d %H:%M}")
del datetime, run_started_at

## Phase P1: Load and validate world descriptor

This phase verifies that the world repository descriptor is readable and structurally valid.

- Load the descriptor file
- Resolve required paths
- Confirm referenced directories and files exist

If validation fails, the notebook stops with actionable error messages.

No data is read, written, or scanned until this succeeds.

In [None]:
# Phase P1: Load and validate world repository descriptor (Graph Indexing v0)
LAST_PHASE_RUN = "P1"

from pathlib import Path
import yaml

# `errors` collects fatal problems (raised in batches below);
# `warnings` collects non-fatal notes that are printed at the end of Phase P1.
errors = []
warnings = []

# --- Load descriptor file ---
descriptor_path = Path(WORLD_REPOSITORY_DESCRIPTOR)

if not descriptor_path.exists():
    raise FileNotFoundError(
        "World repository descriptor file was not found.\n"
        f"Path provided:\n  {descriptor_path}\n\n"
        "What to do:\n"
        "- Confirm the file exists at this location or fix WORLD_REPOSITORY_DESCRIPTOR in Phase 0\n"
        "- If you just edited Phase 0, rerun Phase 0 and then rerun this cell\n"
    )

try:
    with descriptor_path.open("r", encoding="utf-8") as f:
        world_repo = yaml.safe_load(f)
except Exception as exc:
    # Deliberately broad: any read/parse failure gets the same guidance.
    # The original exception is chained so the root cause stays visible.
    raise ValueError(
        "The world repository descriptor could not be read.\n"
        "This usually indicates a YAML formatting problem.\n\n"
        f"File:\n  {descriptor_path}\n\n"
        "What to do:\n"
        "- Compare the file against the example world_repository.yml\n"
        "- Paste the contents into https://www.yamllint.com/\n"
        "- Fix any reported issues, save the file, and rerun this cell"
    ) from exc

if not isinstance(world_repo, dict):
    raise ValueError(
        "World repository descriptor structure is not usable.\n"
        "The file must be a YAML mapping (top-level `name: value` entries).\n"
    )

print(f"World repository descriptor loaded successfully: {descriptor_path.name}")

# --- Extract required entries ---
WORLD_ROOT_RAW = world_repo.get("world_root")

drafts_block = world_repo.get("working_drafts")
DRAFTS_RAW = drafts_block.get("path") if isinstance(drafts_block, dict) else None

indexes_block = world_repo.get("indexes")
INDEXES_RAW = indexes_block.get("path") if isinstance(indexes_block, dict) else None

# The vocabulary block gets the same mapping guard as the blocks above, so a
# malformed entry (e.g. a bare string) is ignored instead of raising
# AttributeError on `.get`.
vocab = world_repo.get("vocabulary")
if not isinstance(vocab, dict):
    if vocab is not None:
        warnings.append("vocabulary: expected a mapping of vocabulary file paths. Ignoring.")
    vocab = {}
ENTITIES_RAW = vocab.get("entities")
ALIASES_RAW = vocab.get("aliases")
AUTHORS_RAW = vocab.get("author_aliases")
PC_MAP_RAW = vocab.get("player_character_map")

if not WORLD_ROOT_RAW:
    errors.append("Missing required entry: world_root")
if not DRAFTS_RAW:
    errors.append("Missing required entry: working_drafts.path")
if not INDEXES_RAW:
    errors.append("Missing required entry: indexes.path")

if errors:
    raise ValueError(
        "World repository descriptor is missing required entries:\n- "
        + "\n- ".join(errors)
        + "\n\nWhat to do:\n"
          "- Edit your world_repository.yml and add/fix the missing entries\n"
          "- Save the file and rerun this cell"
    )

# ------------------------------------------------------------------
# Published outputs (initialize up front for later phases)
# ------------------------------------------------------------------
WORLD_ROOT = None

WORKING_DRAFTS_PATH = None
WORKING_DRAFTS_RELPATH = None

INDEXES_PATH = None
INDEXES_RELPATH = None

VOCAB_ENTITIES_PATH = None
VOCAB_ENTITIES_RELPATH = None
VOCAB_ALIASES_PATH = None
VOCAB_ALIASES_RELPATH = None
VOCAB_AUTHORS_PATH = None
VOCAB_AUTHORS_RELPATH = None
VOCAB_PC_MAP_PATH = None
VOCAB_PC_MAP_RELPATH = None

# --- Validate and resolve world_root ---
# str() keeps a non-string YAML scalar from crashing Path() with a TypeError;
# it then fails the checks below with a readable message instead.
WORLD_ROOT = Path(str(WORLD_ROOT_RAW))

if str(WORLD_ROOT).startswith("~"):
    errors.append("world_root: '~' is not allowed. Use a full absolute path.")
elif not WORLD_ROOT.is_absolute():
    errors.append("world_root must be an absolute path (starts with / on macOS/Linux, or C:\\ on Windows).")
elif not WORLD_ROOT.is_dir():
    errors.append(f"world_root must be an existing directory: {WORLD_ROOT}")
else:
    WORLD_ROOT = WORLD_ROOT.resolve()

if errors:
    raise ValueError("Descriptor path validation failed:\n- " + "\n- ".join(errors))

def _resolve_under_world_root(raw_path: "str | None", label: str):
    """Resolve a descriptor path entry against WORLD_ROOT.

    Args:
        raw_path: Path text from the descriptor. May be None or blank.
        label: Descriptor key name, used as the prefix of error messages.

    Returns:
        A ``(path, rel)`` tuple. ``path`` is the resolved ``Path``; relative
        entries are joined onto the module-level ``WORLD_ROOT`` first.
        ``rel`` is ``path`` expressed relative to ``WORLD_ROOT`` as a string,
        or the full resolved path string when ``path`` lies outside it.
        Returns ``(None, None)`` when ``raw_path`` is None/blank, or when it
        starts with ``~`` (in which case a message is also appended to the
        module-level ``errors`` list).
    """
    text = "" if raw_path is None else str(raw_path).strip()
    if not text:
        return None, None

    # Build the path from the stripped text, matching the blank check above.
    p = Path(text)

    if str(p).startswith("~"):
        errors.append(f"{label}: '~' is not allowed: {raw_path}")
        return None, None

    if not p.is_absolute():
        p = WORLD_ROOT / p
    p = p.resolve()

    try:
        rel = str(p.relative_to(WORLD_ROOT))
    except ValueError:
        # relative_to() raises ValueError when p is not under WORLD_ROOT;
        # fall back to the absolute form for display.
        rel = str(p)

    return p, rel

# --- Resolve and validate working_drafts path (required, directory, writable) ---
WORKING_DRAFTS_PATH, WORKING_DRAFTS_RELPATH = _resolve_under_world_root(DRAFTS_RAW, "working_drafts.path")

# Pre-bind names that the final cleanup deletes, so the `del` cannot raise
# NameError on paths through this cell that never assign them.
probe = None
p = rel = None

if WORKING_DRAFTS_PATH is None:
    errors.append("working_drafts.path: missing or invalid.")
else:
    if not WORKING_DRAFTS_PATH.exists():
        errors.append(f"working_drafts.path: path does not exist: {WORKING_DRAFTS_PATH}")
    elif not WORKING_DRAFTS_PATH.is_dir():
        errors.append(f"working_drafts.path: must be a directory: {WORKING_DRAFTS_PATH}")
    else:
        # write probe (same behavior as raw source indexing)
        probe = WORKING_DRAFTS_PATH / ".iwtc_tools_write_probe.tmp"
        try:
            probe.write_text("test", encoding="utf-8")
        except OSError as e:
            errors.append(f"working_drafts.path: not writable: {WORKING_DRAFTS_PATH} ({type(e).__name__})")
        finally:
            # Best-effort removal: a leftover probe file is harmless, so a
            # failure to delete it must not abort validation.
            try:
                if probe.exists():
                    probe.unlink()
            except OSError:
                pass

# --- Resolve and validate indexes path (required, directory) ---
INDEXES_PATH, INDEXES_RELPATH = _resolve_under_world_root(INDEXES_RAW, "indexes.path")

if INDEXES_PATH is None:
    errors.append("indexes.path: missing or invalid.")
else:
    if not INDEXES_PATH.exists():
        errors.append(f"indexes.path: path does not exist: {INDEXES_PATH}")
    elif not INDEXES_PATH.is_dir():
        errors.append(f"indexes.path: must be a directory: {INDEXES_PATH}")

# --- Resolve vocabulary paths (optional; warn if missing) ---
# Each entry: (descriptor key, label used in messages, raw descriptor value).
vocab_entries = [
    ("entities", "vocab.entities", ENTITIES_RAW),
    ("aliases", "vocab.aliases", ALIASES_RAW),
    ("author_aliases", "vocab.author_aliases", AUTHORS_RAW),
    ("player_character_map", "vocab.player_character_map", PC_MAP_RAW),
]

for key, label, raw in vocab_entries:
    if not raw:
        continue

    p, rel = _resolve_under_world_root(raw, label)
    if p is None:
        continue

    if p.exists() and p.is_dir():
        warnings.append(f"{label}: {p} must be a file (got directory). Ignoring.")
        continue

    # A missing file is only a warning here; the path is still published so
    # later phases can decide whether the table is required.
    if not p.exists():
        warnings.append(f"{label}: file does not exist: {p} (name resolution may be limited).")

    if key == "entities":
        VOCAB_ENTITIES_PATH, VOCAB_ENTITIES_RELPATH = p, rel
    elif key == "aliases":
        VOCAB_ALIASES_PATH, VOCAB_ALIASES_RELPATH = p, rel
    elif key == "author_aliases":
        VOCAB_AUTHORS_PATH, VOCAB_AUTHORS_RELPATH = p, rel
    elif key == "player_character_map":
        VOCAB_PC_MAP_PATH, VOCAB_PC_MAP_RELPATH = p, rel

if errors:
    raise ValueError("Descriptor path validation failed:\n- " + "\n- ".join(errors))

print("Descriptor paths are usable for this notebook.")
print(f'world_root: {WORLD_ROOT}')
print(f'working_drafts: {WORKING_DRAFTS_RELPATH}')
print(f'indexes: {INDEXES_RELPATH}')

print(f"vocab.entities: {VOCAB_ENTITIES_RELPATH} (exists={VOCAB_ENTITIES_PATH.exists() if VOCAB_ENTITIES_PATH else False})")
print(f"vocab.aliases: {VOCAB_ALIASES_RELPATH} (exists={VOCAB_ALIASES_PATH.exists() if VOCAB_ALIASES_PATH else False})")
print(f"vocab.author_aliases: {VOCAB_AUTHORS_RELPATH} (exists={VOCAB_AUTHORS_PATH.exists() if VOCAB_AUTHORS_PATH else False})")
print(f"vocab.player_character_map: {VOCAB_PC_MAP_RELPATH} (exists={VOCAB_PC_MAP_PATH.exists() if VOCAB_PC_MAP_PATH else False})")

if warnings:
    print("\nWarnings:")
    # Joined output is line-for-line identical to printing each warning,
    # without leaving a loop variable behind in the notebook namespace.
    print("\n".join(f"- {w}" for w in warnings))

# cleanup
del yaml, Path
del descriptor_path, world_repo, drafts_block, indexes_block, vocab
del WORLD_REPOSITORY_DESCRIPTOR
del WORLD_ROOT_RAW, DRAFTS_RAW, INDEXES_RAW, ENTITIES_RAW, ALIASES_RAW, AUTHORS_RAW, PC_MAP_RAW
del vocab_entries, key, label, raw, p, rel, warnings, errors, probe, f
del _resolve_under_world_root

## Phase P2: Load index artifacts

This phase verifies that the required index CSV artifacts exist and can be loaded.

- Resolve expected filenames from `INDEX_VERSION`
- Confirm they exist under `indexes.path`
- Load them into DataFrames
- Verify required columns are present

If any artifact is missing or malformed, the notebook stops with instructions to regenerate them.

No files are modified in this phase.

In [None]:
# Phase 2: Load index artifacts (v0)
# Reads INDEX_VERSION (Phase P0) and INDEXES_PATH (Phase P1); publishes the
# four DF_* index tables. No files are modified.
LAST_PHASE_RUN = "P2"

import pandas as pd
from pathlib import Path

errors = []

# Normalize INDEX_VERSION into the on-disk suffix (your files use lowercase v0)
# Accepts "V0", "v0", "0" (if you ever use that), but publishes "v0"
INDEX_VERSION_SUFFIX = f"v{str(INDEX_VERSION).strip().lower().lstrip('v')}"

# Required artifact filenames (fixed contract for this notebook)
required = {
    "entity_to_chunks": f"index_entity_to_chunks_{INDEX_VERSION_SUFFIX}.csv",
    "chunk_to_entities": f"index_chunk_to_entities_{INDEX_VERSION_SUFFIX}.csv",
    "player_to_chunks": f"index_player_to_chunks_{INDEX_VERSION_SUFFIX}.csv",
    "source_files": f"index_source_files_{INDEX_VERSION_SUFFIX}.csv",
}

# Resolve paths and validate existence.
# is_file() (rather than exists()) also rejects a directory that happens to
# carry the expected name, which would otherwise fail later inside read_csv.
INDEX_FILES = {}
for key, fname in required.items():
    p = (INDEXES_PATH / fname).resolve()
    INDEX_FILES[key] = p
    if not p.is_file():
        errors.append(f"Missing required index artifact: {fname}\n  Expected at: {p}")

if errors:
    raise FileNotFoundError(
        "Phase 2 cannot proceed because required index artifacts are missing.\n\n"
        + "\n\n".join(errors)
        + "\n\nWhat to do:\n"
          "- Rerun IWTC_Raw_Source_Indexing.ipynb to generate the v0 artifacts\n"
          "- Ensure the resulting index_*.csv files are placed under your indexes.path directory\n"
          f"- indexes.path resolved to:\n  {INDEXES_PATH}\n"
          "- Then rerun Phase 2"
    )

# Load CSVs (raw). Unreadable files are collected and reported together with
# regeneration instructions instead of surfacing as a bare pandas traceback.
loaded = {}
for key, p in INDEX_FILES.items():
    try:
        loaded[key] = pd.read_csv(p)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        errors.append(f"{required[key]}: could not be parsed as CSV ({type(exc).__name__}: {exc})")

if errors:
    raise ValueError(
        "One or more index artifacts exist but could not be loaded.\n- "
        + "\n- ".join(errors)
        + "\n\nWhat to do:\n"
          "- Rerun IWTC_Raw_Source_Indexing.ipynb to regenerate the v0 artifacts\n"
          "- Do not edit the CSVs manually\n"
          "- Then rerun Phase 2"
    )

DF_ENTITY_TO_CHUNKS = loaded["entity_to_chunks"]
DF_CHUNK_TO_ENTITIES = loaded["chunk_to_entities"]
DF_PLAYER_TO_CHUNKS = loaded["player_to_chunks"]
DF_SOURCE_FILES = loaded["source_files"]

# Validate required columns (presence only)
expected_cols = {
    "DF_ENTITY_TO_CHUNKS": {"entity_id", "canonical", "chunk_ids", "chunk_count", "file_relpaths", "file_count"},
    "DF_CHUNK_TO_ENTITIES": {
        "chunk_id", "source_id", "source_type", "relpath",
        "chunk_start_line", "chunk_end_line",
        "entity_ids", "canonicals", "entity_count",
        "matched_vocabs", "match_kinds",
    },
    "DF_PLAYER_TO_CHUNKS": {"player_entity_id", "canonical", "chunk_ids", "chunk_count", "file_relpaths", "file_count"},
    "DF_SOURCE_FILES": {"source_id", "relpath", "source_type"},
}

# Explicit name -> frame mapping instead of a globals() lookup.
frames = {
    "DF_ENTITY_TO_CHUNKS": DF_ENTITY_TO_CHUNKS,
    "DF_CHUNK_TO_ENTITIES": DF_CHUNK_TO_ENTITIES,
    "DF_PLAYER_TO_CHUNKS": DF_PLAYER_TO_CHUNKS,
    "DF_SOURCE_FILES": DF_SOURCE_FILES,
}

for df_name, cols in expected_cols.items():
    df = frames[df_name]
    # sorted() keeps the error message stable; set iteration order is not.
    missing = sorted(cols.difference(df.columns))
    if missing:
        errors.append(f"{df_name}: missing expected columns: {missing}")

if errors:
    raise ValueError(
        "One or more index artifacts were loaded but do not match expected v0 columns.\n- "
        + "\n- ".join(errors)
        + "\n\nWhat to do:\n"
          "- Confirm you are using the v0 CSVs produced by IWTC_Raw_Source_Indexing.ipynb\n"
          "- Do not edit the CSVs manually\n"
          "- If you changed the producer notebook, re-run it to regenerate indexes and retry"
    )

# Summary prints
print("Phase 2 OK: index artifacts loaded.")
print(f"indexes.path: {INDEXES_PATH}")
print(f"index version: {INDEX_VERSION_SUFFIX}")

print("\nLoaded tables:")
print(f"- DF_ENTITY_TO_CHUNKS:   {len(DF_ENTITY_TO_CHUNKS):>8} rows, {len(DF_ENTITY_TO_CHUNKS.columns):>3} cols")
print(f"- DF_CHUNK_TO_ENTITIES:  {len(DF_CHUNK_TO_ENTITIES):>8} rows, {len(DF_CHUNK_TO_ENTITIES.columns):>3} cols")
print(f"- DF_PLAYER_TO_CHUNKS:   {len(DF_PLAYER_TO_CHUNKS):>8} rows, {len(DF_PLAYER_TO_CHUNKS.columns):>3} cols")
print(f"- DF_SOURCE_FILES:       {len(DF_SOURCE_FILES):>8} rows, {len(DF_SOURCE_FILES.columns):>3} cols")

# Optional: quick column display (helps debugging early)
print("\nDF_ENTITY_TO_CHUNKS columns:", list(DF_ENTITY_TO_CHUNKS.columns))
print("DF_CHUNK_TO_ENTITIES columns:", list(DF_CHUNK_TO_ENTITIES.columns))
print("DF_PLAYER_TO_CHUNKS columns:", list(DF_PLAYER_TO_CHUNKS.columns))
print("DF_SOURCE_FILES columns:", list(DF_SOURCE_FILES.columns))

# cleanup locals (the DF_* tables stay published)
del pd, Path, errors, required, key, fname, p, cols, df_name, df, missing
del expected_cols, INDEX_VERSION_SUFFIX, INDEX_FILES, loaded, frames

In [None]:
# Optional cleanup: the index CSVs are already loaded into DataFrames, so the
# absolute INDEXES_PATH is no longer needed and is removed from the namespace.
# INDEXES_RELPATH is kept because the write phase prints it in its
# "next steps" instructions.
# Note: Phase P2 reads INDEXES_PATH, so after this cell runs Phase P1 must be
# rerun before Phase P2 can be rerun.
del INDEXES_PATH

## Phase P3: Load vocabulary tables

This phase loads the vocabulary tables that enable human-readable
resolution and display during querying. The entities table is required;
the alias, author-alias, and player–character tables are optional.

The notebook:

- Loads `vocab_entities.csv`
- Loads `vocab_aliases.csv`
- Loads `vocab_author_aliases.csv`
- Loads `vocab_map_player_character.csv`
- Validates minimal required columns (presence only)
- Publishes vocabulary dataframes for use in resolution helpers

This phase does not modify index tables and does not merge data.
It only prepares lookup tables for name resolution and display.

In [None]:
# Phase 3: Load vocabulary tables (human-authored CSVs; entities required)
# Reads the VOCAB_*_PATH values published by Phase P1; publishes
# DF_VOCAB_ENTITIES, DF_VOCAB_ALIASES, DF_VOCAB_AUTHORS, DF_VOCAB_PC_MAP and
# the unified DF_VOCAB_LOOKUP. Index tables are not touched.
LAST_PHASE_RUN = "P3"

import pandas as pd
from pathlib import Path

errors = []
warnings = []

# ------------------------------------------------------------------
# Semantic column mappings
#   semantic column name -> accepted header spellings (first match wins)
# ------------------------------------------------------------------
ENTITY_COLS = {
    "entity_id": ["entity_id", "id"],
    "canonical": ["canonical", "canonical_name", "name"],
}
ALIAS_COLS = {
    "entity_id": ["entity_id", "id"],
    "alias": ["alias", "alt", "alternate"],
}
AUTHOR_ALIAS_COLS = {
    "author": ["author", "discord_name", "handle"],
    "player_entity_id": ["player_entity_id", "player", "player_id"],
    "ambig_char_id": ["ambig_char_id", "ambiguous_character", "ambig_character"],
}
PC_MAP_COLS = {
    "player_entity_id": ["player_entity_id", "player", "player_id"],
    "char_entity_id": ["char_entity_id", "character_entity_id", "character"],
}

# ------------------------------------------------------------------
# Use descriptor-validated vocab paths (from Phase 1)
#   (key, path, column mapping, required?)
# ------------------------------------------------------------------
vocab_files = [
    ("entities", VOCAB_ENTITIES_PATH, ENTITY_COLS, True),
    ("aliases", VOCAB_ALIASES_PATH, ALIAS_COLS, False),
    ("author_aliases", VOCAB_AUTHORS_PATH, AUTHOR_ALIAS_COLS, False),
    ("pc_map", VOCAB_PC_MAP_PATH, PC_MAP_COLS, False),
]


def _load_vocab_table(key, path, col_map):
    """Read one vocab CSV and return it with semantic column names only.

    All cells are read as strings and blanks become "". Columns are renamed
    via ``col_map``; unrecognized columns are dropped. When no expected
    column is present, an empty frame with the semantic columns is returned
    (and a warning is recorded if the file had rows).
    """
    raw_df = pd.read_csv(path, dtype=str).fillna("")

    rename = {}
    for semantic, options in col_map.items():
        found = next((c for c in options if c in raw_df.columns), None)
        if found:
            rename[found] = semantic

    if not rename:
        if len(raw_df) > 0:
            warnings.append(
                f"[{key}] CSV has rows but none of the expected columns were found.\n"
                f"  CSV columns: {list(raw_df.columns)}\n"
                f"  Expected mapping: {col_map}\n"
                f"  File: {path}"
            )
        return pd.DataFrame(columns=list(col_map))

    renamed = raw_df.rename(columns=rename)
    keep = [name for name in col_map if name in renamed.columns]
    return renamed[keep].copy()


# ------------------------------------------------------------------
# Load + normalize
# Defaults are empty frames with the semantic columns, so tables that are
# not configured or not found still have a predictable shape.
# ------------------------------------------------------------------
tables = {key: pd.DataFrame(columns=list(col_map)) for key, _path, col_map, _req in vocab_files}

for key, path_obj, col_map, required in vocab_files:

    if not path_obj:
        if required:
            errors.append(f"Missing required path for {key} in descriptor.")
        continue

    vocab_path = Path(path_obj)

    if not vocab_path.exists():
        if required:
            errors.append(f"Missing required vocabulary file:\n  {vocab_path}")
        else:
            warnings.append(f"Optional vocab file not found: {vocab_path}")
        continue

    tables[key] = _load_vocab_table(key, vocab_path, col_map)

# Published outputs
DF_VOCAB_ENTITIES = tables["entities"]
DF_VOCAB_ALIASES = tables["aliases"]
DF_VOCAB_AUTHORS = tables["author_aliases"]
DF_VOCAB_PC_MAP = tables["pc_map"]

# ------------------------------------------------------------------
# Hard validation: entities must be usable
# ------------------------------------------------------------------
if errors:
    raise FileNotFoundError(
        "Phase 3 cannot proceed.\n\n"
        + "\n\n".join(errors)
        + "\n\nFix the descriptor or vocabulary files, then rerun Phase 3."
    )

# Both semantic columns are needed downstream (node ids and labels), so a
# table carrying only one of them is rejected here rather than failing later
# with a KeyError.
missing_entity_cols = [c for c in ENTITY_COLS if c not in DF_VOCAB_ENTITIES.columns]
if DF_VOCAB_ENTITIES.empty or missing_entity_cols:
    raise ValueError(
        "Entities vocab file loaded but no usable rows were found.\n"
        "Ensure the CSV contains entity_id and canonical columns."
        + (f"\nMissing columns: {missing_entity_cols}" if missing_entity_cols else "")
    )

# ------------------------------------------------------------------
# Build DF_VOCAB_LOOKUP (unified vocab table for remapping)
# Columns:
#   - vocab_id: entity_id or player_entity_id
#   - vocab: canonical / alias / author handle
#   - vocab_kind: "entity" | "alias" | "author"
#   - vocab_norm: lowercase normalized vocab for matching
# ------------------------------------------------------------------
# (source frame, id column, text column, vocab_kind); order matters because
# drop_duplicates below keeps the first occurrence.
lookup_sources = [
    (DF_VOCAB_ENTITIES, "entity_id", "canonical", "entity"),
    (DF_VOCAB_ALIASES, "entity_id", "alias", "alias"),
    (DF_VOCAB_AUTHORS, "player_entity_id", "author", "author"),
]

lookup_parts = []
for src_df, id_col, text_col, kind in lookup_sources:
    # Optional tables may be empty or lack a column; they contribute nothing.
    if src_df.empty or id_col not in src_df.columns or text_col not in src_df.columns:
        continue
    part = pd.DataFrame(
        {
            "vocab_id": src_df[id_col].astype(str).str.strip(),
            "vocab": src_df[text_col].astype(str).str.strip(),
            "vocab_kind": kind,
        }
    )
    # Rows with a blank id or blank text carry no usable mapping.
    lookup_parts.append(part[(part["vocab_id"] != "") & (part["vocab"] != "")])

DF_VOCAB_LOOKUP = pd.concat(lookup_parts, ignore_index=True)
DF_VOCAB_LOOKUP["vocab_norm"] = DF_VOCAB_LOOKUP["vocab"].astype(str).str.strip().str.lower()
DF_VOCAB_LOOKUP = DF_VOCAB_LOOKUP.drop_duplicates(
    subset=["vocab_id", "vocab_norm", "vocab_kind"]
).reset_index(drop=True)

# ------------------------------------------------------------------
# Summary
# ------------------------------------------------------------------
print("Phase 3 OK: vocabulary tables loaded.")

print("\nLoaded vocab tables:")
print(f"- DF_VOCAB_ENTITIES: {len(DF_VOCAB_ENTITIES):>8} rows, {len(DF_VOCAB_ENTITIES.columns):>3} cols")
print(f"- DF_VOCAB_ALIASES:  {len(DF_VOCAB_ALIASES):>8} rows, {len(DF_VOCAB_ALIASES.columns):>3} cols")
print(f"- DF_VOCAB_AUTHORS:  {len(DF_VOCAB_AUTHORS):>8} rows, {len(DF_VOCAB_AUTHORS.columns):>3} cols")
print(f"- DF_VOCAB_PC_MAP:   {len(DF_VOCAB_PC_MAP):>8} rows, {len(DF_VOCAB_PC_MAP.columns):>3} cols")
print(f"- DF_VOCAB_LOOKUP:   {len(DF_VOCAB_LOOKUP):>8} rows, {len(DF_VOCAB_LOOKUP.columns):>3} cols")

if warnings:
    print("\nWarnings:")
    print("\n".join(f"- {w}" for w in warnings))

# cleanup (every name below is bound on all paths that reach this point:
# the entities table is required, so the loops above ran at least once fully)
del pd, Path
del errors, warnings, vocab_files, key, path_obj, col_map, required, vocab_path
del tables, missing_entity_cols
del lookup_sources, lookup_parts, src_df, id_col, text_col, kind, part
del _load_vocab_table
del ENTITY_COLS, ALIAS_COLS, AUTHOR_ALIAS_COLS, PC_MAP_COLS

In [None]:
# Optional cleanup: remove the VOCAB_* path variables published by Phase P1.
# Phase P3 has already read them to load the vocabulary tables.
# Note: Phase P3 reads these names, so after this cell runs Phase P1 must be
# rerun before Phase P3 can be rerun.
del VOCAB_ENTITIES_PATH, VOCAB_ENTITIES_RELPATH
del VOCAB_ALIASES_PATH, VOCAB_ALIASES_RELPATH
del VOCAB_AUTHORS_PATH, VOCAB_AUTHORS_RELPATH
del VOCAB_PC_MAP_PATH, VOCAB_PC_MAP_RELPATH

# Graph Bootstrap

This section translates index grammar into graph grammar and builds:

- graph_nodes_v0.csv  
- graph_edges_v0.csv  

Artifacts are written to `working_drafts` for review before promotion to `indexes.path`.

## Phase N: Build graph nodes

This phase defines **what things exist** in the graph.

Index tables describe things implicitly.
Here we make them explicit as graph nodes.

Nodes represent:
- entities (person, place, faction, etc.)
- vocab text forms
- chunks
- files

No relationships are created in this phase.
Only the vocabulary of “things” the graph can talk about.

In [None]:
# Build graph nodes (CSV-only; no networkx here)
# Inputs:  DF_VOCAB_ENTITIES, DF_CHUNK_TO_ENTITIES, DF_SOURCE_FILES, DF_VOCAB_LOOKUP
# Output:  DF_GRAPH_NODES with columns node_id, node_type, label
LAST_PHASE_RUN = "N"

import pandas as pd


def _node_frame(node_id, node_type, label):
    """Assemble one three-column node table (node_type may be a scalar)."""
    return pd.DataFrame({"node_id": node_id, "node_type": node_type, "label": label})


# -------------------------------------------------------------------
# One node table per source. Their order is significant: when two sources
# produce the same node_id, the de-duplication below keeps the earlier one.
# -------------------------------------------------------------------

# 1) Entity nodes (from vocab entities)
# node_type is the entity_id prefix before the first "_",
# e.g. "artifact_folly" -> "artifact"
entity_ids = DF_VOCAB_ENTITIES["entity_id"].astype(str)
entity_nodes = _node_frame(
    entity_ids,
    entity_ids.str.split("_", n=1).str[0],
    DF_VOCAB_ENTITIES["canonical"].astype(str),
)

# 2) Chunk nodes (from chunk index); the node id doubles as the label
chunk_names = "chunk_" + DF_CHUNK_TO_ENTITIES["chunk_id"].astype(int).astype(str)
chunk_nodes = _node_frame(chunk_names, "chunk", chunk_names)

# 3) File nodes (from source files)
# node_type is the source_type (pbp_transcripts, session_notes, etc.)
file_relpaths = DF_SOURCE_FILES["relpath"].astype(str)
file_nodes = _node_frame(
    "file:" + file_relpaths,
    DF_SOURCE_FILES["source_type"].astype(str),
    file_relpaths,
)

# 4) Vocab nodes (from consolidated vocab lookup)
# node_id is keyed by vocab_norm; label is the original, human-readable text
vocab_nodes = _node_frame(
    "vocab:" + DF_VOCAB_LOOKUP["vocab_norm"].astype(str),
    "vocab",
    DF_VOCAB_LOOKUP["vocab"].astype(str),
)

# 5) Finalize: stack, keep the first row per node_id, sort for stable output
all_nodes = pd.concat([entity_nodes, chunk_nodes, file_nodes, vocab_nodes], ignore_index=True)
all_nodes = all_nodes.drop_duplicates(subset=["node_id"])
all_nodes = all_nodes.sort_values(["node_type", "node_id"])
DF_GRAPH_NODES = all_nodes.reset_index(drop=True)

# -------------------------------------------------------------------
# Sanity check (compact but useful)
# -------------------------------------------------------------------
print("Graph nodes built.")
print(f"Total nodes: {len(DF_GRAPH_NODES)}")
print("\nCounts by node_type:")
display(DF_GRAPH_NODES["node_type"].value_counts().to_frame("count"))

print("\nSample nodes:")
sample_size = min(5, len(DF_GRAPH_NODES))
display(DF_GRAPH_NODES.sample(sample_size, random_state=7))

# cleanup locals (keep DF_GRAPH_NODES)
del _node_frame, entity_ids, chunk_names, file_relpaths
del entity_nodes, chunk_nodes, file_nodes, vocab_nodes, all_nodes, sample_size

## Phase E: Build graph edges

This phase defines **how things relate** in the graph.

Index tables express relationships implicitly.
Here we translate them into explicit graph grammar:

subject → predicate → object  
(+ weight where needed)

Each edge is a statement derived directly from the canonical indexes and vocabulary tables.
No interpretation or heuristics are introduced.

This phase produces a single edge table ready for graph queries.

In [None]:
# -------------------------------------------------------------------
# Phase: Graph edges (v0)
# Inputs:  DF_CHUNK_TO_ENTITIES, DF_VOCAB_LOOKUP, DF_VOCAB_PC_MAP
# Output:  DF_GRAPH_EDGES with columns subject, predicate, object, weight
# -------------------------------------------------------------------
LAST_PHASE_RUN = "E"

import pandas as pd
from itertools import combinations


def _pipe_tokens(cell):
    """Split a pipe-delimited cell into stripped, non-empty tokens.

    A missing cell (NaN / NA, as produced by read_csv for blank fields)
    yields an empty list instead of the literal token "nan".
    """
    if pd.isna(cell):
        return []
    return [tok.strip() for tok in str(cell).split("|") if tok.strip()]


def _build_edge_rows():
    """Return all edge statements as (subject, predicate, object, weight) tuples.

    Only co-occurrence edges carry a weight (one vote per chunk); every
    other edge uses pd.NA. Duplicates are merged later during aggregation.
    """
    rows = []

    # ---------------------------------------------------------------
    # Relationships sourced from DF_CHUNK_TO_ENTITIES
    # ---------------------------------------------------------------
    # File contains Chunk:
    #   relpath + chunk_id         => "file:<relpath> contains chunk_<id>"
    # Chunk mentions Vocab:
    #   chunk_id + matched_vocabs  => "chunk_<id> mentions vocab:<text>"
    # Entity co-occurs with Entity (within the same chunk):
    #   chunk_id + entity_ids      => "<entity_id_A> cooccurs_with <entity_id_B>"
    #   Alphabetical ordering (A < B) gives one undirected edge per pair.
    # ---------------------------------------------------------------
    chunk_cols = DF_CHUNK_TO_ENTITIES.loc[:, ["chunk_id", "relpath", "matched_vocabs", "entity_ids"]]
    for chunk_id, relpath, matched_vocabs, entity_ids in chunk_cols.itertuples(index=False, name=None):
        chunk_node = f"chunk_{int(chunk_id)}"

        rows.append((f"file:{relpath}", "contains", chunk_node, pd.NA))

        # NOTE(review): vocab node ids are built from the lowercased
        # vocab_norm; this assumes matched_vocabs already holds that
        # normalized form -- confirm against the producer notebook.
        for vocab_text in _pipe_tokens(matched_vocabs):
            rows.append((chunk_node, "mentions", f"vocab:{vocab_text}", pd.NA))

        # Each chunk casts exactly ONE vote (weight=1) per unordered pair.
        # Pairs are generated in this single place: producing them in a
        # second pass as well would double every aggregated weight. The set
        # removes repeated ids so no entity is paired with itself.
        for left, right in combinations(sorted(set(_pipe_tokens(entity_ids))), 2):
            rows.append((left, "cooccurs_with", right, 1))

    # ---------------------------------------------------------------
    # Vocab refers_to Entity/Player
    #   vocab_norm + vocab_id => "vocab:<vocab_norm> refers_to <vocab_id>"
    #   vocab_id is already the target node id (entity_id or player_entity_id)
    # ---------------------------------------------------------------
    for vocab_norm, vocab_id in zip(DF_VOCAB_LOOKUP["vocab_norm"], DF_VOCAB_LOOKUP["vocab_id"]):
        rows.append((f"vocab:{str(vocab_norm).strip()}", "refers_to", str(vocab_id).strip(), pd.NA))

    # ---------------------------------------------------------------
    # Player plays Character
    #   char_entity_id + player_entity_id => "<player_entity_id> plays <char_entity_id>"
    # The table is optional: skip it when either column is absent, and skip
    # rows where one side is blank (they would create edges to an empty id).
    # ---------------------------------------------------------------
    if {"player_entity_id", "char_entity_id"}.issubset(DF_VOCAB_PC_MAP.columns):
        for player_id, char_id in zip(
            DF_VOCAB_PC_MAP["player_entity_id"], DF_VOCAB_PC_MAP["char_entity_id"]
        ):
            player = "" if pd.isna(player_id) else str(player_id).strip()
            character = "" if pd.isna(char_id) else str(char_id).strip()
            if player and character:
                rows.append((player, "plays", character, pd.NA))

    return rows


rows = _build_edge_rows()

# -------------------------------------------------------------------
# Aggregate and build the dataframe
# Identical statements collapse into one edge; co-occurrence votes are
# summed, while edges without a weight stay missing (min_count=1).
# -------------------------------------------------------------------
DF_GRAPH_EDGES = (
    pd.DataFrame(rows, columns=["subject", "predicate", "object", "weight"])
      .groupby(["subject", "predicate", "object"], as_index=False)
      .agg(weight=("weight", lambda s: s.sum(min_count=1)))
      .sort_values(["predicate", "subject", "object"], ascending=[True, True, True])
      .reset_index(drop=True)
)

# -------------------------------------------------------------------
# Sanity check
# -------------------------------------------------------------------
print("Graph edges built (v0)")
print(f"Edges: {len(DF_GRAPH_EDGES)}")
print("Edge counts by predicate:\n")

display(
    DF_GRAPH_EDGES
        .groupby("predicate", as_index=False)
        .size()
        .rename(columns={"size": "edge_count"})
        .sort_values("edge_count", ascending=False)
        .reset_index(drop=True)
)

# Cap the sample size so small graphs (fewer than 3 edges) do not raise.
display(DF_GRAPH_EDGES.sample(min(3, len(DF_GRAPH_EDGES))))

# clean up locals (loop temporaries live inside the helpers, so this cannot
# fail when an input table is empty)
del rows, combinations, _pipe_tokens, _build_edge_rows

## Phase W: Write graph artifacts

This phase writes the generated graph tables to `working_drafts` for review.

After reviewing the CSV files:
- If satisfied, move or copy them into `indexes.path`
- Commit them to version control

This notebook does not modify canonical data.
It only generates reproducible artifacts.

In [None]:
# Phase: Write graph artifacts (v0) - simplified
# Output (written to WORKING_DRAFTS_PATH only):
#   - graph_nodes_v0.csv
#   - graph_edges_v0.csv
LAST_PHASE_RUN = "W1"

from pathlib import Path

# ------------------------------------------------------------------
# Validate inputs (presence only): each graph table must exist in the
# notebook namespace and contain at least one row.
# ------------------------------------------------------------------
for var_name, problem in (
    ("DF_GRAPH_NODES", "DF_GRAPH_NODES is missing or empty. Build nodes first."),
    ("DF_GRAPH_EDGES", "DF_GRAPH_EDGES is missing or empty. Build edges first."),
):
    frame = globals().get(var_name)
    if frame is None or frame.empty:
        raise ValueError(problem)

if not globals().get("WORKING_DRAFTS_PATH"):
    raise ValueError("WORKING_DRAFTS_PATH is missing. Rerun Phase P1.")

# ------------------------------------------------------------------
# Write artifacts to WORKING_DRAFTS_PATH (never to canonical indexes)
# ------------------------------------------------------------------
out_dir = Path(WORKING_DRAFTS_PATH)
out_dir.mkdir(parents=True, exist_ok=True)
# Prefer the world-root-relative form for display; fall back to the full path.
out_rel = globals().get("WORKING_DRAFTS_RELPATH", str(out_dir)).rstrip("/")

artifacts = (
    ("graph_nodes_v0.csv", DF_GRAPH_NODES),
    ("graph_edges_v0.csv", DF_GRAPH_EDGES),
)

for fname, frame in artifacts:
    frame.to_csv(out_dir / fname, index=False, encoding="utf-8")

print("Graph artifacts written:")
for fname, frame in artifacts:
    print(f" - {out_rel}/{fname}")

# Short preview of each table, nodes first.
for fname, frame in artifacts:
    display(frame.head(5))

print("\nNext steps:")
print("1) Review the generated CSV files in:")
print(f"   {out_rel}")

print("\n2) If satisfied, move (or copy) these files into your canonical indexes directory:")
print(f"   {INDEXES_RELPATH}")

print("\n3) Commit the moved graph CSVs (and any descriptor updates) to version control if desired.")

print("\nThis notebook does NOT modify canonical data. All artifacts were written to working_drafts.")

# cleanup locals
del Path, out_dir, out_rel
del var_name, problem, frame, artifacts, fname