In [1]:
import os
from pathlib import Path
import json

import pandas as pd

os.chdir("..")
print("CWD:", os.getcwd())

BASE_DIR = Path(".").resolve()
DATA_DIR = BASE_DIR / "data"
INDEX_PATH = DATA_DIR / "dataset_index.csv"

NUMERIC_DIR = BASE_DIR / "profiles" / "numeric"
CAT_DIR = BASE_DIR / "profiles" / "categorical"

META_DIR = BASE_DIR / "metadata" / "final_profiles"
META_DIR.mkdir(parents=True, exist_ok=True)

index_df = pd.read_csv(INDEX_PATH)
index_records = index_df.to_dict(orient="records")
len(index_records)


CWD: /home/jovyan


40

In [2]:
for row in index_records:
    dataset_id = row["dataset_id"]

    numeric_path = NUMERIC_DIR / f"{dataset_id}_numeric.json"
    cat_path = CAT_DIR / f"{dataset_id}_categorical.json"

    if numeric_path.exists():
        with numeric_path.open() as f:
            numeric_data = json.load(f)
        numeric_profile = numeric_data.get("numeric_profile", {})
        num_rows = numeric_data.get("num_rows", row.get("num_rows"))
        num_cols = numeric_data.get("num_cols", row.get("num_cols"))
    else:
        numeric_profile = {}
        num_rows = row.get("num_rows")
        num_cols = row.get("num_cols")

    if cat_path.exists():
        with cat_path.open() as f:
            cat_data = json.load(f)
        categorical_profile = cat_data.get("categorical_profile", {})
        feature_groups = cat_data.get("feature_groups", {})
    else:
        categorical_profile = {}
        feature_groups = {}

    name = row.get("name")
    if not isinstance(name, str) or not name.strip():
        name = dataset_id

    final_metadata = {
        "dataset_id": dataset_id,
        "name": name,
        "category": row.get("category"),
        "nyc_url": row.get("nyc_url"),
        "schema": {
            "num_rows": num_rows,
            "num_cols": num_cols,
        },
        "numeric_profile": numeric_profile,
        "categorical_profile": categorical_profile,
        "feature_groups": feature_groups,
    }

    out_path = META_DIR / f"{dataset_id}.json"
    with out_path.open("w") as f:
        json.dump(final_metadata, f, indent=2)


In [3]:
def build_deep_profile_prompt(metadata_dict):
    name = metadata_dict.get("name")
    category = metadata_dict.get("category")
    schema = metadata_dict.get("schema") or {}
    num_rows = schema.get("num_rows")
    num_cols = schema.get("num_cols")

    numeric_profile = metadata_dict.get("numeric_profile") or {}
    categorical_profile = metadata_dict.get("categorical_profile") or {}
    feature_groups = metadata_dict.get("feature_groups") or {}

    parts = []
    parts.append(f"Dataset name: {name}")
    parts.append(f"Category: {category}")
    parts.append(f"Approximate schema: rows={num_rows}, columns={num_cols}")
    parts.append("")

    if numeric_profile:
        parts.append("Numeric fields summary:")
        for col, prof in numeric_profile.items():
            vals = []
            for key in ("count", "mean", "stddev", "min", "max"):
                v = prof.get(key)
                if v is not None:
                    vals.append(f"{key}={v}")
            parts.append(f"- {col}: " + ", ".join(vals))
        parts.append("")

    if categorical_profile:
        parts.append("Categorical fields (top categories):")
        for col, prof in categorical_profile.items():
            top_vals = prof.get("top_values") or []
            top_slice = top_vals[:5]
            pretty = []
            for t in top_slice:
                v = t.get("value")
                p = t.get("percent")
                if p is not None:
                    pretty.append(f"{v} ({p:.1%})")
                else:
                    pretty.append(str(v))
            parts.append(f"- {col}: " + ", ".join(pretty))
        parts.append("")

    if feature_groups:
        parts.append("Feature groups:")
        for prefix, cols in feature_groups.items():
            if len(cols) > 1:
                parts.append(f"- {prefix}: {', '.join(cols)}")
        parts.append("")

    parts.append(
        "Using the structured summary above, write a concise but information-rich natural language "
        "description of this dataset, explaining what it contains, how it can be used, and any notable "
        "patterns, caveats, or biases."
    )

    return "\n".join(parts)



In [4]:
meta_files = list(META_DIR.glob("*.json"))
len(meta_files), meta_files[:3]


(40,
 [PosixPath('/home/jovyan/metadata/final_profiles/n3p6-zve2.json'),
  PosixPath('/home/jovyan/metadata/final_profiles/ebb7-mvp5.json'),
  PosixPath('/home/jovyan/metadata/final_profiles/3khw-qi8f.json')])

In [5]:
if meta_files:
    with meta_files[0].open() as f:
        sample_meta = json.load(f)
    prompt_example = build_deep_profile_prompt(sample_meta)
    print(prompt_example[:2000])


Dataset name: n3p6-zve2
Category: Education
Approximate schema: rows=1305, columns=64

Numeric fields summary:
- grade_span_min: count=863, mean=1254005922.829664, stddev=1490729695.4971747, min=6, max=5066130001
- expgrade_span_min: count=31, mean=8.516129032258064, stddev=1.1216347516589493, min=6, max=9
- expgrade_span_max: count=33, mean=12.363636363636363, stddev=0.7833494518006401, min=12, max=14
- postcode: count=435, mean=10726.096551724138, stddev=537.9279069286126, min=10001, max=11694
- total_students: count=426, mean=703.8427230046948, stddev=775.8704356094535, min=50, max=5458
- number_programs: count=435, mean=1.7080459770114942, stddev=1.5642328835599504, min=1, max=10

Using the structured summary above, write a concise but information-rich natural language description of this dataset, explaining what it contains, how it can be used, and any notable patterns, caveats, or biases.
