# Example 1


Agent 1
Desc: Access the data and identify the groups
tool(s):


Upload the data to the Postgres table with this command:

```
# Bulk-load the CSV into public.nodes using psql's client-side \copy.
# Each continuation backslash must be the very last character on its line;
# trailing spaces after it would end the command early.
psql \
  --host localhost \
  --port 5433 \
  --username devuser \
  --dbname devdb \
  --command "\
\\copy public.nodes(id, type, tags, lat, lon) \
  FROM '<full-path-to-csv>' \
  WITH (FORMAT csv, HEADER true)"

```

In [90]:
from dotenv import load_dotenv
load_dotenv()  # this reads .env and injects into os.environ


True

In [None]:
#!/usr/bin/env python3
# classification_agent.py

from dotenv import load_dotenv
import os
import pandas as pd
from sqlalchemy import create_engine

from langchain.tools import Tool
from langchain.chat_models import init_chat_model
from langgraph.prebuilt import create_react_agent

# 1. Load environment (for OPENAI_API_KEY, optional DB_URL override)
load_dotenv()

# 2. Configuration
# DB_URL is the SQLAlchemy connection string; the DB_URL environment variable,
# when set, takes precedence over the local development default below.
DB_URL    = os.getenv(
    "DB_URL",
    "postgresql+psycopg2://devuser:devpassword@localhost:5433/devdb"
)
# MODEL_STR is the model identifier handed to init_chat_model; it can be
# overridden through the OPENAI_MODEL environment variable.
MODEL_STR = os.getenv("OPENAI_MODEL", "openai:gpt-4o")


# 3. Global storage variable for detected columns
# Rebound by detect_classification_columns() every time it runs.
classification_columns: list[str] = []

# 4. Detection logic: read table, return column names

def detect_classification_columns(table_name: str) -> list[str]:
    """
    Return the columns of `table_name` that look usable for classification.

    The table is loaded into a DataFrame and every column is kept unless it is
    an identifier (named 'id' or ending in '_id', case-insensitive) or has a
    numeric dtype.

    Args:
        table_name: name of the table to read through `DB_URL`.

    Returns:
        The retained column names, in the order they appear in the table.

    Side effects:
        Rebinds the module-level `classification_columns` to the returned list.
    """
    global classification_columns
    engine = create_engine(DB_URL)
    try:
        df = pd.read_sql_table(table_name, engine)
    finally:
        # A fresh engine is built on every call; dispose of it so its pooled
        # connections do not linger after the read.
        engine.dispose()

    def _is_identifier(name: str) -> bool:
        lowered = name.lower()
        return lowered == "id" or lowered.endswith("_id")

    cols = [
        col
        for col in df.columns
        if not _is_identifier(col) and not pd.api.types.is_numeric_dtype(df[col])
    ]
    classification_columns = cols
    return cols

# 5. Wrap detection as a LangChain Tool
# The tool exposes detect_classification_columns under the same name; the
# description is the text the model sees when deciding whether to call it.
detect_tool = Tool(
    name="detect_classification_columns",
    func=detect_classification_columns,
    description=(
        "Given a Postgres table name, return non-ID, non-numeric columns "
        "for classification and store them in `classification_columns`."
    ),
)

# 6. Initialize LLM for tool-binding
# temperature=0 is passed through to the chat model created from MODEL_STR.
tt_model = init_chat_model(MODEL_STR, temperature=0)

# 7. Create the React agent
# The agent has exactly one tool (detect_tool) and a fixed instruction prompt.
agent = create_react_agent(
    model=tt_model,
    tools=[detect_tool],
    prompt=(
        "You are an agent that receives a SQL table name, detects which columns "
        "are useful for classification, and stores them in the global variable."
    ),
)



  chat = ChatOpenAI(model=MODEL_STR, temperature=0)


In [94]:
from langchain_google_genai import ChatGoogleGenerativeAI


# ChatOpenAI is not imported by any earlier cell in file order; import it here
# so this cell also works when the file is executed top to bottom.
from langchain.chat_models import ChatOpenAI

# Two chat models, both with temperature 0: an OpenAI one (`chat`) and a
# Gemini one (`gemini`).
chat = ChatOpenAI(model='gpt-4o', temperature=0)
gemini = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.0
)


In [5]:
# Table whose columns the agent should inspect.
table_name = "nodes"
print(f"\n▶ Detecting classification columns for table: {table_name}\n")

# Agent invocation will call our detect_tool
# The table name is sent as the sole user message.
response = agent.invoke({"messages": [{"role": "user", "content": table_name}]})
print("Agent response:\n", response, "\n")

# classification_columns global now holds the detected columns
print("Detected classification columns:", classification_columns)
 


▶ Detecting classification columns for table: nodes

Agent response:
 {'messages': [HumanMessage(content='nodes', additional_kwargs={}, response_metadata={}, id='c35281fb-e172-4447-8b15-3d5874c96e14'), AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_CTxypKWkmztGEeX6uDFjgDdG', 'function': {'arguments': '{"__arg1":"nodes"}', 'name': 'detect_classification_columns'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 99, 'total_tokens': 118, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_90122d973c', 'id': 'chatcmpl-BVbZ64Gnz9BfLi7XeDE88Cppzl3KT', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--b5161436-2c66-45ab-9c22-788c9a68dcb2-0', tool_calls=[{'

In [29]:
classification_columns

['type', 'tags']

In [97]:
from typing import List
import json

import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

def get_semantic_group_names(
    df: pd.DataFrame,
    columns: List[str]
) -> List[str]:
    """
    Ask the LLM for the distinct semantic groups present in `df[columns]`.

    All rows of the selected columns are serialized to JSON and sent in one
    prompt. A StructuredOutputParser constrains the reply to a JSON object
    with a single 'group_names' key. No per-record labels are produced.

    Args:
        df: records to analyze.
        columns: names of the columns to include in the prompt.

    Returns:
        The group names reported by the model, de-duplicated with their order
        preserved. An empty list is returned without calling the model when
        there are no columns or no rows to analyze.
    """
    # Nothing to analyze: skip the (billable) LLM round-trip entirely.
    if not columns or len(df) == 0:
        return []

    # 1) Serialize just the fields you care about
    records = df[columns].to_dict(orient="records")
    records_json = json.dumps(records, indent=2)

    # 2) Prepare a single-schema parser
    response_schemas = [
        ResponseSchema(
            name="group_names",
            description=(
                "A JSON array of the distinct semantic group names that "
                "cover all input records (no duplicates)."
            ),
            # The default type is "string"; state the real shape so the
            # generated format instructions agree with the description.
            type="List[string]",
        )
    ]
    parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = parser.get_format_instructions()

    # 3) Build the prompt with strict format instructions
    prompt = f"""
You are given a JSON array of records, each with only these fields: {columns}

{records_json}

Your tasks:
1. Analyze all the records to find the underlying semantic "groups" they form.
2. Produce only a single JSON object matching this schema:

{format_instructions}

Make sure:
- You list each group name exactly once.
- You do NOT output per-record labels, only the distinct set of group names.
- You include no extra text or commentary.
"""
    # This builds a single pipeline: LLM → parser
    chain = gemini | parser

    output = chain.invoke(prompt)
    # output is now a dict: {"group_names": [...]}
    names = output["group_names"]
    if isinstance(names, str):
        # Tolerate a model that answers with a single bare string.
        names = [names]
    # Enforce the "no duplicates" requirement locally, keeping first-seen order.
    return list(dict.fromkeys(names))


In [98]:
def get_records(sample_size=1000, table_name="nodes"):
    """
    Fetch up to `sample_size` rows of `table_name` as a DataFrame.

    Args:
        sample_size: maximum number of rows to return; must convert to a
            non-negative integer.
        table_name: table to read, optionally schema-qualified
            (`schema.table`). Each part must be a plain or double-quoted
            SQL identifier.

    Returns:
        A DataFrame with the result of `SELECT * ... LIMIT sample_size`.

    Raises:
        ValueError: if `sample_size` is not a non-negative integer or
            `table_name` is not a valid identifier. Both values are
            interpolated into the SQL text, so they are validated first.
    """
    import re  # local import keeps this cell self-contained

    limit = int(sample_size)
    if limit < 0:
        raise ValueError(f"sample_size must be non-negative, got {sample_size!r}")

    identifier = r'(?:[A-Za-z_][A-Za-z0-9_$]*|"[^"]+")'
    if not isinstance(table_name, str) or not re.fullmatch(
        rf"{identifier}(?:\.{identifier})?", table_name
    ):
        raise ValueError(f"invalid table name: {table_name!r}")

    engine = create_engine(DB_URL)
    try:
        q = f"SELECT * FROM {table_name} LIMIT {limit}"
        return pd.read_sql(q, engine)
    finally:
        # The engine is created per call; release its pooled connections.
        engine.dispose()

# Pull a sample of rows and ask the LLM for the distinct group names found in
# the previously detected classification columns.
sample_size = 1000
df = get_records(sample_size=sample_size)
groups = get_semantic_group_names(df, classification_columns)
print(groups)

['motorway_junction', 'crossing_with_traffic_signals', 'speed_camera', 'junction_name', 'PGS_sourced', 'traffic_signals', 'crossing_marked', 'barrier', 'other']


## Different version


In [95]:
from typing import List, Tuple
import json

import pandas as pd
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

def get_id_category_pairs(
    df: pd.DataFrame,
    id_col: str,
    group_cols: List[str],
) -> List[Tuple[str, str]]:
    """
    Return one (id, category) pair per record in `df`.

    The id column and the grouping columns are serialized to JSON and sent to
    the LLM in a single prompt; the structured reply is converted to tuples.

    Args:
      df: the full DataFrame
      id_col: name of the column holding each record's unique identifier
      group_cols: list of columns to use when grouping/classifying

    Returns:
      A list of (id, category) tuples in the order the model returned them.
      The id value is passed through as the model emitted it. An empty list
      is returned without calling the model when `df` has no rows.

    Raises:
      KeyError: if an entry returned by the model lacks `id_col` or
        'category'.
    """
    # No rows: skip the LLM call.
    if len(df) == 0:
        return []

    # 1) Build records JSON including the ID
    records = df[[id_col] + group_cols].to_dict(orient="records")
    records_json = json.dumps(records, indent=2)

    # 2) Define the response schema: array of {id: ..., category: ...}
    response_schemas = [
        ResponseSchema(
            name="categorized_records",
            description=(
                "A JSON array of objects, each containing the original "
                f"'{id_col}' and a new 'category' string assigned by the model."
            ),
            # The default type is "string"; declare the real shape.
            type="List[object]",
        )
    ]
    parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = parser.get_format_instructions()

    # 3) Build the prompt with strict instructions
    #    (the example's ```json fence is closed so the trailing instruction
    #    is not read as part of the code sample)
    prompt = f"""
You are given a JSON array of records, each with:
  • \"{id_col}\": unique identifier  
  • additional fields: {group_cols}

Here are the records:
{records_json}

Your task:
1. Analyze each record and assign it a concise semantic \"category\".
2. Return **only** a single JSON object matching this schema:

{format_instructions}

Example of required output format:
```json
{{
  "categorized_records": [
    {{ "{id_col}": "123", "category": "Speed Camera" }},
    {{ "{id_col}": "124", "category": "Toll Booth" }},
    …
  ]
}}
```
No explanations, no extra fields—only the JSON object above.
"""
    # 4) Call the LLM → parser pipeline
    chain = gemini | parser
    output = chain.invoke(prompt)
    recs = output["categorized_records"]

    # 5) Convert the list of objects into the documented (id, category) pairs
    #    so callers can unpack them directly.
    return [(rec[id_col], rec["category"]) for rec in recs]

In [101]:
# pairs = get_id_category_pairs(df, id_col="id", group_cols=classification_columns)
# for record_id, category in pairs:
#     print(f"{record_id} → {category}")

In [121]:
from sklearn.cluster import KMeans
from langchain.chat_models import ChatOpenAI
import numpy as np
from typing import List
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
from sklearn.preprocessing import normalize

def cluster_and_map(
    df: pd.DataFrame,
    columns: List[str],
    categories: List[str],
    n_clusters: int
) -> List[str]:
    """
    Assign one of `categories` to every row of `df`.

    Steps:
     • serialize the selected columns of each row to text and embed it
     • KMeans-cluster the embeddings
     • ask the LLM to map each cluster (via up to 5 sample rows) to a category

    Args:
        df: records to label.
        columns: columns whose values describe each record.
        categories: the allowed category names.
        n_clusters: requested number of clusters; capped at the number of
            rows because KMeans cannot form more clusters than samples.

    Returns:
        A list of category names with len == len(df). A cluster whose LLM
        answer is not one of `categories` is labelled "unknown". An empty
        list is returned for an empty `df`.
    """
    import json  # local imports keep this cell self-contained
    from collections import defaultdict

    if len(df) == 0:
        return []

    # 1. Prepare texts + embeddings
    def serialize(v):
        if isinstance(v, str):
            try:
                obj = json.loads(v)
            except ValueError:  # not JSON: keep the raw text
                return str(v)
        else:
            obj = v
        if isinstance(obj, dict):
            # Sorted keys give identical dicts identical text.
            return "{" + ", ".join(f"{k}={json.dumps(obj[k],sort_keys=True)}"
                                   for k in sorted(obj)) + "}"
        return str(obj)

    # Row-wise join replaces the deprecated DataFrame.applymap.
    texts = [
        " | ".join(serialize(v) for v in row)
        for row in df[columns].itertuples(index=False, name=None)
    ]
    embedder  = OpenAIEmbeddings()
    embeddings = embedder.embed_documents(texts)

    # 2. Cluster
    km = KMeans(n_clusters=min(n_clusters, len(texts)), random_state=0)
    clusters = km.fit_predict(embeddings)

    # Collect up to 5 sample texts per cluster in a single pass.
    samples_by_cluster = defaultdict(list)
    for text, cid in zip(texts, clusters):
        bucket = samples_by_cluster[cid]
        if len(bucket) < 5:
            bucket.append(text)

    # 3. Map clusters → categories via LLM
    # Only clusters that actually received rows are sent to the model.
    llm = gemini
    cluster_to_cat = {}
    for cid in sorted(samples_by_cluster):
        samples = samples_by_cluster[cid]
        prompt = f"""
You have these candidate categories: {categories}

Here are sample records for group {cid}:
{chr(10).join(samples)}

Which one of the above categories best describes *all* of these?
Answer with exactly the category name.
"""
        # NOTE(review): `predict` is reportedly deprecated in newer LangChain
        # releases in favour of `invoke` — confirm before upgrading.
        cat = llm.predict(prompt).strip().strip("'\"`").strip()
        # fallback if the LLM picks something unknown
        if cat not in categories:
            cat = "unknown"
        cluster_to_cat[cid] = cat

    # 4. Assign back to each record
    return [cluster_to_cat[c] for c in clusters]


In [122]:
# Cluster the sampled rows and map each cluster to one of the previously
# discovered group names; one cluster is requested per group.
assigned_labels = cluster_and_map(
    df,
    columns=classification_columns,
    categories=groups,
    n_clusters=len(groups)
)

# Attach back to your DataFrame
df["assigned_category"] = assigned_labels

# Inspect
# Row counts per assigned category, then the first rows of the labelled frame.
print(df[["assigned_category"]].value_counts())
print(df.head())

  texts = df[columns].applymap(serialize).agg(" | ".join, axis=1).tolist()


assigned_category            
other                            784
PGS_sourced                      166
traffic_signals                   18
crossing_with_traffic_signals     15
motorway_junction                 11
barrier                            4
junction_name                      2
Name: count, dtype: int64
       id  type tags        lat       lon assigned_category
0  123379  node   {}  51.200308  4.377739             other
1  123380  node   {}  51.199611  4.380116             other
2  123381  node   {}  51.199706  4.381602             other
3  123382  node   {}  51.199627  4.383552             other
4  123383  node   {}  51.199074  4.384953             other


In [126]:

df.loc[df["assigned_category"] == 'motorway_junction']

Unnamed: 0,id,type,tags,lat,lon,assigned_category
80,636413,node,"{'name': 'Antwerpen-Oost', 'highway': 'motorwa...",51.216691,4.453695,motorway_junction
84,636419,node,"{'name': 'Antwerpen-Oost', 'highway': 'motorwa...",51.213587,4.448103,motorway_junction
133,21280953,node,"{'ref': '6', 'name': 'Wilrijk', 'highway': 'mo...",51.167431,4.413595,motorway_junction
137,21281161,node,"{'name': 'Antwerpen-Zuid', 'highway': 'motorwa...",51.18826,4.415757,motorway_junction
701,25924005,node,"{'ref': '5a', 'name': 'Antwerpen-Centrum', 'hi...",51.201224,4.375957,motorway_junction
732,25924349,node,"{'ref': '5a', 'name': 'Antwerpen-Centrum', 'hi...",51.197136,4.389047,motorway_junction
733,25924420,node,"{'name': 'Antwerpen-Zuid', 'highway': 'motorwa...",51.191967,4.403142,motorway_junction
750,25924523,node,"{'ref': '5', 'name': 'Le Grellelaan', 'highway...",51.190114,4.407187,motorway_junction
765,25924658,node,"{'ref': '4', 'name': 'Berchem', 'highway': 'mo...",51.193246,4.429339,motorway_junction
775,25924707,node,"{'ref': '3', 'name': 'Borgerhout', 'highway': ...",51.204976,4.440158,motorway_junction


In [None]:
import json, pandas as pd
from typing import List, Dict, Any
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

# --------------------------- helpers ---------------------------------
def _looks_like_json(val) -> bool:
    """True if val is a dict or a JSON‑parsable string."""
    if isinstance(val, dict):
        return True
    if isinstance(val, str):
        try:
            obj = json.loads(val)
            return isinstance(obj, dict)
        except Exception:
            return False
    return False

def _auto_json_columns(df: pd.DataFrame, sample: int = 50) -> List[str]:
    """
    List the columns of `df` that mostly hold dicts or JSON-object strings.

    For each column the first `sample` non-null values are tested with
    `_looks_like_json`; the column qualifies when more than half of them pass.
    Columns with no non-null values never qualify.
    """
    def _mostly_json(name) -> bool:
        probe = df[name].dropna().head(sample)
        if probe.empty:
            return False
        # Mean of a boolean series == share of values that look like JSON.
        return probe.map(_looks_like_json).mean() > 0.5

    return [name for name in df.columns if _mostly_json(name)]

def _flatten_json_series(s: pd.Series, prefix: str, sep="_") -> pd.DataFrame:
    """Recursively flatten a JSON/dict series into scalar columns."""
    def flat(v, px=""):
        if isinstance(v, str):
            try: v = json.loads(v)
            except: return {px[:-1]: v}
        if isinstance(v, dict):
            out = {}
            for k, val in v.items():
                out.update(flat(val, f"{px}{k}{sep}"))
            return out
        return {px[:-1]: v}
    return pd.json_normalize(s.map(lambda x: flat(x, f"{prefix}{sep}")))

# ------------------- summarizer with auto‑detection -------------------
def summarize_category_with_llm(
    df: pd.DataFrame,
    category: str,
    llm,
    id_col: str                = "id",
    auto_sample: int           = 50         # rows per column to test for JSON
) -> Dict[str, Any]:
    """
    Summarize the rows of one assigned category, with an LLM narrative.

    1) Filters df to `category` (via the 'assigned_category' column)
    2) Auto-detects JSON-like columns and flattens them
    3) Computes per-column stats
    4) Returns stats + LLM narrative

    Args:
        df: frame containing an 'assigned_category' column.
        category: category value to summarize.
        llm: object exposing `predict(prompt) -> str`.
        id_col: accepted for interface compatibility; not used by the body.
        auto_sample: number of non-null values per column inspected when
            detecting JSON-like columns.

    Returns:
        {"summary": <stats dict>, "narrative": <LLM text>}. All values in the
        stats dict are plain Python types so it can be JSON-serialized. When
        no row matches `category`, the summary is empty and the narrative
        says so; the LLM is not called.
    """
    sub = df[df["assigned_category"] == category].copy()
    if sub.empty:
        return {"summary": {}, "narrative": f"No records for '{category}'."}

    def _native(value):
        # numpy scalars (e.g. int64) are not JSON serializable; unwrap them.
        return value.item() if hasattr(value, "item") else value

    def _countable(value):
        # value_counts needs hashable, JSON-friendly keys (lists are neither).
        return value if isinstance(value, (str, int, float, bool)) else str(value)

    # 1. auto-detect and flatten all JSON-ish columns
    json_cols = _auto_json_columns(sub, sample=auto_sample)
    flat_parts = [sub]
    for jc in json_cols:
        flat = _flatten_json_series(sub[jc], jc)
        # One flattened row per source row, in order: pin the labels to the
        # filtered frame so the column-wise concat below cannot misalign.
        flat.index = sub.index
        flat_parts.append(flat)
    wide = pd.concat(flat_parts, axis=1).drop(columns=json_cols)

    # 2. quick type-aware stats
    summary = {"category": category, "n_rows": len(wide), "columns": {}}
    for col in wide.columns:
        if col in ("assigned_category", "geometry"):
            continue
        ser = wide[col].dropna()
        if ser.empty:
            summary["columns"][col] = {"all_null": True}
            continue
        if ser.dtype.kind in "iuf":  # signed, unsigned and float dtypes
            summary["columns"][col] = {
                "type": "numeric",
                "min": _native(ser.min()),
                "max": _native(ser.max()),
                "mean": _native(ser.mean()),
                "p50": _native(ser.quantile(.5)),
                "p95": _native(ser.quantile(.95)),
            }
        elif ser.dtype == bool or ser.isin([0, 1]).all():
            summary["columns"][col] = {
                "type": "boolean",
                "pct_true": float(ser.mean()),
            }
        else:
            keyed = ser.map(_countable)
            top = keyed.value_counts().head(5)
            summary["columns"][col] = {
                "type": "categorical",
                "distinct": int(keyed.nunique()),
                "top_values": {key: int(count) for key, count in top.items()},
            }

    # 3. Ask LLM for narrative
    prompt = f"""
You are a data analyst. Summarize these statistics for the category "{category}"
in 3‑4 sentences, highlighting notable patterns.

Stats JSON:
{json.dumps(summary, indent=2)}
"""
    # NOTE(review): `predict` is reportedly deprecated in newer LangChain
    # releases in favour of `invoke` — confirm before upgrading.
    narrative = llm.predict(prompt).strip()

    return {"summary": summary, "narrative": narrative}



In [None]:
# Summarize the rows labelled "motorway_junction", using the Gemini chat model
# for the narrative.
result = summarize_category_with_llm(df, "motorway_junction", gemini)
print(result)