# Example 1


Agent 1
Desc: Access the data and identify the groups
tool(s):


Upload the data to the Postgres table with this command:

```
# Bulk-load the CSV into public.nodes with psql's client-side \copy.
# Each continuation backslash must be the very last character on its line;
# trailing spaces after it would end the command early.
psql \
  --host localhost \
  --port 5433 \
  --username devuser \
  --dbname devdb \
  --command "\
\\copy public.nodes(id, type, tags, lat, lon) \
  FROM '<full-path-to-csv>' \
  WITH (FORMAT csv, HEADER true)"

```

In [90]:
from dotenv import load_dotenv
load_dotenv()  # read the .env file and inject its entries into os.environ


True

In [None]:
#!/usr/bin/env python3
# classification_agent.py

from dotenv import load_dotenv
import os
import pandas as pd
from sqlalchemy import create_engine

from langchain.tools import Tool
from langchain.chat_models import init_chat_model
from langgraph.prebuilt import create_react_agent

# Pull settings from a local .env file (OPENAI_API_KEY, optional DB_URL override).
load_dotenv()

# Connection string and chat-model identifier; both can be overridden
# through environment variables of the same lookup names.
DB_URL = os.environ.get(
    "DB_URL",
    "postgresql+psycopg2://devuser:devpassword@localhost:5433/devdb",
)
MODEL_STR = os.environ.get("OPENAI_MODEL", "openai:gpt-4o")


# Module-level list that detect_classification_columns() overwrites with its result.
classification_columns: list[str] = []

# 4. Detection logic: read table, return column names

def detect_classification_columns(table_name: str) -> list[str]:
    """
    Find the columns of `table_name` that can serve as classification inputs.

    The whole table is read into a DataFrame. A column is kept unless its
    lower-cased name is 'id' or ends with '_id', or its dtype is numeric.
    The resulting list is stored in the module-level `classification_columns`
    and is also returned.
    """
    global classification_columns
    frame = pd.read_sql_table(table_name, create_engine(DB_URL))

    def _is_candidate(name) -> bool:
        # Identifier-like names are rejected before the dtype is inspected.
        lowered = name.lower()
        if lowered == "id" or lowered.endswith("_id"):
            return False
        return not pd.api.types.is_numeric_dtype(frame[name])

    classification_columns = [name for name in frame.columns if _is_candidate(name)]
    return classification_columns

# 5. Wrap detection as a LangChain Tool so the agent can call it by name.
#    The description is the text handed to the model to explain the tool.
detect_tool = Tool(
    name="detect_classification_columns",
    func=detect_classification_columns,
    description=(
        "Given a Postgres table name, return non-ID, non-numeric columns "
        "for classification and store them in `classification_columns`."
    ),
)

# 6. Initialize LLM for tool-binding (MODEL_STR defaults to "openai:gpt-4o").
tt_model = init_chat_model(MODEL_STR, temperature=0)

# 7. Create the React agent with the detection tool as its only tool.
agent = create_react_agent(
    model=tt_model,
    tools=[detect_tool],
    prompt=(
        "You are an agent that receives a SQL table name, detects which columns "
        "are useful for classification, and stores them in the global variable."
    ),
)



  chat = ChatOpenAI(model=MODEL_STR, temperature=0)


In [94]:
from langchain_google_genai import ChatGoogleGenerativeAI


# NOTE(review): ChatOpenAI is not imported anywhere above this line in the
# visible source (its import appears in a later cell) — confirm cell order.
# `chat` is not referenced by the functions shown in this file; they use `gemini`.
chat = ChatOpenAI(model='gpt-4o', temperature=0)
# Gemini chat model used by the grouping and ID-extraction functions.
gemini = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.0
)


In [5]:
# Table whose columns the agent should inspect.
table_name = "nodes"
print(f"\n▶ Detecting classification columns for table: {table_name}\n")

# The user message is just the table name; the agent is expected to call
# detect_tool with it.
response = agent.invoke({"messages": [{"role": "user", "content": table_name}]})
print("Agent response:\n", response, "\n")

# detect_classification_columns() assigns this global as a side effect of the tool call.
print("Detected classification columns:", classification_columns)
 


▶ Detecting classification columns for table: nodes

Agent response:
 {'messages': [HumanMessage(content='nodes', additional_kwargs={}, response_metadata={}, id='c35281fb-e172-4447-8b15-3d5874c96e14'), AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_CTxypKWkmztGEeX6uDFjgDdG', 'function': {'arguments': '{"__arg1":"nodes"}', 'name': 'detect_classification_columns'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 99, 'total_tokens': 118, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_90122d973c', 'id': 'chatcmpl-BVbZ64Gnz9BfLi7XeDE88Cppzl3KT', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run--b5161436-2c66-45ab-9c22-788c9a68dcb2-0', tool_calls=[{'

In [29]:
# Display the column list stored by the detection tool.
classification_columns

['type', 'tags']

In [97]:
from typing import List
import json

import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

def get_semantic_group_names(
    df: pd.DataFrame,
    columns: List[str]
) -> List[str]:
    """
    Ask the LLM for the distinct semantic group names covering `df[columns]`.

    All rows of the selected columns are serialized to JSON and sent in a
    single prompt. A StructuredOutputParser supplies the format instructions
    and extracts the 'group_names' entry from the reply.

    Args:
      df: records to analyze
      columns: names of the columns to include in the prompt

    Returns:
      The value parsed under 'group_names'. An empty list is returned,
      without calling the LLM, when `df` is empty or `columns` is empty.
    """
    # Nothing to group: skip the LLM call instead of asking it to invent
    # groups for an empty record list.
    if not columns or df.empty:
        return []

    # 1) Serialize just the fields you care about
    records = df[columns].to_dict(orient="records")
    records_json = json.dumps(records, indent=2)

    # 2) Prepare a single-schema parser
    response_schemas = [
        ResponseSchema(
            name="group_names",
            description=(
                "A JSON array of the distinct semantic group names that "
                "cover all input records (no duplicates)."
            )
        )
    ]
    parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = parser.get_format_instructions()

    # 3) Build the prompt with strict format instructions
    prompt = f"""
You are given a JSON array of records, each with only these fields: {columns}

{records_json}

Your tasks:
1. Analyze all the records to find the underlying semantic "groups" they form.
2. Produce only a single JSON object matching this schema:

{format_instructions}

Make sure:
- You list each group name exactly once.
- You do NOT output per-record labels, only the distinct set of group names.
- You include no extra text or commentary.
"""
    # This builds a single pipeline: LLM → parser
    chain = gemini | parser

    output = chain.invoke(prompt)
    # output is now a dict: {"group_names": [...]}
    return output["group_names"]


In [98]:
def get_records(sample_size=1000, table_name="nodes"):
    """
    Return up to `sample_size` rows of `table_name` as a DataFrame.

    Both arguments are interpolated directly into the SQL text, and the
    query has no ORDER BY clause.
    """
    db_engine = create_engine(DB_URL)
    query = "SELECT * FROM {} LIMIT {}".format(table_name, sample_size)
    return pd.read_sql(query, db_engine)

# Pull a sample of rows and ask the LLM for the group names found in the
# classification columns detected earlier.
sample_size = 1000
df = get_records(sample_size=sample_size)
groups = get_semantic_group_names(df, classification_columns)
print(groups)

['motorway_junction', 'crossing_with_traffic_signals', 'speed_camera', 'junction_name', 'PGS_sourced', 'traffic_signals', 'crossing_marked', 'barrier', 'other']


## Different version


In [95]:
from typing import List, Tuple
import json

import pandas as pd
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

def get_id_category_pairs(
    df: pd.DataFrame,
    id_col: str,
    group_cols: List[str],
) -> List[Tuple[str, str]]:
    """
    Returns a list of (id, category) pairs for each record in df,
    where `category` is the semantic group assigned by the LLM.

    Args:
      df: the full DataFrame
      id_col: name of the column holding each record's unique identifier
      group_cols: list of columns to use when grouping/classifying

    Returns:
      One (id, category) tuple of strings per object the LLM returned under
      'categorized_records'. An empty list is returned, without calling the
      LLM, when `df` is empty.

    Raises:
      KeyError: if the parsed reply has no 'categorized_records' entry, or
        one of its objects lacks `id_col` or 'category'.
    """
    # No records to classify: avoid a pointless LLM round-trip.
    if df.empty:
        return []

    # 1) Build records JSON including the ID
    records = df[[id_col] + group_cols].to_dict(orient="records")
    records_json = json.dumps(records, indent=2)

    # 2) Define the response schema: array of {id: ..., category: ...}
    response_schemas = [
        ResponseSchema(
            name="categorized_records",
            description=(
                "A JSON array of objects, each containing the original "
                f"'{id_col}' and a new 'category' string assigned by the model."
            )
        )
    ]
    parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = parser.get_format_instructions()

    # 3) Build the prompt with strict instructions.
    #    The example's code fence is closed explicitly so the final
    #    instruction line is not read as part of the JSON sample.
    prompt = f"""
You are given a JSON array of records, each with:
  • \"{id_col}\": unique identifier  
  • additional fields: {group_cols}

Here are the records:
{records_json}

Your task:
1. Analyze each record and assign it a concise semantic \"category\".
2. Return **only** a single JSON object matching this schema:

{format_instructions}

Example of required output format:
```json
{{
  "categorized_records": [
    {{ "{id_col}": "123", "category": "Speed Camera" }},
    {{ "{id_col}": "124", "category": "Toll Booth" }},
    …
  ]
}}
```
No explanations, no extra fields—only the JSON object above.
"""
    # 4) Call the LLM → parser pipeline
    chain = gemini | parser
    output = chain.invoke(prompt)
    recs = output["categorized_records"]

    # 5) Convert the parsed objects into the documented (id, category) tuples
    #    so callers can unpack each item directly.
    return [(str(rec[id_col]), str(rec["category"])) for rec in recs]

In [101]:
# pairs = get_id_category_pairs(df, id_col="id", group_cols=classification_columns)
# for record_id, category in pairs:
#     print(f"{record_id} → {category}")

In [112]:
from typing import List, Dict
import json

from langchain_core.output_parsers import JsonOutputParser
import pandas as pd


def extract_ids_by_category(
    df: pd.DataFrame,
    id_col: str,
    columns: List[str],
    categories: List[str]
) -> Dict[str, List[str]]:
    """
    Partition the IDs of `df` by category using one LLM call per category.

    For each category in `categories` (in order), the records not yet
    assigned are sent to the LLM, which is asked for the IDs belonging to
    that category. Matched rows are removed before the next category is
    processed, and whatever remains at the end is added under 'unknown'.

    Args:
      df: records to partition
      id_col: name of the column holding each record's identifier
      columns: additional columns shown to the LLM for each record
      categories: category names to extract, in priority order

    Returns:
      Dict mapping each category, plus 'unknown', to a list of ID strings.
      IDs in the LLM reply that are not among the remaining records are
      discarded. A repeated category name is processed only once.
    """
    # Empty input needs no LLM at all; every bucket is empty.
    if df.empty:
        empty_result: Dict[str, List[str]] = {cat: [] for cat in categories}
        empty_result["unknown"] = []
        return empty_result

    pipeline = gemini | JsonOutputParser()
    df_remaining = df.copy()

    results: Dict[str, List[str]] = {}
    for cat in categories:
        # A repeated category would otherwise overwrite its earlier matches.
        if cat in results:
            continue
        if df_remaining.empty:
            results[cat] = []
            continue

        mini = df_remaining[[id_col] + columns].to_dict(orient="records")
        mini_json = json.dumps(mini, indent=2)

        prompt = f"""
You have a JSON array of records, each with fields: "{id_col}" and {columns}:

{mini_json}

Extract **only** the `{id_col}` values of records that belong to the category "{cat}".
**Return only** a raw JSON array of strings, e.g. ["123", "456"].
If none match, return [].
"""
        raw_ids = pipeline.invoke(prompt)
        # Anything other than a JSON array is treated as "no matches"; the
        # rows stay available for later categories or end up in 'unknown'.
        if not isinstance(raw_ids, list):
            raw_ids = []

        remaining_ids = df_remaining[id_col].astype(str)
        known_ids = set(remaining_ids)
        # String-ify first, then dedupe (so 123 and "123" collapse), and keep
        # only IDs that really exist among the remaining records.
        matched_ids = [
            candidate
            for candidate in dict.fromkeys(str(item) for item in raw_ids)
            if candidate in known_ids
        ]
        results[cat] = matched_ids

        # drop matched rows
        if matched_ids:
            df_remaining = df_remaining[~remaining_ids.isin(matched_ids)]

    # Any remaining IDs go into 'unknown'; extend rather than overwrite in
    # case 'unknown' was itself one of the requested categories.
    leftover_ids = df_remaining[id_col].astype(str).tolist()
    results.setdefault("unknown", []).extend(leftover_ids)

    return results


In [113]:
# Assign every sampled record ID to one of the detected groups (or 'unknown')
# and print each group's IDs as a JSON array.
extracted = extract_ids_by_category(df, 'id', classification_columns, groups)
for cat, recs in extracted.items():
    print(f"=== {cat} ({len(recs)} records) ===")
    print(json.dumps(recs, indent=2))

=== motorway_junction (11 records) ===
[
  "636413",
  "636419",
  "21280953",
  "21281161",
  "25924004",
  "25924349",
  "25924420",
  "25924523",
  "25924658",
  "25924707",
  "25947647"
]
=== crossing_with_traffic_signals (15 records) ===
[
  "636356",
  "18277791",
  "25913046",
  "25914706",
  "25915124",
  "25915127",
  "25921640",
  "25944157",
  "25944158",
  "25944159",
  "25944160",
  "25944267",
  "25944268",
  "25944272",
  "25944275"
]
=== speed_camera (2 records) ===
[
  "636417",
  "25945960"
]
=== junction_name (6 records) ===
[
  "636378",
  "636380",
  "25946325",
  "25946326",
  "25946334",
  "25946340"
]
=== PGS_sourced (165 records) ===
[
  "22419560",
  "22419563",
  "22419567",
  "22419570",
  "22419573",
  "22419576",
  "22419579",
  "22419582",
  "22419586",
  "22419589",
  "22419593",
  "22419596",
  "22419599",
  "22419603",
  "22419606",
  "22419610",
  "22419725",
  "22419731",
  "22419740",
  "22419745",
  "22419747",
  "22419752",
  "22419764",
  "224197

In [114]:
# Sanity check: total number of IDs across all buckets, including 'unknown'.
amount = 0

for cat, recs in extracted.items():
    amount += len(recs)

print("Amount processed", amount)

Amount processed 1000


In [118]:
from typing import List
import os
import pandas as pd
from sqlalchemy import create_engine, text, bindparam
from dotenv import load_dotenv

def extract_records_by_category(
    table_name: str,
    id_col: str,
    ids: List[str],
    columns: List[str]
) -> pd.DataFrame:
    """
    Load the rows of `table_name` whose `id_col` value is one of `ids`.

    The ID list is passed through an expanding bind parameter on the IN
    clause; `table_name`, `id_col` and `columns` are interpolated into the
    SQL text. With no `ids`, an empty DataFrame carrying `columns` is
    returned and the database is not contacted. Falsy `columns` selects `*`.
    """
    if not ids:
        return pd.DataFrame(columns=columns or [])

    projection = "*" if not columns else ", ".join(columns)
    db_engine = create_engine(DB_URL)

    # The expanding parameter is rendered as one placeholder per list element.
    id_param = bindparam("id_list", expanding=True)
    query = text(f"""
        SELECT {projection}
        FROM {table_name}
        WHERE {id_col} IN :id_list
    """).bindparams(id_param)

    # The connection is closed on exit from the with-block.
    with db_engine.connect() as connection:
        return pd.read_sql(query, connection, params={"id_list": ids})


In [120]:

cat = "motorway_junction"
# Fetch the full rows for the IDs assigned to `cat`, using the
# extract_records_by_category() function defined earlier in this file.
# A category missing from `extracted` yields an empty frame.
df_speed = extract_records_by_category(
    table_name="nodes",
    id_col="id",
    ids=extracted.get(cat, []),
    columns=["id", "type", "tags", "lat", "lon"],
)
print(f"Records for {cat}:")
print(df_speed.to_string(index=False))


Records for motorway_junction:
      id type                                                                                                    tags       lat      lon
  636413 node                                              {'name': 'Antwerpen-Oost', 'highway': 'motorway_junction'} 51.216691 4.453695
  636419 node                           {'name': 'Antwerpen-Oost', 'highway': 'motorway_junction', 'toll:hgv': 'yes'} 51.213587 4.448103
21280953 node                      {'ref': '6', 'name': 'Wilrijk', 'highway': 'motorway_junction', 'toll:hgv': 'yes'} 51.167431 4.413595
21281161 node                                              {'name': 'Antwerpen-Zuid', 'highway': 'motorway_junction'} 51.188260 4.415757
25924004 node                                                                                                      {} 51.202791 4.374246
25924349 node           {'ref': '5a', 'name': 'Antwerpen-Centrum', 'highway': 'motorway_junction', 'toll:hgv': 'yes'} 51.197136 4.389047
25924420 n