# Mini RAG

A small retrieval-augmented agent built with LangChain, OpenAI, and Nominatim geocoding.

## Set Up

In [None]:
!pip -q install "langchain>=0.2.10" "langchain-openai>=0.2.2" "langchain-community>=0.2.10" faiss-cpu tiktoken geopy pyproj requests pydantic==2.*

import os, json, requests
from typing import Optional, Dict, Any

# LangChain core
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.runnables import Runnable
from langchain_core.prompts import ChatPromptTemplate
from langchain.tools import tool
from langchain.agents import create_openai_tools_agent, AgentExecutor
# Import retriever tool across LangChain versions
try:
    from langchain.tools.retriever import create_retriever_tool
except Exception:
    try:
        from langchain_community.tools.retriever import create_retriever_tool
    except Exception:
        from langchain_community.tools import create_retriever_tool
# Geospatial deps (geocoding and coordinate projection for the zoning API)
from geopy.geocoders import Nominatim
from pyproj import Transformer


In [None]:
# Copy the OpenAI key from Colab's `userdata` store into the process
# environment. Presumably the OpenAI clients created in later cells read
# OPENAI_API_KEY from there -- this cell only runs inside Google Colab.
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

## Zoning API


In [None]:
ARCGIS_URL = "https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/Zoning_BaseDistricts/FeatureServer/0/query"


def _query_zoning(projected_x: float, projected_y: float) -> Dict[str, Any]:
    """Query the zoning feature layer for features intersecting one point.

    Args:
        projected_x: X coordinate of the point, sent with inSR=3857.
        projected_y: Y coordinate of the point, sent with inSR=3857.

    Returns:
        The decoded JSON body of the ArcGIS query response.

    Raises:
        requests.HTTPError: if the service answers with an HTTP error status.
    """
    point = f"{projected_x},{projected_y}"
    query = dict(
        where="1=1",  # no attribute filter; the spatial filter does the work
        geometry=point,
        geometryType="esriGeometryPoint",
        inSR="3857",
        spatialRel="esriSpatialRelIntersects",
        outFields="*",
        returnGeometry="true",
        f="json",
    )
    response = requests.get(ARCGIS_URL, params=query, timeout=30)
    response.raise_for_status()
    return response.json()


def _geocode_to_web_mercator(address: str) -> Optional[Dict[str, float]]:
    """Geocode an address with Nominatim and project it to EPSG:3857.

    Args:
        address: Free-form address string handed to the geocoder.

    Returns:
        ``{"x": ..., "y": ...}`` in EPSG:3857, or ``None`` when the geocoder
        returns no match.
    """
    match = Nominatim(user_agent="philly-zoning-rag").geocode(address)
    if not match:
        return None

    # always_xy=True fixes the axis order to (x, y), i.e. (lon, lat) on input.
    to_mercator = Transformer.from_crs("EPSG:4326", "EPSG:3857", always_xy=True)
    projected = to_mercator.transform(match.longitude, match.latitude)
    return dict(zip(("x", "y"), projected))


## Zoning as a LangChain Tool

In [None]:
@tool("get_zoning_for_address", return_direct=False)
def get_zoning_for_address(address: str) -> str:
    """
    Look up the zoning district for a Philadelphia street address.
    Input: a free-form address string (e.g., "1234 Market St, Philadelphia, PA").
    Output: JSON with fields like zoninggroup, zoning, objectid, and the raw ArcGIS response.
    """
    # NOTE: the docstring above is intentionally left verbatim -- the @tool
    # decorator presumably exposes it to the LLM as the tool description, so
    # editing it would change agent behavior.
    #
    # Flow: geocode -> project to EPSG:3857 -> point-intersect query against
    # the zoning layer -> summarize the first matching feature as JSON.
    # Failures are returned as {"ok": false, "error": ...} rather than raised,
    # except for exceptions coming out of the two helpers themselves.
    coords = _geocode_to_web_mercator(address)
    if not coords:
        return json.dumps({"ok": False, "error": "Address not found or outside Philadelphia."})

    data = _query_zoning(coords["x"], coords["y"])

    # ArcGIS REST services commonly report failures as an HTTP 200 response
    # whose body carries an "error" object; surface that instead of
    # misreporting it as "no zoning data".
    if data.get("error"):
        return json.dumps({
            "ok": False,
            "error": "Zoning service returned an error.",
            "raw": data
        })

    features = data.get("features")
    if not features:
        return json.dumps({"ok": False, "error": "No zoning data found for this point."})

    # Only the first intersecting feature is reported.
    attrs = features[0].get("attributes") or {}

    # The layer's field list exposes the district code as "code"; there is no
    # "zoning" attribute, which is why the recorded runs show "zoning": null.
    # "zoning" is still tried first for backward compatibility; "long_code" is
    # presumably the dashed form of the code -- TODO confirm against the layer
    # schema.
    zoning_code = next(
        (attrs[key] for key in ("zoning", "long_code", "code") if attrs.get(key)),
        None,
    )

    # NOTE(review): "raw" includes geometry and field metadata, so the tool
    # output is large; kept because the documented output promises it.
    return json.dumps({
        "ok": True,
        "address": address,
        "projected_xy": coords,
        "zoninggroup": attrs.get("zoninggroup"),
        "zoning": zoning_code,
        "objectid": attrs.get("objectid"),
        "raw": data
    })


## Tiny knowledge base (RAG)

In [None]:
# Placeholder cheat-sheet text. Swap in official Philadelphia zoning docs for accuracy.
docs_texts = [
    "CMX-1: Neighborhood Commercial Mixed-Use, small-scale retail at ground floor with residential above.",
    "CMX-2/2.5: Mixed-use districts allowing neighborhood-serving commercial and residential uses.",
    "RM-1: Residential Multi-Family, attached and semi-detached dwellings; height and density limits apply.",
    "RSA-5: Residential Single-Family Attached; single-family rowhouses common in Philly.",
    "Parking minimums/maximums vary by district and overlays; always check the code section for use and dimensional standards.",
    "Overlays can add or modify standards in particular neighborhoods (e.g., design standards, use restrictions).",
]

# One Document per note, each tagged with the same "cheatsheet" source.
docs = [
    Document(page_content=note, metadata={"source": "cheatsheet"})
    for note in docs_texts
]

# Embed the notes with default OpenAI embedding settings and index them in FAISS.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)

# The retriever is configured with k=3 results per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

# Wrap the retriever as a tool so the agent can call it by name.
retriever_tool = create_retriever_tool(
    retriever=retriever,
    name="search_zoning_knowledge",
    description="Search local notes about Philadelphia zoning use categories and quick explanations.",
)


## LLM + tools + RAG

In [None]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import create_openai_tools_agent, AgentExecutor

system_prompt = """You are a helpful planning assistant for Philadelphia lots.
You can (1) retrieve background zoning info from a small knowledge base, and
(2) call a zoning API tool to look up the zoning district for a specific address.

Guidelines:
- If the user mentions an address, call get_zoning_for_address to fetch the zoning.
- Use search_zoning_knowledge to define or summarize what a zoning code permits.
- When both apply, do BOTH: call the API, then provide a concise explanation retrieved from the KB.
- Be clear about non-authoritative notes and suggest checking the official Philadelphia Code.
- Prefer structured JSON when the user asks for it.
"""

# Chat model with temperature 0; no model name is given, so the library
# default is used.
llm = ChatOpenAI(temperature=0)

# Tools the agent may call: KB search first, then the address lookup.
tools = [retriever_tool, get_zoning_for_address]

# Message layout: system instructions, prior turns, the new user input, and
# finally the scratchpad slot the tools agent requires.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        # The demo queries pass [] here when there is no prior conversation.
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

agent = create_openai_tools_agent(llm=llm, tools=tools, prompt=prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)


## Query


In [None]:
# Demo 1: address lookup plus a one-sentence explanation of the district.
user_q = """What's the zoning for "4042 Chestnut Street, Philadelphia, PA 19104",
and what kinds of uses are generally allowed in that district? 1 sentence."""

# An empty chat_history fills the prompt's history slot for a fresh conversation.
response = agent_executor.invoke(
    {
        "input": user_q,
        "chat_history": [],
    }
)
print(response["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `get_zoning_for_address` with `{'address': '4042 Chestnut Street, Philadelphia, PA 19104'}`


[0m[33;1m[1;3m{"ok": true, "address": "4042 Chestnut Street, Philadelphia, PA 19104", "projected_xy": {"x": -8371664.172864509, "y": 4859486.290951133}, "zoninggroup": "Commercial/Commercial Mixed-Use", "zoning": null, "objectid": 11541, "raw": {"objectIdFieldName": "objectid", "uniqueIdField": {"name": "objectid", "isSystemMaintained": true}, "globalIdFieldName": "", "geometryProperties": {"shapeAreaFieldName": "Shape__Area", "shapeLengthFieldName": "Shape__Length", "units": "esriMeters"}, "geometryType": "esriGeometryPolygon", "spatialReference": {"wkid": 102100, "latestWkid": 3857}, "fields": [{"name": "code", "type": "esriFieldTypeString", "alias": "Code No Dash", "sqlType": "sqlTypeOther", "length": 8, "domain": null, "defaultValue": null}, {"name": "citycor", "type": "esriFieldTypeSmallInteger", "alias": "CITYCOR"

In [None]:
# Demo 2: ask for a structured JSON answer combining the tool result and a KB summary.
user_q_json = """Return JSON with keys {"address","zoning_lookup","kb_summary"} for
"1234 Market St, Philadelphia, PA". "zoning_lookup" should be whatever the tool returns,
and "kb_summary" should be a short 2–3 sentence explainer pulled from retrieval."""

# An empty chat_history fills the prompt's history slot for a fresh conversation.
response = agent_executor.invoke(
    {
        "input": user_q_json,
        "chat_history": [],
    }
)
print(response["output"])




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `get_zoning_for_address` with `{'address': '1234 Market St, Philadelphia, PA'}`


[0m[33;1m[1;3m{"ok": true, "address": "1234 Market St, Philadelphia, PA", "projected_xy": {"x": -8366879.2604000475, "y": 4858928.491314433}, "zoninggroup": "Commercial/Commercial Mixed-Use", "zoning": null, "objectid": 27186, "raw": {"objectIdFieldName": "objectid", "uniqueIdField": {"name": "objectid", "isSystemMaintained": true}, "globalIdFieldName": "", "geometryProperties": {"shapeAreaFieldName": "Shape__Area", "shapeLengthFieldName": "Shape__Length", "units": "esriMeters"}, "geometryType": "esriGeometryPolygon", "spatialReference": {"wkid": 102100, "latestWkid": 3857}, "fields": [{"name": "code", "type": "esriFieldTypeString", "alias": "Code No Dash", "sqlType": "sqlTypeOther", "length": 8, "domain": null, "defaultValue": null}, {"name": "citycor", "type": "esriFieldTypeSmallInteger", "alias": "CITYCOR", "sqlType": "sqlTypeOt