### 1. Imports

In [1]:
import sys, os
sys.path.append(os.path.abspath(".."))

In [3]:
from google import genai
from autoddg import AutoDDG
import pandas as pd
from pathlib import Path
from geo_profiler.geo_profiler import GeoProfiler
from geo_profiler.models import GeoProfile
from autoddg_geo import AutoDDGGeo
from openai import OpenAI
import time
from rank_bm25 import BM25Okapi
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### 2. Setting up the OpenAI Client, AutoDDG and AutoDDG-Geo

In [4]:
PROJECT_ROOT = Path("..").resolve()
# Make the repository root importable. The membership check keeps sys.path from
# accumulating duplicate entries when this cell is re-run.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Setup OpenAI client (pointed at the Gemini OpenAI-compatible endpoint).
# The key is read from the GEMINI_API_KEY environment variable so that it never
# has to be written into (and committed with) the notebook. The empty-string
# fallback preserves the previous placeholder behaviour when the variable is unset.
client = OpenAI(
    api_key=os.environ.get("GEMINI_API_KEY", ""),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/")

# Initialize AutoDDG and AutoDDG Geo; both share the same client and model.
autoddg = AutoDDG(client=client, model_name="gemini-2.5-flash-lite")
autoddg_geo = AutoDDGGeo(client=client, model_name="gemini-2.5-flash-lite")

### 3. Reading dataset

In [5]:
def read_dataset(path: str):
    """Load a CSV file into a DataFrame and echo its shape.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``; its ``(rows, columns)``
        shape is printed as a side effect.
    """
    frame = pd.read_csv(path)
    # Echo the dimensions so the notebook output documents the dataset size.
    print(frame.shape)
    return frame

### 4. Retrieving data profile

In [6]:
def infer_data_profile(dataframe: pd.DataFrame) -> str:
    """Profile ``dataframe`` with the notebook-level ``autoddg`` instance.

    Args:
        dataframe: The dataset to profile.

    Returns:
        Whatever ``autoddg.profile_dataframe`` returns, passed through unchanged.
        NOTE(review): the annotation says ``str``, yet the notebook indexes the
        result with ``[0]`` to obtain the profile text -- confirm the real type.
    """
    return autoddg.profile_dataframe(dataframe=dataframe)

### 5. Deriving Semantic Profile

In [7]:
def infer_semantic_profile(dataframe: pd.DataFrame) -> str:
    """Derive the semantic profile of ``dataframe`` via the shared ``autoddg`` instance.

    Args:
        dataframe: The dataset to analyse.

    Returns:
        The result of ``autoddg.analyze_semantics``, passed through unchanged.
    """
    return autoddg.analyze_semantics(dataframe=dataframe)

### 6. Deriving Geo Spatial Profile

In [8]:
def infer_geo_profile(semantic_profile: str, df: pd.DataFrame) -> GeoProfile:
    """Derive the geospatial profile of a dataset via the shared ``autoddg_geo`` instance.

    Args:
        semantic_profile: Semantic profile previously inferred for the dataset.
        df: The rows to analyse.

    Returns:
        The result of ``autoddg_geo.analyze_geo``, passed through unchanged.
    """
    return autoddg_geo.analyze_geo(semantic_profile, df)

### 7. Deriving AutoDDG Description

In [9]:
def generate_dataset_description(sample_csv, dataprofile, use_data_profile, semantic_profile, use_semantic_profile, topic, use_topic):
    """Generate the baseline AutoDDG description for a dataset sample.

    Args:
        sample_csv: Sample of the dataset handed to ``autoddg.describe_dataset``.
        dataprofile: Either the data-profile text itself, or an indexable
            result whose element 0 is that text.
        use_data_profile: Forwarded flag controlling use of the data profile.
        semantic_profile: Semantic profile forwarded to the describer.
        use_semantic_profile: Forwarded flag controlling use of the semantic profile.
        topic: Accepted but not used by this function.
        use_topic: Accepted but not used by this function.

    Returns:
        The second element of the pair returned by ``autoddg.describe_dataset``.
    """
    # A plain string must be used whole: indexing it with [0] would forward only
    # its first character as the "profile".
    profile_text = dataprofile if isinstance(dataprofile, str) else dataprofile[0]
    _prompt, description = autoddg.describe_dataset(
        sample_csv, profile_text, use_data_profile, semantic_profile, use_semantic_profile
    )
    return description

### 8. Deriving Geo Aware Description

In [10]:
def generate_geoaware_description(sample_csv, dataprofile, use_data_profile, semantic_profile, use_semantic_profile, topic, use_topic, geo_profile, use_geo_profile):
    """Generate the geo-aware description for a dataset sample.

    Args:
        sample_csv: Sample of the dataset handed to the describer.
        dataprofile: Either the data-profile text itself, or an indexable
            result whose element 0 is that text.
        use_data_profile: Forwarded flag controlling use of the data profile.
        semantic_profile: Semantic profile forwarded to the describer.
        use_semantic_profile: Forwarded flag controlling use of the semantic profile.
        topic: Dataset topic forwarded to the describer.
        use_topic: Forwarded flag controlling use of the topic.
        geo_profile: Geospatial profile forwarded to the describer.
        use_geo_profile: Forwarded flag controlling use of the geo profile.

    Returns:
        Element 0 of the result of ``autoddg_geo.generate_geoaware_description``.
    """
    # A plain string must be used whole: indexing it with [0] would forward only
    # its first character as the "profile".
    profile_text = dataprofile if isinstance(dataprofile, str) else dataprofile[0]
    geo_result = autoddg_geo.generate_geoaware_description(
        sample_csv,
        profile_text,
        use_data_profile,
        semantic_profile,
        use_semantic_profile,
        topic,
        use_topic,
        geo_profile,
        use_geo_profile,
    )
    return geo_result[0]

### 9. Expanding Geo Aware Description for Search

In [11]:
def generate_description_for_search(desc, topic):
    """Expand a description into its search-oriented form.

    Args:
        desc: The description to expand.
        topic: Dataset topic forwarded to the expander.

    Returns:
        Element 1 of the result of ``autoddg.expand_description_for_search``.
    """
    expansion = autoddg.expand_description_for_search(desc, topic)
    return expansion[1]

### 10. Generating Descriptions for Various Datasets:

In [12]:
# Accumulates one result dict per processed dataset; generating_descriptions
# appends to it. Re-running a dataset cell appends another record, so re-run
# this cell first to start from an empty list.
evaluation_records = []

In [13]:
def generating_descriptions(topic, dataset_path):
    """Run the full description pipeline for one dataset and record the results.

    Reads the CSV, infers its data, semantic and geo profiles, generates the
    baseline, geo-aware and search-focused descriptions, prints every
    intermediate result, and appends a summary dict to the module-level
    ``evaluation_records`` list.

    Args:
        topic: Human-readable dataset topic; used in the prompts and stored in the record.
        dataset_path: Path of the CSV file to process.

    Returns:
        None. The outcome is the record appended to ``evaluation_records``.
    """
    print(f"Read Dataset {topic}:")
    df = read_dataset(dataset_path)
    # Only the first five rows are serialised as the sample shown to the model.
    df_sample = df.head(5)
    sample_csv = df_sample.to_csv(index=False)
    print("==========================================================")
    print("Infer data profile:")
    dataprofile = infer_data_profile(df)
    # Element 0 of the profiler result is the profile text.
    profile_text = dataprofile[0]
    print(profile_text)
    print("==========================================================")
    print("Infer semantic profile:")
    semantic_profile = infer_semantic_profile(df)
    print(semantic_profile)
    print("==========================================================")
    print("Infer geo profile:")
    # The geo profile is inferred from the five-row sample, not the full frame.
    geo_profile = infer_geo_profile(semantic_profile, df_sample)
    print(geo_profile)
    print("==========================================================")
    print("Generated user focused AutoDDG description:")
    # The description helpers receive the raw profiler result and extract the
    # text element from it themselves.
    autoddg_desc = generate_dataset_description(sample_csv, dataprofile, True, semantic_profile, True, topic, True)
    print(autoddg_desc)
    print("==========================================================")
    print("Generated user focused description with Geo profile:")
    autoddg_geo_desc = generate_geoaware_description(sample_csv, dataprofile, True, semantic_profile, True, topic, True, geo_profile, True)
    print(autoddg_geo_desc)
    print("==========================================================")
    print("Generated search focused description with Geo description:")
    search_desc = generate_description_for_search(autoddg_geo_desc, topic)
    print(search_desc)
    print("==========================================================")
    evaluation_records.append({
        "topic": topic,
        # Store the file name, not the DataFrame: the key denotes a name, and
        # keeping every full frame alive in the records wastes memory.
        "dataset_name": Path(dataset_path).name,
        "data_profile": profile_text,
        "semantic_profile": semantic_profile,
        "geo_profile": geo_profile,
        "baseline_description": autoddg_desc,
        "geo_description": autoddg_geo_desc,
        "search_description": search_desc
    })

### Dataset 1: Parking Meters Locations and Status
##### https://data.cityofnewyork.us/Transportation/Parking-Meters-Locations-and-Status/693u-uax6/about_data

In [14]:
# Run the description pipeline on the parking-meters CSV; the resulting record
# is appended to evaluation_records by generating_descriptions.
topic = "Parking Meters Locations and Status"
csv_path = "../examples/Parking_Meters_Locations_and_Status.csv"
print("Generating descriptions...")
generating_descriptions(topic, csv_path)
print("Generated descriptions stored for evaluation...")

Generating descriptions...
Read Dataset Parking Meters Locations and Status:
(15582, 17)
Infer data profile:
The key data profile information for this dataset includes:
**ObjectID**: Data is of type integer. There are 15582 unique values. Coverage spans from 0 to 15326.0. 
**Meter Number**: Data is of type integer. There are 15576 unique values. Coverage spans from 0 to 5043005.0. 
**Status**: Data is of type text. There are 3 unique values. 
**Pay By Cell Number**: Data is of type integer. There are 12092 unique values. Coverage spans from 0 to 509514.0. 
**Meter_Hours**: Data is of type text. 
**Parking_Facility_Name**: Data is of type text. There are 35 unique values. 
**Facility**: Data is of type text. There are 3 unique values. 
**Borough**: Data is of type text. There are 5 unique values. 
**On_Street**: Data is of type text. There are 1446 unique values. 
**Side_of_Street**: Data is of type text. There are 13 unique values. 
**From_Street**: Data is of type text. There are 2341

### Dataset 2: Community Parks Initiative Zone Boundaries
##### https://data.cityofnewyork.us/City-Government/Community-Parks-Initiative-Zone-Boundaries/vvdx-b56i

In [15]:
# Run the description pipeline on the CPI zones CSV; the resulting record
# is appended to evaluation_records by generating_descriptions.
topic = "Community Parks Initiative Zone Boundaries"
csv_path = "../examples/CPI_Zones.csv"
print("Generating descriptions...")
generating_descriptions(topic, csv_path)
print("Generated descriptions stored for evaluation...")

Generating descriptions...
Read Dataset Community Parks Initiative Zone Boundaries:
(55, 8)
Infer data profile:
The key data profile information for this dataset includes:
**the_geom**: Data is of type text. 
**BoroCode**: Data is of type integer. There are 5 unique values. Coverage spans from 0 to 5.0. 
**BoroName**: Data is of type text. There are 5 unique values. 
**CountyFIPS**: Data is of type integer. There are 5 unique values. Coverage spans from 0 to 85.0. 
**NTACode**: Data is of type text. There are 55 unique values. 
**NTAName**: Data is of type text. There are 55 unique values. 
**Shape_Leng**: Data is of type text. There are 55 unique values. 
**Shape_Area**: Data is of type text. There are 55 unique values. 
Infer semantic profile:
The key semantic information for this dataset includes:
**the_geom**: Represents geographic feature. Contains spatial data (resolution: Coordinates). Domain-specific type: gis/mapping. Function/Usage context: spatial data representation. 
**Bor

### Dataset 3: Landmarks Complaints
##### https://data.cityofnewyork.us/Housing-Development/Landmarks-Complaints/ck4n-5h6x/about_data

In [16]:
# Run the description pipeline on the landmarks-complaints CSV; the resulting
# record is appended to evaluation_records by generating_descriptions.
topic = "Landmark Complaints"
csv_path = "../examples/Landmarks_Complaints.csv"
print("Generating descriptions...")
generating_descriptions(topic, csv_path)
print("Generated descriptions stored for evaluation...")

Generating descriptions...
Read Dataset Landmark Complaints:
(6689, 18)
Infer data profile:
The key data profile information for this dataset includes:
**Complaint #**: Data is of type text. There are 6686 unique values. 
**Date**: Data is of type text. There are 2727 unique values. 
**Address #**: Data is of type text. There are 1646 unique values. 
**Street Name**: Data is of type text. There are 1382 unique values. 
**Borough**: Data is of type text. There are 11 unique values. 
**Block**: Data is of type integer. There are 1338 unique values. Coverage spans from 0 to 2476.0. 
**Lot**: Data is of type integer. There are 243 unique values. Coverage spans from 0 to 93.0. 
**BIN**: Data is of type integer. There are 4790 unique values. Coverage spans from 0 to 5015074.0. 
**Postcode**: Data is of type integer. There are 149 unique values. Coverage spans from 0 to 11377.0. 
**Landmark Name**: Data is of type text. 
**Work Reported**: Data is of type text. 
**Action Taken**: Data is of t

### Dataset 4: Centerline
##### https://data.cityofnewyork.us/City-Government/Centerline/3mf9-qshr

In [17]:
# Run the description pipeline on the centerline CSV; the resulting record
# is appended to evaluation_records by generating_descriptions.
topic = "Centerline"
csv_path = "../examples/Centerline.csv"
print("Generating descriptions...")
generating_descriptions(topic, csv_path)
print("Generated descriptions stored for evaluation...")

Generating descriptions...
Read Dataset Centerline:
(122050, 64)
Infer data profile:
The key data profile information for this dataset includes:
**the_geom**: Data is of type text. 
**PHYSICALID**: Data is of type text. There are 122008 unique values. 
**L_LOW_HN**: Data is of type text. There are 7569 unique values. 
**L_HIGH_HN**: Data is of type text. There are 7958 unique values. 
**R_LOW_HN**: Data is of type text. There are 7676 unique values. 
**R_HIGH_HN**: Data is of type text. There are 8004 unique values. 
**L_ZIP**: Data is of type integer. There are 215 unique values. Coverage spans from 0 to 11436.0. 
**R_ZIP**: Data is of type integer. There are 223 unique values. Coverage spans from 0 to 11436.0. 
**STATUS**: Data is of type integer. There are 3 unique values. Coverage spans from 0 to 2.0. 
**BIKE_LANE**: Data is of type integer. There are 10 unique values. Coverage spans from 0 to 3.0. 
**TRAFDIR**: Data is of type text. There are 4 unique values. 
**RW_TYPE**: Data is

### Dataset 5: Parks Zones
##### https://data.cityofnewyork.us/City-Government/Parks-Zones/4j29-i5ry/about_data

In [18]:
# Run the description pipeline on the parks-zones CSV; the resulting record
# is appended to evaluation_records by generating_descriptions.
topic = "Parks Zones"
csv_path = "../examples/Parks_Zones.csv"
print("Generating descriptions...")
generating_descriptions(topic, csv_path)
print("Generated descriptions stored for evaluation...")

Generating descriptions...
Read Dataset Parks Zones:
(928, 20)
Infer data profile:
The key data profile information for this dataset includes:
**ACRES**: Data is of type float. Coverage spans from 0 to 79.02298176. 
**BOROUGH**: Data is of type text. There are 5 unique values. 
**COMMUNITYBOARD**: Data is of type text. There are 84 unique values. 
**COUNCILDISTRICT**: Data is of type integer. There are 51 unique values. Coverage spans from 0 to 51.0. 
**DEPARTMENT**: Data is of type text. There are 70 unique values. 
**DESCRIPTION**: Data is of type text. 
**GISPROPNUM**: Data is of type text. There are 203 unique values. 
**LOCATION**: Data is of type text. 
**NYS_ASSEMBLY**: Data is of type text. There are 100 unique values. 
**NYS_SENATE**: Data is of type text. There are 47 unique values. 
**OMPPROPID**: Data is of type text. There are 928 unique values. 
**PRECINCT**: Data is of type integer. There are 77 unique values. Coverage spans from 0 to 123.0. 
**PROPNAME**: Data is of typ

In [19]:
# Sanity check: the count should equal the number of dataset cells run since
# evaluation_records was last reset.
print(len(evaluation_records))

5


### 11. Evaluation

In [23]:
# Gold relevance judgements for the retrieval evaluation: each key is a search
# query and each value lists the dataset IDs considered relevant to it. The IDs
# (D1..D5) follow the order in which the datasets were processed above.
gold_data = {

    # D1 — Parking Meters Locations and Status
    "NYC parking meters dataset": ["D1"],
    "parking availability street-level NYC": ["D1"],
    "NYC pay-by-cell parking infrastructure": ["D1"],
    "parking meter geolocation data": ["D1"],
    "NYC curbside parking enforcement planning data": ["D1"],

    # D2 — CPI Zones
    "community parks initiative boundaries NYC": ["D2"],
    "CPI zone polygons New York": ["D2"],
    "NYC park improvement program boundary files": ["D2"],
    "borough-level CPI analysis dataset": ["D2"],

    # D3 — Landmark Complaints
    "NYC landmark preservation complaints": ["D3"],
    "historic district violation reports NYC": ["D3"],
    "NYC construction and renovation complaint data": ["D3"],
    "spatio-temporal landmark complaint incidents": ["D3"],

    # D4 — Centerline
    "NYC street centerline dataset": ["D4"],
    "road geometry and street segment shapefile NYC": ["D4"],
    "transportation network polyline data NYC": ["D4"],
    "NYC street connectivity and width dataset": ["D4"],
    
    # D5 — Parks Zones
    "NYC park zones boundary polygons": ["D5"],
    "parkland administrative zones NYC": ["D5"],
    "parks acreage and boundary dataset NYC": ["D5"],
    "recreational area boundary shapefiles": ["D5"],

    # OVERLAP CASES
    # Purpose: evaluate retrieval ranking & discrimination ability
    
    # D1 + D3 (both point datasets, but different roles)
    "NYC point-based geospatial datasets for public services": ["D1", "D3"],
    "datasets with street-level point locations in NYC": ["D1", "D3"],

    # D2 + D5 (both boundary multipolygon but different themes)
    "NYC park-related boundary datasets": ["D2", "D5"],
    "datasets defining administrative park or community boundaries": ["D2", "D5"],

    # D1 + D4 (both infrastructure, point vs multi-line)
    "NYC street and parking infrastructure datasets": ["D1", "D4"],
    "NYC transportation and curbside infrastructure mapping": ["D1", "D4"],

    # D4 + D5 (street network + park zones)
    "NYC parks and surrounding street networks": ["D4", "D5"],

    # D2 + D4 (CPI zones + centerline)
    "NYC planning datasets with zones and street geometry": ["D2", "D4"],

    # FULL MULTI-DATASET (D1+D2+D3+D4+D5)
    "NYC geospatial datasets across multiple geometry types": ["D1", "D2", "D3", "D4", "D5"],
    "datasets supporting citywide spatial analysis NYC": ["D1", "D2", "D3", "D4", "D5"],

}


In [24]:
# Assign sequential dataset IDs (D1, D2, D3, ...) in the order the records were collected.
dataset_ids = [f"D{position}" for position in range(1, len(evaluation_records) + 1)]

# Pull the two kinds of description out of the evaluation records.
baseline_descriptions = [record["baseline_description"] for record in evaluation_records]
geo_descriptions = [record["geo_description"] for record in evaluation_records]

# Each corpus maps a dataset ID to the description generated for that dataset.
corpus_baseline = dict(zip(dataset_ids, baseline_descriptions))
corpus_geo = dict(zip(dataset_ids, geo_descriptions))

print("Dataset IDs:", dataset_ids)
print("Baseline Corpus Keys:", corpus_baseline.keys())
print("Geo Corpus Keys:", corpus_geo.keys())

Dataset IDs: ['D1', 'D2', 'D3', 'D4', 'D5']
Baseline Corpus Keys: dict_keys(['D1', 'D2', 'D3', 'D4', 'D5'])
Geo Corpus Keys: dict_keys(['D1', 'D2', 'D3', 'D4', 'D5'])


### 12. Retrieval Evaluation
#### Retrieval Metric Definitions

| Metric | Meaning (one line) |
| --- | --- |
| **Precision@K** | Fraction of retrieved items in the top-K that are relevant. |
| **Recall@K** | Fraction of all relevant items that appear in the top-K. |
| **Hit@K** | Indicates whether at least one relevant item appears in the top-K (averaged across queries). |
| **MRR (Mean Reciprocal Rank)** | Measures how early the first relevant item appears in the ranking. |
| **nDCG@K** | Evaluates ranking quality by rewarding relevant items placed higher in the top-K. |
| **MAP (Mean Average Precision)** | Measures overall ranking quality across all relevant items for each query. |


In [25]:
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
import math

def dcg_at_k(r, k):
    """Discounted Cumulative Gain over the first ``k`` relevance scores.

    Args:
        r: Relevance scores in ranked order (best-ranked item first).
        k: Number of leading positions to take into account.

    Returns:
        The sum of each score divided by ``log2(position + 1)`` with 1-based
        positions, or ``0.0`` when there are no scores to consider.
    """
    gains = np.asarray(r, dtype=float)[:k]
    if not gains.size:
        return 0.0
    # Position p (1-based) is discounted by log2(p + 1): log2(2), log2(3), ...
    discounts = np.log2(np.arange(2, gains.size + 2))
    return np.sum(gains / discounts)

def ndcg_at_k(r, k):
    """Normalized DCG: DCG@k of ``r`` divided by the DCG@k of its ideal ordering.

    Args:
        r: Relevance scores in ranked order.
        k: Cut-off rank.

    Returns:
        The normalized score. When the ideal DCG is zero the divisor falls
        back to ``1.0``, so the result is the (zero) DCG itself.
    """
    # The ideal ranking places the highest relevance scores first.
    ideal_dcg = dcg_at_k(sorted(r, reverse=True), k)
    actual_dcg = dcg_at_k(r, k)
    if not ideal_dcg:
        ideal_dcg = 1.0
    return actual_dcg / ideal_dcg

def average_precision(ranked_list, relevant_set):
    """Average precision of a ranking against a set of relevant items.

    Args:
        ranked_list: Retrieved items, best-ranked first.
        relevant_set: Collection of items considered relevant.

    Returns:
        The precision measured at each rank holding a relevant item, summed
        and divided by the number of relevant items; ``0`` when
        ``relevant_set`` is empty.
    """
    if len(relevant_set) == 0:
        return 0

    found = 0
    precision_total = 0
    for rank, item in enumerate(ranked_list, start=1):
        if item not in relevant_set:
            continue
        found += 1
        # Precision at this rank: relevant items seen so far over items seen so far.
        precision_total += found / rank
    return precision_total / len(relevant_set)

def eval_corpus(corpus, gold, k=2):
    """Score BM25 retrieval over ``corpus`` against gold relevance judgements.

    Args:
        corpus: Mapping of dataset ID to its description text.
        gold: Mapping of query string to the list of relevant dataset IDs.
        k: Cut-off rank used for Precision@K, Recall@K, Hit@K and nDCG@K.

    Returns:
        A one-row DataFrame holding the mean over all queries of
        Precision@K, Recall@K, Hit@K, MRR, nDCG@K and MAP.
    """
    doc_ids = list(corpus)
    # Documents and queries are tokenized the same way: lowercase, whitespace split.
    bm25 = BM25Okapi([corpus[doc_id].lower().split() for doc_id in doc_ids])

    metric_names = ("Precision@K", "Recall@K", "Hit@K", "MRR", "nDCG@K", "MAP")
    per_query = {name: [] for name in metric_names}

    for query, relevant_ids in gold.items():
        relevant_ids = set(relevant_ids)

        # Rank every document by descending BM25 score for this query.
        query_scores = bm25.get_scores(query.lower().split())
        ranking = [doc_ids[idx] for idx in np.argsort(query_scores)[::-1]]

        # 1/0 relevance flag for each position of the full ranking.
        is_relevant = [1 if doc_id in relevant_ids else 0 for doc_id in ranking]

        # Number of relevant documents within the top-k positions.
        n_hits = sum(is_relevant[:k])

        # Reciprocal rank of the first relevant document (0 if there is none).
        reciprocal_rank = next(
            (1 / position for position, flag in enumerate(is_relevant, start=1) if flag),
            0,
        )

        per_query["Precision@K"].append(n_hits / k)
        per_query["Recall@K"].append(n_hits / (len(relevant_ids) or 1))
        per_query["Hit@K"].append(1 if n_hits > 0 else 0)
        per_query["MRR"].append(reciprocal_rank)
        per_query["nDCG@K"].append(ndcg_at_k(is_relevant, k))
        per_query["MAP"].append(average_precision(ranking, relevant_ids))

    # Macro-average: every query contributes equally to each metric.
    return pd.DataFrame({name: [np.mean(values)] for name, values in per_query.items()})


In [44]:
# Evaluate both corpora against the same gold judgements.
baseline_scores = eval_corpus(corpus_baseline, gold_data)
geo_scores = eval_corpus(corpus_geo, gold_data)

# One row per model, one column per metric (column order follows baseline_scores).
metric_columns = {
    col: [baseline_scores[col].iloc[0], geo_scores[col].iloc[0]]
    for col in baseline_scores.columns
}
comparison_df = pd.DataFrame({"Model": ["Baseline", "AutoDDG-Geo"], **metric_columns})

comparison_df

Unnamed: 0,Model,Precision@K,Recall@K,Hit@K,MRR,nDCG@K,MAP
0,Baseline,0.564516,0.848387,0.935484,0.889785,0.85503,0.877419
1,AutoDDG-Geo,0.580645,0.880645,0.967742,0.91129,0.882683,0.89328


### 13. Geospatial Faithfulness Score

In [41]:
# Vocabularies used to detect geometry types in a description. Terms are
# matched as lowercase substrings by extract_geo_features, so a short term such
# as "line" or "multi" also matches inside longer words.
POINT_TERMS = ["point", "point-based", "latitude", "longitude"]
POLYGON_TERMS = ["polygon", "polygon-based", "boundary"]
POLYLINE_TERMS = ["polyline", "line", "centerline"]
MULTI_TERMS = ["multi", "multi polygon", "multipolygon"]

# Vocabularies used to detect the spatial role of a dataset. Repeated entries
# were removed; a duplicate never changes an any-term-matches check.
EVENT_TERMS = ["event", "incident"]
INFRA_TERMS = ["infrastructure", "fixed infrastructure"]
BOUNDARY_TERMS = ["boundary", "administrative boundary"]
OBSERVATION_TERMS = ["observation", "observational data", "sensor-based observation"]

In [42]:
def extract_geo_features(text):
    """Detect which geometry and spatial-role labels a description mentions.

    Args:
        text: The description to scan; it is lowercased before matching.

    Returns:
        A set of labels drawn from ``point``, ``polygon``, ``polyline``,
        ``multi``, ``event``, ``infrastructure``, ``boundary`` and
        ``observation``. A label is included when any term of its vocabulary
        occurs in the text; ``multi`` additionally implies ``polygon``.

    NOTE(review): matching is by substring, so e.g. "line" is found inside
    "baseline" and "event" inside "prevent" -- confirm this is acceptable.
    """
    lowered = text.lower()
    vocabularies = (
        ("point", POINT_TERMS),
        ("polygon", POLYGON_TERMS),
        ("polyline", POLYLINE_TERMS),
        ("multi", MULTI_TERMS),
        ("event", EVENT_TERMS),
        ("infrastructure", INFRA_TERMS),
        ("boundary", BOUNDARY_TERMS),
        ("observation", OBSERVATION_TERMS),
    )
    found = {
        label
        for label, terms in vocabularies
        if any(term in lowered for term in terms)
    }
    # A "multi" mention is also counted as a polygon mention.
    if "multi" in found:
        found.add("polygon")
    return found
    
def geospatial_faithfulness(geo_profile, description_features):
    """Fraction of the profile's geometry type and spatial role found in a description.

    Args:
        geo_profile: Object exposing ``geometry_type`` and ``spatial_role`` attributes.
        description_features: Collection of labels extracted from a description.

    Returns:
        ``0.0``, ``0.5`` or ``1.0``: one half for the geometry type being
        present in ``description_features`` and one half for the spatial role.
    """
    # Two equally weighted checks: geometry first, then spatial role.
    expected_labels = (geo_profile.geometry_type, geo_profile.spatial_role)
    matched = sum(1 for label in expected_labels if label in description_features)
    return matched / len(expected_labels)

def compute_faithfulness_scores(evaluation_record):
    """Build the per-dataset geospatial faithfulness table.

    Args:
        evaluation_record: Sequence of record dicts, each providing
            ``geo_profile``, ``baseline_description`` and ``geo_description``.

    Returns:
        A DataFrame with one row per record and the columns ``Dataset``,
        ``Spatial Role``, ``Geometry Type``, ``Baseline GFS`` and ``Geo GFS``.
    """
    def _score_record(position, record):
        """Score both descriptions of one record against its geo profile."""
        profile = record["geo_profile"]
        baseline_features = extract_geo_features(record["baseline_description"])
        geo_features = extract_geo_features(record["geo_description"])
        return {
            "Dataset": f"D{position}",
            "Spatial Role": profile.spatial_role,
            "Geometry Type": profile.geometry_type,
            "Baseline GFS": geospatial_faithfulness(profile, baseline_features),
            "Geo GFS": geospatial_faithfulness(profile, geo_features),
        }

    # Dataset IDs are 1-based and follow the order of the records.
    return pd.DataFrame([
        _score_record(position, record)
        for position, record in enumerate(evaluation_record, start=1)
    ])

In [43]:
# Score every collected record and display the resulting table.
faithfulness_table = compute_faithfulness_scores(evaluation_records)
faithfulness_table

Unnamed: 0,Dataset,Spatial Role,Geometry Type,Baseline GFS,Geo GFS
0,D1,infrastructure,point,0.5,1.0
1,D2,boundary,multi,0.5,1.0
2,D3,event,point,0.5,1.0
3,D4,infrastructure,multi,0.5,1.0
4,D5,boundary,multi,0.5,1.0
