### 1. Imports

In [1]:
import sys, os
sys.path.append(os.path.abspath(".."))

In [2]:
from google import genai
from autoddg import AutoDDG
import pandas as pd
from pathlib import Path
from geo_profiler.geo_profiler import GeoProfiler
from geo_profiler.models import GeoProfile
from autoddg_geo import AutoDDGGeo
from openai import OpenAI
import time
from rank_bm25 import BM25Okapi
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### 2. Setting up the OpenAI Client, AutoDDG and AutoDDG-Geo

In [3]:
# Make the project root importable (idempotent, so re-running the cell does not
# keep appending duplicates to sys.path).
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Setup OpenAI client.
# The API key is read from the environment instead of being hard-coded in the
# notebook. SECURITY: a key was previously committed in this cell; treat it as
# compromised and rotate it.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "Set the GEMINI_API_KEY environment variable before running this notebook."
    )

client = OpenAI(
    api_key=GEMINI_API_KEY,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/")

# Initialize AutoDDG and AutoDDG Geo with the same client and model.
MODEL_NAME = "gemini-2.5-flash-lite"
autoddg = AutoDDG(client=client, model_name=MODEL_NAME)
autoddg_geo = AutoDDGGeo(client=client, model_name=MODEL_NAME)

### 3. Reading dataset

In [4]:
def read_dataset(path: str):
    """Load a CSV file into a DataFrame and print its (rows, columns) shape.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(path)
    rows_and_columns = frame.shape
    print(rows_and_columns)
    return frame

### 4. Retrieving data profile

In [5]:
def infer_data_profile(dataframe: pd.DataFrame) -> str:
    """Return the data profile that the module-level ``autoddg`` builds for ``dataframe``.

    Thin wrapper around ``autoddg.profile_dataframe``; the result is passed
    back unchanged.

    NOTE(review): callers in this notebook index the result with ``[0]``, so
    the value appears to be a sequence rather than a plain ``str`` — confirm
    and adjust the annotation if so.
    """
    return autoddg.profile_dataframe(dataframe=dataframe)

### 5. Deriving Semantic Profile

In [6]:
def infer_semantic_profile(dataframe: pd.DataFrame) -> str:
    """Return the semantic profile that the module-level ``autoddg`` derives for ``dataframe``.

    Thin wrapper around ``autoddg.analyze_semantics``; the result is passed
    back unchanged.
    """
    return autoddg.analyze_semantics(dataframe=dataframe)

### 6. Deriving Geo Spatial Profile

In [7]:
def infer_geo_profile(semantic_profile: str, df: pd.DataFrame) -> GeoProfile:
    """Return the geo profile produced by the module-level ``autoddg_geo``.

    Args:
        semantic_profile: Semantic profile forwarded as the first argument of
            ``autoddg_geo.analyze_geo``.
        df: DataFrame forwarded as the second argument.

    Returns:
        Whatever ``autoddg_geo.analyze_geo`` returns, unchanged.
    """
    return autoddg_geo.analyze_geo(semantic_profile, df)

### 7. Deriving AutoDDG Description

In [8]:
def generate_dataset_description(sample_csv, dataprofile, use_data_profile, semantic_profile, use_semantic_profile, topic, use_topic):
    """Generate the baseline AutoDDG description for a dataset sample.

    Args:
        sample_csv: CSV text of the dataset sample.
        dataprofile: Either the profile text itself (``str``) or the sequence
            returned by ``infer_data_profile`` whose first element is the
            profile text. Accepting both prevents a string from being indexed
            a second time, which would forward only its first character.
        use_data_profile: Whether the data profile should be used.
        semantic_profile: Semantic profile of the dataset.
        use_semantic_profile: Whether the semantic profile should be used.
        topic: Dataset topic.
        use_topic: Whether the topic should be used.

    Returns:
        The second element of the pair returned by ``autoddg.describe_dataset``
        (the first element is discarded).
    """
    if dataprofile is None or isinstance(dataprofile, str):
        profile_text = dataprofile
    else:
        profile_text = dataprofile[0]

    # topic / use_topic were previously accepted but never forwarded; they are
    # now passed positionally, mirroring the geo-aware wrapper's call.
    # NOTE(review): assumes describe_dataset takes (topic, use_topic) as its
    # 6th and 7th positional parameters — confirm against the AutoDDG API.
    _prompt, description = autoddg.describe_dataset(
        sample_csv,
        profile_text,
        use_data_profile,
        semantic_profile,
        use_semantic_profile,
        topic,
        use_topic,
    )
    return description

### 8. Deriving Geo Aware Description

In [9]:
def generate_geoaware_description(sample_csv, dataprofile, use_data_profile, semantic_profile, use_semantic_profile, topic, use_topic, geo_profile, use_geo_profile):
    """Generate a description that also incorporates the geo profile.

    Args:
        sample_csv: CSV text of the dataset sample.
        dataprofile: Either the profile text itself (``str``) or the sequence
            returned by ``infer_data_profile`` whose first element is the
            profile text. Accepting both prevents a string from being indexed
            a second time, which would forward only its first character.
        use_data_profile: Whether the data profile should be used.
        semantic_profile: Semantic profile of the dataset.
        use_semantic_profile: Whether the semantic profile should be used.
        topic: Dataset topic.
        use_topic: Whether the topic should be used.
        geo_profile: Geo profile of the dataset.
        use_geo_profile: Whether the geo profile should be used.

    Returns:
        The first element of the value returned by
        ``autoddg_geo.generate_geoaware_description``.
    """
    if dataprofile is None or isinstance(dataprofile, str):
        profile_text = dataprofile
    else:
        profile_text = dataprofile[0]

    geo_result = autoddg_geo.generate_geoaware_description(
        sample_csv,
        profile_text,
        use_data_profile,
        semantic_profile,
        use_semantic_profile,
        topic,
        use_topic,
        geo_profile,
        use_geo_profile,
    )
    return geo_result[0]

### 9. Expanding Geo Aware Description for Search

In [10]:
def generate_description_for_search(desc, topic):
    """Expand a description for search via the module-level ``autoddg``.

    Args:
        desc: Description to expand.
        topic: Dataset topic forwarded alongside the description.

    Returns:
        The element at index 1 of the value returned by
        ``autoddg.expand_description_for_search``.
    """
    expansion = autoddg.expand_description_for_search(desc, topic)
    expanded_text = expansion[1]
    return expanded_text

### 10. Generating Descriptions for Various Datasets:

In [11]:
# Accumulator for per-dataset results: generating_descriptions() appends one
# dict per call. Re-running this cell resets the list to empty.
evaluation_records = []

In [12]:
def generating_descriptions(topic, dataset_path, rate_limit_pause=60):
    """Run the full description pipeline for one dataset and record the results.

    Steps: read the CSV, profile it, derive the semantic and geo profiles,
    generate the baseline and geo-aware descriptions, expand the geo-aware
    description for search, and append everything to the module-level
    ``evaluation_records`` list. Each intermediate result is printed.

    Args:
        topic: Human-readable dataset topic; used in prompts and stored in the
            record.
        dataset_path: Path of the CSV file to process.
        rate_limit_pause: Seconds to sleep after each of the three pause
            points. Defaults to the previously hard-coded 60 seconds
            (presumably to respect API rate limits — confirm). Values <= 0
            disable the pauses.

    Returns:
        None. The results are appended to ``evaluation_records``.
    """
    separator = "=========================================================="

    def _pause():
        # Skip sleeping entirely when pauses are disabled.
        if rate_limit_pause and rate_limit_pause > 0:
            time.sleep(rate_limit_pause)

    print(f"Read Dataset {topic}:")
    df = read_dataset(dataset_path)
    # Only the first five rows are serialised for prompts and geo profiling.
    df_sample = df.head(5)
    sample_csv = df_sample.to_csv(index=False)
    print(separator)

    print("Infer data profile:")
    dataprofile = infer_data_profile(df)
    data_profile_text = dataprofile[0]
    print(data_profile_text)
    print(separator)

    print("Infer semantic profile:")
    semantic_profile = infer_semantic_profile(df)
    print(semantic_profile)
    _pause()
    print(separator)

    print("Infer geo profile:")
    geo_profile = infer_geo_profile(semantic_profile, df_sample)
    print(geo_profile)
    print(separator)

    print("Generated user focused AutoDDG description:")
    # Pass the full profile result: the description wrappers take element [0]
    # themselves. Passing the already-indexed text made them index the string
    # again and forward only its first character.
    autoddg_desc = generate_dataset_description(sample_csv, dataprofile, True, semantic_profile, True, topic, True)
    print(autoddg_desc)
    _pause()
    print(separator)

    print("Generated user focused description with Geo profile:")
    autoddg_geo_desc = generate_geoaware_description(sample_csv, dataprofile, True, semantic_profile, True, topic, True, geo_profile, True)
    print(autoddg_geo_desc)
    _pause()
    print(separator)

    print("Generated search focused description with Geo description:")
    search_desc = generate_description_for_search(autoddg_geo_desc, topic)
    print(search_desc)
    print(separator)

    evaluation_records.append({
        "topic": topic,
        # Store the file name, not the whole DataFrame, under "dataset_name".
        "dataset_name": Path(dataset_path).name,
        "data_profile": data_profile_text,
        "semantic_profile": semantic_profile,
        "geo_profile": geo_profile,
        "baseline_description": autoddg_desc,
        "geo_description": autoddg_geo_desc,
        "search_description": search_desc
    })

### 10.1 - Dataset 1: Parking Meters Locations and Status
##### https://data.cityofnewyork.us/Transportation/Parking-Meters-Locations-and-Status/693u-uax6/about_data

In [18]:
# Dataset 1: run the description pipeline on the parking-meters CSV and
# record the results for evaluation.
topic, csv_path = (
    "Parking Meters Locations and Status",
    "../examples/Parking_Meters_Locations_and_Status.csv",
)
print("Generating descriptions...")
generating_descriptions(dataset_path=csv_path, topic=topic)
print("Generated descriptions stored for evaluation...")

Generating descriptions...
Read Dataset Parking Meters Locations and Status:
(15582, 17)
Infer data profile:
The key data profile information for this dataset includes:
**ObjectID**: Data is of type integer. There are 15582 unique values. Coverage spans from 0 to 15326.0. 
**Meter Number**: Data is of type integer. There are 15576 unique values. Coverage spans from 0 to 5043005.0. 
**Status**: Data is of type text. There are 3 unique values. 
**Pay By Cell Number**: Data is of type integer. There are 12092 unique values. Coverage spans from 0 to 509514.0. 
**Meter_Hours**: Data is of type text. 
**Parking_Facility_Name**: Data is of type text. There are 35 unique values. 
**Facility**: Data is of type text. There are 3 unique values. 
**Borough**: Data is of type text. There are 5 unique values. 
**On_Street**: Data is of type text. There are 1446 unique values. 
**Side_of_Street**: Data is of type text. There are 13 unique values. 
**From_Street**: Data is of type text. There are 2341

### 10.2 - Dataset 2: Community Parks Initiative Zone Boundaries
##### https://data.cityofnewyork.us/City-Government/Community-Parks-Initiative-Zone-Boundaries/vvdx-b56i

In [19]:
# Dataset 2: run the description pipeline on the CPI zone-boundaries CSV and
# record the results for evaluation.
topic, csv_path = (
    "Community Parks Initiative Zone Boundaries",
    "../examples/CPI_Zones.csv",
)
print("Generating descriptions...")
generating_descriptions(dataset_path=csv_path, topic=topic)
print("Generated descriptions stored for evaluation...")

Generating descriptions...
Read Dataset Community Parks Initiative Zone Boundaries:
(55, 8)
Infer data profile:
The key data profile information for this dataset includes:
**the_geom**: Data is of type text. 
**BoroCode**: Data is of type integer. There are 5 unique values. Coverage spans from 0 to 5.0. 
**BoroName**: Data is of type text. There are 5 unique values. 
**CountyFIPS**: Data is of type integer. There are 5 unique values. Coverage spans from 0 to 85.0. 
**NTACode**: Data is of type text. There are 55 unique values. 
**NTAName**: Data is of type text. There are 55 unique values. 
**Shape_Leng**: Data is of type text. There are 55 unique values. 
**Shape_Area**: Data is of type text. There are 55 unique values. 
Infer semantic profile:
The key semantic information for this dataset includes:
**the_geom**: Represents geographic entity. Contains spatial data (resolution: Coordinates). Domain-specific type: geospatial. Function/Usage context: spatial representation. 
**BoroCode**

### 10.3 - Dataset 3: Landmarks Complaints
##### https://data.cityofnewyork.us/Housing-Development/Landmarks-Complaints/ck4n-5h6x/about_data

In [14]:
# Dataset 3: run the description pipeline on the landmarks-complaints CSV and
# record the results for evaluation.
topic, csv_path = (
    "Landmark Complaints",
    "../examples/Landmarks_Complaints.csv",
)
print("Generating descriptions...")
generating_descriptions(dataset_path=csv_path, topic=topic)
print("Generated descriptions stored for evaluation...")

Generating descriptions...
Read Dataset Landmark Complaints:
(6689, 18)
Infer data profile:
The key data profile information for this dataset includes:
**Complaint #**: Data is of type text. There are 6686 unique values. 
**Date**: Data is of type text. There are 2727 unique values. 
**Address #**: Data is of type text. There are 1646 unique values. 
**Street Name**: Data is of type text. There are 1382 unique values. 
**Borough**: Data is of type text. There are 11 unique values. 
**Block**: Data is of type integer. There are 1338 unique values. Coverage spans from 0 to 2476.0. 
**Lot**: Data is of type integer. There are 243 unique values. Coverage spans from 0 to 93.0. 
**BIN**: Data is of type integer. There are 4790 unique values. Coverage spans from 0 to 5015074.0. 
**Postcode**: Data is of type integer. There are 149 unique values. Coverage spans from 0 to 11377.0. 
**Landmark Name**: Data is of type text. 
**Work Reported**: Data is of type text. 
**Action Taken**: Data is of t

In [20]:
# Sanity check: number of records accumulated so far (one per
# generating_descriptions() call since evaluation_records was last reset).
print(len(evaluation_records))

3


### 11. Evaluation

In [None]:
# Gold relevance judgements for the retrieval evaluation.
# Each key is a search query string; each value lists the ids of the datasets
# judged relevant to that query. The ids follow the notebook's dataset
# numbering: D1 = Dataset 1, D2 = Dataset 2, D3 = Dataset 3.
gold_data = {

  # EVENT-SPECIFIC (only D3)
  "NYC landmark complaints": ["D3"],
  "historic district violations NYC": ["D3"],
  "NYC construction complaint hotspots": ["D3"],
  "spatio-temporal incident clustering NYC": ["D3"],

  # INFRASTRUCTURE-SPECIFIC (only D1)
  "parking meters locations NYC": ["D1"],
  "street parking availability map": ["D1"],
  "NYC parking infrastructure dataset": ["D1"],

  # BOUNDARY-SPECIFIC (only D2)
  "NYC administrative district polygons": ["D2"],
  "community parks initiative zones": ["D2"],
  "borough-level park boundary maps": ["D2"],

  # MULTI-DATASET QUERIES (D1 + D3)
  "NYC street-level geospatial datasets": ["D1", "D3"],
  "NYC datasets with latitude and longitude for mapping": ["D1", "D3"],

  # MULTI-DATASET QUERIES (D2 + D3)
  "NYC neighborhood-level spatial analysis": ["D2", "D3"],
  "datasets with NTA-level spatial attributes NYC": ["D2", "D3"],

  # MULTI-DATASET QUERIES (D1 + D2)
  "NYC urban planning datasets": ["D1", "D2"],
  "NYC street + zone geospatial datasets": ["D1", "D2"],

  # MULTI-DATASET GENERAL QUERY (D1 + D2 + D3)
  "NYC geospatial datasets for multi-scale analysis": ["D1", "D2", "D3"],
  "datasets supporting citywide spatial analysis": ["D1", "D2", "D3"],
}

In [None]:
# Map each dataset topic (as passed to generating_descriptions) to the id used
# in the gold relevance judgements. Keying by topic rather than by position in
# evaluation_records keeps the mapping correct even if the dataset cells were
# executed out of order.
DATASET_ID_BY_TOPIC = {
    "Parking Meters Locations and Status": "D1",
    "Community Parks Initiative Zone Boundaries": "D2",
    "Landmark Complaints": "D3",
}


def _build_corpus(records, description_field):
    """Return {dataset_id: description} for the records with a known topic.

    If a topic was processed more than once, the most recent record wins.
    The result is ordered by dataset id so the ranking input is stable.
    """
    corpus = {
        DATASET_ID_BY_TOPIC[record["topic"]]: record[description_field]
        for record in records
        if record["topic"] in DATASET_ID_BY_TOPIC
    }
    return dict(sorted(corpus.items()))


# Build both corpora from the recorded pipeline outputs instead of relying on
# ad-hoc variables (baseline_desc_1, geo_desc_1, ...) that are never defined.
corpus_baseline = _build_corpus(evaluation_records, "baseline_description")
corpus_geo = _build_corpus(evaluation_records, "geo_description")

if not corpus_baseline:
    raise RuntimeError(
        "No evaluation records found; run the dataset cells before evaluating."
    )

_missing_ids = sorted(set(DATASET_ID_BY_TOPIC.values()) - set(corpus_baseline))
if _missing_ids:
    print(f"Note: no descriptions recorded for {_missing_ids}; "
          "queries relevant only to those datasets will score 0.")

In [None]:
import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
import math

def dcg_at_k(r, k):
    """Discounted Cumulative Gain over the first ``k`` entries of ``r``.

    Args:
        r: Sequence of relevance gains in ranked order (best-ranked first).
        k: Cut-off; only ``r[:k]`` contributes.

    Returns:
        ``sum(r[i] / log2(i + 2))`` over the truncated list, or ``0.0`` when
        the truncated list is empty.
    """
    # np.asfarray was removed in NumPy 2.0; np.asarray(..., dtype=float) is
    # the documented replacement and behaves the same here.
    gains = np.asarray(r, dtype=float)[:k]
    if gains.size == 0:
        return 0.0
    # Position i (0-based) is discounted by log2(i + 2), so rank 1 has weight 1.
    discounts = np.log2(np.arange(2, gains.size + 2))
    return np.sum(gains / discounts)

def ndcg_at_k(r, k):
    """Normalized DCG at cut-off ``k``.

    The DCG of ``r`` is divided by the DCG of the same gains sorted in
    descending order (the ideal ordering). When the ideal DCG is zero the
    divisor falls back to 1.0, so the result is 0 instead of a division error.
    """
    actual_dcg = dcg_at_k(r, k)
    ideal_ordering = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(ideal_ordering, k)
    divisor = ideal_dcg if ideal_dcg else 1.0
    return actual_dcg / divisor

def average_precision(ranked_list, relevant_set):
    """Average precision of a ranking against a set of relevant items.

    For every position holding a relevant item, the precision at that position
    (relevant items seen so far / rank) is collected; the sum is divided by
    the total number of relevant items.

    Args:
        ranked_list: Items in ranked order, best first.
        relevant_set: Collection of relevant items.

    Returns:
        The average precision, or ``0`` when ``relevant_set`` is empty.
    """
    precisions_at_hits = []
    for rank, item in enumerate(ranked_list, start=1):
        if item in relevant_set:
            # This hit is number len(precisions_at_hits) + 1 so far.
            precisions_at_hits.append((len(precisions_at_hits) + 1) / rank)

    n_relevant = len(relevant_set)
    if n_relevant == 0:
        return 0
    return sum(precisions_at_hits) / n_relevant

def eval_corpus(corpus, gold, k=2):
    """Score BM25 retrieval over ``corpus`` against gold relevance judgements.

    Every document is lower-cased and whitespace-tokenised, indexed with
    ``BM25Okapi``, and ranked for each query in ``gold``. Per-query metrics
    are collected and macro-averaged.

    Args:
        corpus: Mapping of dataset id -> description text.
        gold: Mapping of query string -> iterable of relevant dataset ids.
        k: Cut-off used for Precision@K, Recall@K, Hit@K and nDCG@K.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Precision@K``,
        ``Recall@K``, ``Hit@K``, ``MRR``, ``nDCG@K`` and ``MAP``, each holding
        the mean over all queries.
    """
    doc_ids = list(corpus.keys())
    index = BM25Okapi([corpus[doc_id].lower().split() for doc_id in doc_ids])

    metric_names = ("Precision@K", "Recall@K", "Hit@K", "MRR", "nDCG@K", "MAP")
    per_query = {name: [] for name in metric_names}

    for query_text, gold_ids in gold.items():
        relevant_ids = set(gold_ids)

        # Full ranking of the corpus, highest BM25 score first.
        bm25_scores = index.get_scores(query_text.lower().split())
        ranking = [doc_ids[pos] for pos in np.argsort(bm25_scores)[::-1]]

        # Binary gain per ranked position: 1 when the document is relevant.
        gains = [int(doc_id in relevant_ids) for doc_id in ranking]

        # Counts within the top-k cut-off.
        n_hits = sum(gains[:k])
        precision_at_k = n_hits / k
        recall_at_k = n_hits / (len(relevant_ids) or 1)
        hit_at_k = int(n_hits > 0)

        # Reciprocal rank of the first relevant document (0 when none is found).
        first_hit_rank = next(
            (rank for rank, gain in enumerate(gains, start=1) if gain), None
        )
        reciprocal_rank = 1 / first_hit_rank if first_hit_rank else 0

        per_query["Precision@K"].append(precision_at_k)
        per_query["Recall@K"].append(recall_at_k)
        per_query["Hit@K"].append(hit_at_k)
        per_query["MRR"].append(reciprocal_rank)
        per_query["nDCG@K"].append(ndcg_at_k(gains, k))
        per_query["MAP"].append(average_precision(ranking, relevant_ids))

    # Macro average: one mean per metric, laid out as a single-row table.
    return pd.DataFrame(
        {name: [np.mean(values)] for name, values in per_query.items()}
    )


In [None]:
# Evaluate both corpora against the gold judgements. The judgements dict is
# named `gold_data`; the previous `gold` was never defined and raised NameError.
baseline_scores = eval_corpus(corpus_baseline, gold_data)
geo_scores = eval_corpus(corpus_geo, gold_data)

print(baseline_scores)
print(geo_scores)