# Apache Iceberg with Sedona Spatial Analysis

This notebook demonstrates how to use Apache Iceberg for spatial data storage and management with Sedona for geospatial processing.

In [None]:
# Import required libraries
import sys
sys.path.append('/opt/workspace/config')

from pyspark.sql import SparkSession
from sedona.spark import SedonaContext
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
from iceberg_spatial_config import (
    create_spark_session_with_iceberg,
    create_sedona_context,
    create_spatial_iceberg_table_example,
    spatial_query_example
)

## Initialize Spark Session with Iceberg and Sedona

In [None]:
# Build the base Spark session via the project helper (Iceberg-enabled per its name),
# then wrap it so Sedona's spatial SQL functions are available on `sedona_spark`.
spark = create_spark_session_with_iceberg()
sedona_spark = create_sedona_context(spark)

# Quick sanity output so the reader can see which runtime the notebook is attached to.
# Note: the "Session ID" label actually prints the Spark application id.
print(f"Spark Version: {spark.version}")
print(f"Spark Session ID: {spark.sparkContext.applicationId}")

## Create Iceberg Catalog and Spatial Tables

In [None]:
# The namespace must exist before any table DDL that targets it.
namespace_ddl = "CREATE NAMESPACE IF NOT EXISTS local.spatial_data"
sedona_spark.sql(namespace_ddl)

# DDL for the points-of-interest table: Iceberg-backed, partitioned by category
# and by day of created_at, with explicit compression / object-storage properties.
# IF NOT EXISTS makes this statement safe to re-run.
create_table_sql = """
CREATE TABLE IF NOT EXISTS local.spatial_data.poi (
    id BIGINT,
    name STRING,
    category STRING,
    geometry GEOMETRY,
    elevation DOUBLE,
    population LONG,
    created_at TIMESTAMP,
    metadata MAP<STRING, STRING>
) USING ICEBERG
PARTITIONED BY (category, days(created_at))
TBLPROPERTIES (
    'write.parquet.compression-codec' = 'snappy',
    'write.metadata.compression-codec' = 'gzip',
    'write.object-storage.enabled' = 'true'
)
"""
sedona_spark.sql(create_table_sql)

print("Spatial Iceberg table created successfully!")

## Insert Sample Spatial Data

In [None]:
# Insert sample points of interest.
#
# The table is created with IF NOT EXISTS, so a plain INSERT INTO would append
# the same eight rows again on every "Restart & Run All". To keep this cell
# idempotent, the sample rows are exposed as an inline table and only those
# whose id is not already present in the target table are inserted
# (LEFT ANTI JOIN keeps left-side rows that have no match on the right).
insert_data_sql = """
INSERT INTO local.spatial_data.poi
SELECT s.*
FROM VALUES
    (1, 'San Francisco Bay Area', 'urban', ST_Point(-122.4194, 37.7749), 52.0, 884363, current_timestamp(), map('state', 'CA', 'country', 'USA')),
    (2, 'New York City', 'urban', ST_Point(-74.0060, 40.7128), 10.0, 8336817, current_timestamp(), map('state', 'NY', 'country', 'USA')),
    (3, 'Chicago', 'urban', ST_Point(-87.6298, 41.8781), 181.0, 2746388, current_timestamp(), map('state', 'IL', 'country', 'USA')),
    (4, 'Los Angeles', 'urban', ST_Point(-118.2437, 34.0522), 87.0, 3898747, current_timestamp(), map('state', 'CA', 'country', 'USA')),
    (5, 'Yellowstone National Park', 'park', ST_Point(-110.5885, 44.4280), 2400.0, 0, current_timestamp(), map('state', 'WY', 'country', 'USA')),
    (6, 'Grand Canyon', 'park', ST_Point(-112.1401, 36.1069), 2100.0, 0, current_timestamp(), map('state', 'AZ', 'country', 'USA')),
    (7, 'Miami', 'urban', ST_Point(-80.1918, 25.7617), 2.0, 467963, current_timestamp(), map('state', 'FL', 'country', 'USA')),
    (8, 'Seattle', 'urban', ST_Point(-122.3321, 47.6062), 56.0, 753675, current_timestamp(), map('state', 'WA', 'country', 'USA'))
    AS s(id, name, category, geometry, elevation, population, created_at, metadata)
LEFT ANTI JOIN local.spatial_data.poi t
    ON t.id = s.id
"""

sedona_spark.sql(insert_data_sql)
print("Sample spatial data inserted!")

## Spatial Queries with Iceberg Tables

In [None]:
# Overview of every point of interest: geometry as WKT plus its lon/lat
# components, ordered from most to least populous.
poi_overview_sql = """
    SELECT 
        id,
        name,
        category,
        ST_AsText(geometry) as wkt_geometry,
        ST_X(geometry) as longitude,
        ST_Y(geometry) as latitude,
        elevation,
        population
    FROM local.spatial_data.poi
    ORDER BY population DESC
"""
result = sedona_spark.sql(poi_overview_sql)

# truncate=False so the WKT strings are shown in full
result.show(truncate=False)

## Spatial Analysis Examples

In [None]:
# Find points of interest within 500 miles of San Francisco.
#
# ST_Distance on lon/lat points returns a planar distance in degrees; scaling it
# by a fixed 69 miles/degree overstates east-west separation because a degree of
# longitude gets shorter away from the equator. ST_DistanceSphere returns the
# great-circle distance in metres instead, which is converted to miles here
# (1 mile = 1609.344 m).
# NOTE(review): ST_DistanceSphere expects lon/lat coordinate order in
# Sedona >= 1.5 — confirm the deployed Sedona version.
#
# The distance is computed once in the CTE and reused for both the filter and
# the sort, rather than repeating the expression.
nearby_cities = sedona_spark.sql("""
    WITH pairwise AS (
        SELECT
            p1.name as origin,
            p2.name as destination,
            ST_DistanceSphere(p1.geometry, p2.geometry) / 1609.344 as distance_miles,
            p2.category,
            p2.population
        FROM local.spatial_data.poi p1
        CROSS JOIN local.spatial_data.poi p2
        WHERE p1.name = 'San Francisco Bay Area'
            AND p2.name != 'San Francisco Bay Area'
    )
    SELECT origin, destination, distance_miles, category, population
    FROM pairwise
    WHERE distance_miles <= 500
    ORDER BY distance_miles
""")

print("Cities within 500 miles of San Francisco:")
nearby_cities.show()

In [None]:
# Create 50-mile buffer zones around urban areas.
#
# The output column is named buffer_50miles, but a planar ST_Buffer(geometry, 0.5)
# buffers by 0.5 *degrees* (roughly 34 miles north-south and less east-west),
# not 50 miles. Passing useSpheroid = true makes Sedona interpret the distance
# in metres and build a geodesic buffer; 50 miles = 50 * 1609.344 = 80467.2 m.
# NOTE(review): the useSpheroid argument requires Sedona >= 1.6 — confirm the
# deployed Sedona version.
buffer_analysis = sedona_spark.sql("""
    SELECT
        name,
        category,
        ST_AsText(ST_Buffer(geometry, 80467.2, true)) as buffer_50miles,
        population,
        elevation
    FROM local.spatial_data.poi
    WHERE category = 'urban'
    ORDER BY population DESC
""")

print("Buffer zones around urban areas:")
buffer_analysis.show(truncate=False)

## Time Travel with Iceberg

In [None]:
# Query the table's `history` metadata table, newest entry first.
history_sql = """
    SELECT * FROM local.spatial_data.poi.history
    ORDER BY made_current_at DESC
"""
history = sedona_spark.sql(history_sql)

print("Table history:")
history.show(truncate=False)

In [None]:
# Query the table's `snapshots` metadata table: one row per snapshot,
# most recent commit first.
snapshots_sql = """
    SELECT snapshot_id, committed_at, operation, summary 
    FROM local.spatial_data.poi.snapshots
    ORDER BY committed_at DESC
"""
snapshots = sedona_spark.sql(snapshots_sql)

print("Table snapshots:")
snapshots.show(truncate=False)

## Visualization with GeoPandas and Folium

In [None]:
# Pull the POI attributes plus plain lon/lat columns to the driver as a pandas
# DataFrame (the geometry column itself is not selected).
poi_coordinates_sql = """
    SELECT 
        name,
        category,
        ST_X(geometry) as longitude,
        ST_Y(geometry) as latitude,
        population,
        elevation
    FROM local.spatial_data.poi
"""
pandas_df = sedona_spark.sql(poi_coordinates_sql).toPandas()

# Rebuild point geometries from the lon/lat columns and attach them,
# tagging the frame as WGS84 (EPSG:4326).
poi_points = gpd.points_from_xy(pandas_df.longitude, pandas_df.latitude)
gdf = gpd.GeoDataFrame(pandas_df, geometry=poi_points, crs='EPSG:4326')

print(f"GeoDataFrame shape: {gdf.shape}")
gdf.head()

In [None]:
# Centre the interactive map on the mean coordinate of all points.
center_lat = gdf['latitude'].mean()
center_lon = gdf['longitude'].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=4)

# One circle marker per point of interest: red for urban, green for everything else.
for idx, row in gdf.iterrows():
    if row['category'] == 'urban':
        color = 'red'
    else:
        color = 'green'

    popup_text = f"{row['name']}<br>Category: {row['category']}<br>Population: {row['population']:,}<br>Elevation: {row['elevation']}m"

    marker = folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=8,
        popup=popup_text,
        color=color,
        fill=True,
    )
    marker.add_to(m)

# Last expression of the cell: renders the map inline.
m

## Performance Optimization with Iceberg

In [None]:
# List the data files backing the table via its `files` metadata table:
# path, format, row count and size of each file.
table_files_sql = """
    SELECT file_path, file_format, record_count, file_size_in_bytes
    FROM local.spatial_data.poi.files
"""
table_files = sedona_spark.sql(table_files_sql)

print("Table files information:")
table_files.show(truncate=False)

In [None]:
# Compact the table's data files by invoking the catalog's
# rewrite_data_files stored procedure on the POI table.
compaction_call_sql = """
    CALL local.system.rewrite_data_files(
        table => 'local.spatial_data.poi'
    )
"""
sedona_spark.sql(compaction_call_sql)

print("Table optimized!")

## Cleanup

In [None]:
# Stop the Spark session created at the top of the notebook.
# Cells that use `spark` / `sedona_spark` will not work after this runs;
# re-run the initialization cell to continue.
spark.stop()
print("Spark session stopped.")