# Polars with polars-st Spatial Analysis and Iceberg Integration

This notebook demonstrates how to use Polars with polars-st for high-performance geospatial processing and Apache Iceberg for data lakehouse capabilities.

In [None]:
# Import required libraries
import sys
sys.path.append('/opt/workspace/config')

import polars as pl
import polars_st as st
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Import our configuration
from iceberg_spatial_config import (
    setup_polars_spatial,
    create_polars_config,
    create_spatial_iceberg_table_example,
    spatial_query_example,
    advanced_spatial_operations,
    create_iceberg_catalog
)

# Report the Polars version; reaching this line means the polars-st import
# above did not raise.
print(
    f"Polars version: {pl.__version__}",
    "Polars-st imported successfully",
    sep="\n",
)

## Setup Polars Configuration

In [None]:
# Run the project's Polars setup helper, then build and show the config object
# (both helpers come from iceberg_spatial_config).
setup_polars_spatial()
config = create_polars_config()
print("Polars configuration:", config)

# The Iceberg catalog is optional: if creating it fails for any reason, report
# why and carry on with `catalog = None`.
catalog = None
try:
    catalog = create_iceberg_catalog()
except Exception as catalog_error:
    print(f"Iceberg catalog setup (optional): {catalog_error}")
else:
    print("Iceberg catalog created successfully")

## Create Sample Spatial Data with Polars

In [None]:
# Build the sample table through the project helper; `catalog` may be None
# when the Iceberg setup above failed.
df = create_spatial_iceberg_table_example(catalog, "sample_locations")

# Row count, a blank separator line, then the frame itself.
print(
    f"Created DataFrame with {len(df)} rows",
    "\nDataFrame info:",
    df,
    sep="\n",
)

## Basic Spatial Operations with polars-st

In [None]:
# Run the project's example spatial query against the sample frame and print
# whatever it returns.
result = spatial_query_example(df)
print("Spatial query results:", result, sep="\n")

## Advanced Spatial Analysis

In [None]:
# Run the project's advanced example and show only the columns of interest.
# NOTE(review): assumes the helper returns a frame with "name", "buffer_area"
# and "within_convex_hull" columns — confirm against iceberg_spatial_config.
advanced_result = advanced_spatial_operations(df)
print("Advanced spatial analysis results:")
print(advanced_result.select("name", "buffer_area", "within_convex_hull"))

## Working with Larger Datasets

In [None]:
# Synthetic dataset for the performance demo: uniformly random lon/lat points,
# each with a numeric value and a category label.
n_points = 10000
np.random.seed(42)  # fixed seed so every run draws the same points

# The dict entries are evaluated top to bottom, so the random draws happen in
# the order longitude, latitude, value, category; the point geometry column is
# then derived from the two coordinate columns.
large_df = pl.DataFrame(
    {
        "id": range(n_points),
        "longitude": np.random.uniform(-180, 180, n_points),
        "latitude": np.random.uniform(-90, 90, n_points),
        "value": np.random.uniform(0, 100, n_points),
        "category": np.random.choice(['A', 'B', 'C'], n_points),
    }
).with_columns(geometry=st.from_xy("longitude", "latitude"))

print(f"Created large dataset with {len(large_df):,} points")
print("First few rows:")
print(large_df.head())

## Spatial Filtering and Aggregation

In [None]:
# Bounding box used for filtering (roughly covering Europe), in degrees.
bbox_west, bbox_south = -10.0, 35.0
bbox_east, bbox_north = 40.0, 70.0

# Keep the points whose coordinates fall inside the box. `is_between` is
# inclusive on both ends by default, i.e. the same as `>=` / `<=`; multiple
# predicates passed to `filter` are AND-ed together.
europe_points = large_df.filter(
    pl.col("longitude").is_between(bbox_west, bbox_east),
    pl.col("latitude").is_between(bbox_south, bbox_north),
)

print(f"Points in Europe bounding box: {len(europe_points):,}")

# Per-category row count, mean value and mean coordinates.
# `pl.len()` replaces the deprecated `pl.count()`. The final sort makes the
# printed table reproducible, because `group_by` does not guarantee the order
# of the groups it returns.
category_stats = (
    europe_points.group_by("category")
    .agg(
        pl.len().alias("count"),
        pl.col("value").mean().alias("avg_value"),
        pl.col("longitude").mean().alias("center_lon"),
        pl.col("latitude").mean().alias("center_lat"),
    )
    .sort("category")
)

print("\nCategory statistics:")
print(category_stats)

## Region Polygons for Spatial Joins with polars-st

In [None]:
# Reference regions: each row is a centre point plus a radius, turned into an
# approximately circular polygon below.
regions_df = pl.DataFrame({
    "region_id": [1, 2, 3],
    "region_name": ["Region A", "Region B", "Region C"],
    "center_lon": [0.0, 10.0, 20.0],
    "center_lat": [50.0, 55.0, 45.0],
    "radius": [5.0, 7.0, 6.0]  # degrees
})

# Buffer each centre point by its own radius. polars-st exposes geometry
# operations through the `.st` expression namespace, so the buffer is applied
# as a method on the point expression rather than as a module-level function.
# The radius is in the same units as the coordinates (degrees), so the shapes
# are circles in lon/lat space, not on the ground.
regions_df = regions_df.with_columns(
    st.from_xy("center_lon", "center_lat")
    .st.buffer(pl.col("radius"))
    .alias("region_polygon")
)

print("Regions created:")
print(regions_df.select(["region_id", "region_name", "center_lon", "center_lat", "radius"]))

## Performance Comparison: Lazy vs Eager Evaluation

In [None]:
import time

# Both runs apply the same pipeline: keep rows with value > 50, then add a
# 0.1-unit buffer around each geometry. `time.perf_counter()` is used because
# it is a monotonic, high-resolution clock meant for measuring intervals.
# NOTE: this is a single un-repeated measurement and the eager run goes first,
# so treat the reported difference as indicative only.

# Eager evaluation
start_time = time.perf_counter()
eager_result = large_df.filter(
    pl.col("value") > 50
).with_columns(
    # Geometry operations live in the `.st` expression namespace.
    pl.col("geometry").st.buffer(0.1).alias("buffered_geom")
)
eager_time = time.perf_counter() - start_time

print(f"Eager evaluation time: {eager_time:.3f} seconds")
print(f"Result shape: {eager_result.shape}")

# Lazy evaluation: build the query plan first, execute it on `.collect()`.
start_time = time.perf_counter()
lazy_result = large_df.lazy().filter(
    pl.col("value") > 50
).with_columns(
    pl.col("geometry").st.buffer(0.1).alias("buffered_geom")
)
# Only collect when needed
lazy_collected = lazy_result.collect()
lazy_time = time.perf_counter() - start_time

print(f"Lazy evaluation time: {lazy_time:.3f} seconds")
print(f"Result shape: {lazy_collected.shape}")

# Guard the ratio: a zero-length interval would otherwise raise
# ZeroDivisionError.
if lazy_time > 0:
    print(f"Performance improvement: {(eager_time/lazy_time - 1)*100:.1f}%")
else:
    print("Performance improvement: n/a (lazy run too fast to measure)")

## Visualization with Folium

In [None]:
# Take a small, high-value subset so the map stays responsive.
viz_data = europe_points.filter(
    pl.col("value") > 75  # Only high-value points
).head(100)  # Limit for performance

# Pull lon/lat back out of the geometry column (via the `.st` expression
# namespace) and hand a plain pandas frame to the plotting loop.
coords_data = viz_data.with_columns(
    pl.col("geometry").st.x().alias("lon"),
    pl.col("geometry").st.y().alias("lat"),
).select(["lon", "lat", "value", "category"]).to_pandas()

# Centre the map on the mean of the plotted points. With no points the mean
# would be NaN, which is not a usable map location, so fall back to the middle
# of the bounding box that produced `europe_points`.
if coords_data.empty:
    center_lat = (bbox_south + bbox_north) / 2
    center_lon = (bbox_west + bbox_east) / 2
else:
    center_lat = coords_data['lat'].mean()
    center_lon = coords_data['lon'].mean()

m = folium.Map(location=[center_lat, center_lon], zoom_start=4)

# Color mapping for categories; unknown categories are drawn in gray.
colors = {'A': 'red', 'B': 'blue', 'C': 'green'}

# One circle marker per point. `itertuples` avoids building a Series per row
# the way `iterrows` does.
for row in coords_data.itertuples(index=False):
    folium.CircleMarker(
        location=[row.lat, row.lon],
        radius=max(3, row.value / 10),  # Size based on value
        popup=f"Category: {row.category}<br>Value: {row.value:.2f}",
        color=colors.get(row.category, 'gray'),
        fill=True,
        opacity=0.7
    ).add_to(m)

# Last expression of the cell: renders the interactive map.
m

## Data Export and Persistence

In [None]:
# Create output directory
output_dir = Path("/opt/workspace/data/output")
output_dir.mkdir(parents=True, exist_ok=True)

# Export to various formats

# 1. Parquet (efficient for large datasets); the frame is written unchanged,
#    geometry column included.
parquet_path = output_dir / "spatial_data.parquet"
europe_points.write_parquet(parquet_path, compression="snappy")
print(f"Exported {len(europe_points)} rows to {parquet_path}")

# 2. CSV for interoperability: replace the geometry column with its WKT text.
#    Geometry operations live in the `.st` expression namespace.
csv_data = europe_points.with_columns(
    pl.col("geometry").st.to_wkt().alias("geometry_wkt")
).drop("geometry")

csv_path = output_dir / "spatial_data.csv"
csv_data.write_csv(csv_path)
print(f"Exported to CSV: {csv_path}")

# 3. GeoJSON via GeoPandas conversion. Reuse the WKT frame built for the CSV
#    export instead of computing the WKT a second time.
pandas_df = csv_data.to_pandas()

# Parse the WKT column into geometries in one vectorised call.
pandas_df['geometry'] = gpd.GeoSeries.from_wkt(pandas_df['geometry_wkt'])

# The coordinates are longitude/latitude in degrees, so tag the frame as
# EPSG:4326 rather than leaving it without a CRS.
# NOTE(review): assumes the points are WGS84 — adjust if the data uses another
# geographic CRS.
gdf = gpd.GeoDataFrame(
    pandas_df.drop(columns=['geometry_wkt']),
    geometry='geometry',
    crs="EPSG:4326",
)

geojson_path = output_dir / "spatial_data.geojson"
gdf.to_file(geojson_path, driver='GeoJSON')
print(f"Exported to GeoJSON: {geojson_path}")

## Performance Summary and Best Practices

In [None]:
# Recap of the run, using values computed in earlier cells (`n_points`,
# `europe_points`, `eager_time`, `lazy_time`).
print("=== Polars + polars-st Performance Summary ===")
print(f"• Processed {n_points:,} spatial points")
print(f"• Filtered to {len(europe_points):,} points in Europe")
print("• Created buffer zones and performed spatial operations")
# Guard the ratio: a zero-length lazy interval would raise ZeroDivisionError.
if lazy_time > 0:
    print(f"• Lazy evaluation provided {(eager_time/lazy_time - 1)*100:.1f}% performance improvement")
else:
    print("• Lazy evaluation was too fast to measure a performance difference")
print()
print("=== Best Practices for Polars + polars-st ===")
print("1. Use lazy evaluation with .lazy() for complex operations")
print("2. Chain spatial operations efficiently")
print("3. Filter early to reduce data size")
print("4. Use appropriate data types (Float64 for coordinates)")
print("5. Leverage Polars' columnar operations for batch processing")
print("6. Use Parquet format for persistent storage")
print("7. Consider memory usage when working with large geometries")
print()
print("=== Available Spatial Operations (polars-st) ===")
# Constructors/parsers are module-level functions; operations on an existing
# geometry are methods in the `.st` expression namespace.
# NOTE(review): the exact signature of `st.point` was not checked here —
# confirm against the installed polars-st version.
spatial_ops = [
    "st.point(x, y)",
    "st.from_xy(x_col, y_col)",
    "st.from_wkt(wkt_string)",
    "expr.st.to_wkt()",
    "expr.st.distance(other)",
    "expr.st.buffer(distance)",
    "expr.st.area()",
    "expr.st.within(other)",
    "expr.st.contains(other)",
    "expr.st.intersects(other)"
]
for op in spatial_ops:
    print(f"• {op}")

## Cleanup

In [None]:
# Final report: list what is in the output directory with sizes in KB.
print("Analysis completed successfully!")
print(f"Output files saved to: {output_dir}")
print("\nFiles created:")
# `glob` yields entries in no particular order and also matches directories;
# sort for a reproducible listing and keep regular files only, since a
# directory's st_size says nothing about its contents.
for file in sorted(path for path in output_dir.glob("*") if path.is_file()):
    print(f"• {file.name} ({file.stat().st_size / 1024:.1f} KB)")