# Apache Sedona Setup and Basic Operations

This notebook demonstrates how to set up Apache Sedona and perform basic spatial operations.

In [13]:
# Import required libraries
from sedona.spark import SedonaContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
import pandas as pd
import polars as pl
import httpx
import json
from typing import Optional, Dict, Any, Union

In [14]:
# Remote Sedona API client with Polars integration
class RemoteSedonaClient:
    """Client for interacting with remote Sedona application with Polars support."""
    
    def __init__(self, base_url: str = "https://sedona-production.up.railway.app"):
        self.base_url = base_url.rstrip('/')
        self.session = httpx.Client(
            base_url=self.base_url,
            timeout=httpx.Timeout(10.0, connect=5.0),
        )
        
    def health_check(self) -> Dict[str, Any]:
        """Check if the remote Sedona service is available."""
        try:
            response = self.session.get("/health", timeout=10)
            response.raise_for_status()
            return {"status": "healthy", "response": response.json()}
        except Exception as e:
            return {"status": "error", "error": str(e)}
    
    def submit_spatial_query(self, sql_query: str, return_format: str = "polars") -> Union[Dict[str, Any], pl.DataFrame]:
        """Submit a spatial SQL query to the remote service."""
        try:
            payload = {"query": sql_query}
            response = self.session.post(
                "/api/spatial/query", 
                json=payload, 
                timeout=30
            )
            response.raise_for_status()
            data = response.json()
            
            if return_format == "polars" and "data" in data:
                # Convert response to Polars DataFrame
                return pl.DataFrame(data["data"])
            else:
                return {"status": "success", "data": data}
        except Exception as e:
            if return_format == "polars":
                print(f"Error in spatial query: {e}")
                return pl.DataFrame()  # Return empty DataFrame on error
            return {"status": "error", "error": str(e)}
    
    def upload_spatial_data(self, data: Union[list, pl.DataFrame], data_type: str = "points") -> Dict[str, Any]:
        """Upload spatial data to the remote service."""
        try:
            # Convert Polars DataFrame to list of dicts if needed
            if isinstance(data, pl.DataFrame):
                data_list = data.to_dicts()
            else:
                data_list = data
                
            payload = {"data": data_list, "type": data_type}
            response = self.session.post(
                "/api/spatial/upload", 
                json=payload, 
                timeout=60
            )
            response.raise_for_status()
            return {"status": "success", "data": response.json()}
        except Exception as e:
            return {"status": "error", "error": str(e)}
    
    def get_spatial_analysis_polars(self, geometry_data: pl.DataFrame) -> pl.DataFrame:
        """Perform spatial analysis using Polars for local processing."""
        # Example: Calculate distances between points using Polars
        # This demonstrates local Polars processing of spatial data
        if geometry_data.is_empty():
            return pl.DataFrame()
            
        # Cross join to calculate all pairwise distances
        analysis_df = geometry_data.with_columns([
            pl.col("longitude").alias("lon1"),
            pl.col("latitude").alias("lat1"),
            pl.col("name").alias("point1")
        ]).join(
            geometry_data.select([
                pl.col("longitude").alias("lon2"),
                pl.col("latitude").alias("lat2"), 
                pl.col("name").alias("point2")
            ]),
            how="cross"
        ).filter(
            pl.col("point1") != pl.col("point2")
        ).with_columns([
            # Approximate distance calculation (Euclidean)
            ((pl.col("lon2") - pl.col("lon1"))**2 + 
             (pl.col("lat2") - pl.col("lat1"))**2).sqrt().alias("approx_distance")
        ]).select([
            "point1", "point2", "approx_distance"
        ]).sort("approx_distance")
        
        return analysis_df

# Initialize remote client
remote_client = RemoteSedonaClient()
print("Remote Sedona client with Polars support initialized!")
remote_client.health_check()

Remote Sedona client with Polars support initialized!


{'status': 'error', 'error': 'Expecting value: line 1 column 1 (char 0)'}

In [7]:
# Initialize Spark session with Sedona configuration (Local mode)
def create_sedona_session(app_name: str = "SedonaBasics", mode: str = "local") -> Optional[SedonaContext]:
    """Create and configure Sedona session for local processing."""
    if mode == "remote":
        print("Using remote Sedona service - no local session needed")
        return None
        
    spark = SparkSession.builder \
        .appName(app_name) \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator") \
        .config("spark.sql.adaptive.enabled", "false") \
        .getOrCreate()
    
    sedona = SedonaContext.create(spark)
    return sedona

# Choose your mode: "local" or "remote"
MODE = "remote"  # Change to "local" for local processing

if MODE == "local":
    sedona = create_sedona_session()
    print("Local Sedona session created successfully!")
else:
    sedona = None
    health_status = remote_client.health_check()
    print(f"Remote connection status: {health_status}")

Remote connection status: {'status': 'error', 'error': 'Expecting value: line 1 column 1 (char 0)'}


In [None]:
# Test remote API interaction with Polars
if MODE == "remote":
    # Create sample data as Polars DataFrame
    sample_points_pl = pl.DataFrame({
        "name": ["Point_A", "Point_B", "Point_C", "Point_D"],
        "longitude": [-74.0059, -118.2437, -87.6298, -95.3698],
        "latitude": [40.7128, 34.0522, 41.8781, 29.7604]
    })
    
    print("Sample points as Polars DataFrame:")
    print(sample_points_pl)
    
    # Upload Polars DataFrame to remote service
    upload_result = remote_client.upload_spatial_data(sample_points_pl)
    print("\nUpload result:", upload_result)
    
    # Test spatial query returning Polars DataFrame
    spatial_query = """
        SELECT 
            name,
            longitude,
            latitude,
            ST_Point(longitude, latitude) as geometry,
            ST_IsValid(ST_Point(longitude, latitude)) as is_valid
        FROM uploaded_points
    """
    
    # Get results as Polars DataFrame
    query_result_pl = remote_client.submit_spatial_query(spatial_query, return_format="polars")
    print("\nQuery result as Polars DataFrame:")
    print(query_result_pl)
    
    # Perform local Polars analysis
    local_analysis = remote_client.get_spatial_analysis_polars(sample_points_pl)
    print("\nLocal Polars spatial analysis:")
    print(local_analysis)
    
else:
    print("Skipping remote tests - using local mode")

Upload result: {'status': 'error', 'error': "Client error '405 Method Not Allowed' for url 'https://sedona-production.up.railway.app/api/spatial/upload'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/405"}
Query result: {'status': 'error', 'error': "Client error '405 Method Not Allowed' for url 'https://sedona-production.up.railway.app/api/spatial/query'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/405"}


In [None]:
# Advanced Polars spatial operations
if MODE == "remote":
    # Demonstrate more complex Polars operations
    enhanced_points = sample_points_pl.with_columns([
        # Add spatial metadata
        pl.when(pl.col("latitude") > 35).then("Northern").otherwise("Southern").alias("region"),
        pl.when(pl.col("longitude") < -100).then("Western").otherwise("Eastern").alias("side"),
        # Create bounding box approximation
        (pl.col("longitude") - 0.1).alias("bbox_min_lon"),
        (pl.col("longitude") + 0.1).alias("bbox_max_lon"),
        (pl.col("latitude") - 0.1).alias("bbox_min_lat"),
        (pl.col("latitude") + 0.1).alias("bbox_max_lat")
    ])
    
    print("Enhanced points with spatial metadata:")
    print(enhanced_points)
    
    # Group by region and calculate statistics
    region_stats = enhanced_points.group_by("region").agg([
        pl.count("name").alias("point_count"),
        pl.mean("longitude").alias("avg_longitude"),
        pl.mean("latitude").alias("avg_latitude"),
        pl.min("longitude").alias("min_longitude"),
        pl.max("longitude").alias("max_longitude")
    ])
    
    print("\nRegional statistics:")
    print(region_stats)
else:
    print("Skipping advanced Polars operations - using local mode")

In [None]:
# Create geometries using Sedona spatial functions (Local mode only)
if MODE == "local" and sedona:
    points_df.createOrReplaceTempView("points")

    geometry_df = sedona.sql("""
        SELECT 
            name,
            longitude,
            latitude,
            ST_Point(longitude, latitude) as geometry
        FROM points
    """)

    geometry_df.show(truncate=False)
    
    # Convert Spark DataFrame to Polars for further processing
    geometry_pandas = geometry_df.select("name", "longitude", "latitude").toPandas()
    geometry_polars = pl.from_pandas(geometry_pandas)
    print("\nConverted to Polars DataFrame:")
    print(geometry_polars)
else:
    print("Local geometry creation skipped - using remote mode")

Local geometry creation skipped - using remote mode


## Remote vs Local Processing

This notebook now supports both local and remote Sedona processing:

### Remote Mode (MODE = "remote")
- Connects to your deployed app at https://sedona-production.up.railway.app/
- Sends HTTP requests to process spatial data
- Requires your remote service to expose appropriate API endpoints

### Local Mode (MODE = "local") 
- Creates a local Spark/Sedona session
- Processes data on your local machine
- Useful for development and testing

Change the `MODE` variable in cell 3 to switch between modes.

## Key Takeaways

1. Always configure Spark with Kryo serialization for Sedona
2. Use `ST_Point(lon, lat)` to create point geometries
3. Validate geometries with `ST_IsValid()` before operations
4. Sedona SQL functions follow PostGIS naming conventions
5. Remote processing requires API endpoints on your deployed service
6. Consider data transfer costs when using remote processing

## Polars Integration Benefits

Using Polars with your remote Sedona service provides several advantages:

### Performance
- **Fast DataFrame operations**: Polars is optimized for speed and memory efficiency
- **Lazy evaluation**: Build complex query plans before execution
- **Parallel processing**: Automatic parallelization of operations

### Data Processing
- **Easy HTTP response handling**: Convert JSON responses to DataFrames
- **Rich data manipulation**: Filter, group, aggregate spatial data efficiently
- **Memory efficiency**: Better memory usage than Pandas for large datasets

### Integration Patterns
```python
# Upload Polars DataFrame to remote Sedona
result = client.upload_spatial_data(polars_df)

# Get results back as Polars DataFrame
spatial_results = client.submit_spatial_query(sql, return_format="polars")

# Process results with Polars expressions
processed = spatial_results.with_columns([
    pl.col("distance").round(2),
    pl.when(pl.col("distance") < 1.0).then("nearby").otherwise("distant").alias("proximity")
])
```