In [2]:
import duckdb
import os
from dotenv import load_dotenv
from datetime import datetime

ENVIRONMENT = "edge"

# Load the MinIO connection settings from the per-environment dotenv file.
load_dotenv(f"./{ENVIRONMENT}.env")

# Setup MinIO credentials
minio_endpoint = os.getenv("MINIO_ENDPOINT")
minio_access_key = os.getenv("MINIO_ACCESS_KEY")
minio_secret_key = os.getenv("MINIO_SECRET_KEY")

# Fail fast with a readable message: an unset variable would otherwise surface as
# an AttributeError on None below, or be written into the secret as the text 'None'.
_missing_settings = [
    name
    for name, value in (
        ("MINIO_ENDPOINT", minio_endpoint),
        ("MINIO_ACCESS_KEY", minio_access_key),
        ("MINIO_SECRET_KEY", minio_secret_key),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        f"Missing MinIO settings {_missing_settings}; "
        f"check ./{ENVIRONMENT}.env or the process environment."
    )

# The S3 secret's ENDPOINT is given as host[:port] without a protocol, so strip a
# leading scheme. Remember whether it was https so USE_SSL matches the endpoint
# instead of always being forced to false. Endpoints without a scheme keep the
# previous behaviour (no SSL).
minio_endpoint = minio_endpoint.strip()
minio_use_ssl = False
for _scheme, _is_tls in (("http://", False), ("https://", True)):
    if minio_endpoint.lower().startswith(_scheme):
        # Slice off only the leading prefix (str.replace would hit every occurrence).
        minio_endpoint = minio_endpoint[len(_scheme):]
        minio_use_ssl = _is_tls
        break
# A trailing slash is not part of the host and would corrupt the generated URLs.
minio_endpoint = minio_endpoint.rstrip("/")


def _sql_str(value):
    """Escape a value for use inside a single-quoted SQL string literal."""
    return str(value).replace("'", "''")


_use_ssl_literal = "true" if minio_use_ssl else "false"

# Connect to DuckDB (in-memory database)
con = duckdb.connect()

# Install and load necessary extensions
con.sql("INSTALL httpfs; LOAD httpfs;")
con.sql("INSTALL delta; LOAD delta;")

# Configure MinIO Secret.
# Values are escaped because they are interpolated into the statement text.
# Path-style URLs are used for the MinIO endpoint.
con.sql(f"""
    CREATE OR REPLACE SECRET minio_secret (
        TYPE S3,
        KEY_ID '{_sql_str(minio_access_key)}',
        SECRET '{_sql_str(minio_secret_key)}',
        ENDPOINT '{_sql_str(minio_endpoint)}',
        REGION 'us-east-1',
        URL_STYLE 'path',
        USE_SSL '{_use_ssl_literal}'
    );
""")

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

In [3]:
# Delta table location in MinIO; the s3:// scheme makes DuckDB apply the S3 secret.
source_path = "s3://lakehouse/delta/raw_robot_data-avro"

# Local output file: timestamp of this run followed by the environment tag.
export_stamp = datetime.now().strftime("%Y-%m-%d-%H_%M_%S")
file_name = export_stamp + f"-avro-{ENVIRONMENT}.parquet"
print(f"Exporting data to {file_name}...")

# COPY streams the Delta scan straight into a local Parquet file inside DuckDB,
# so no intermediate dataframe is materialised in Python.
query = f"""
    COPY (
        SELECT 
            timestamp, 
            landing_timestamp, 
            parsed_value.*
        FROM delta_scan('{source_path}')
    ) TO '{file_name}' (FORMAT PARQUET)
"""

con.sql(query)
print("Export complete.")

Exporting data to 2025-12-24-16_22_27-avro-edge.parquet...
Export complete.
