[Reference](https://python.plainenglish.io/how-duckdb-makes-your-python-data-analysis-and-pipelines-fast-and-simple-a8bb792ec5d5)

# True Data Integrity with ACID Transactions

In [1]:
import duckdb

# Connects to or creates the 'pipeline.duckdb' file
con = duckdb.connect('pipeline.duckdb')

try:
    # Atomicity in action: the statements between BEGIN and COMMIT are applied
    # together. If step 2 fails, the ROLLBACK below undoes step 1 as well.
    con.execute("BEGIN TRANSACTION;")

    # 1. Complex operation 1: create a staging table from the input CSV
    con.execute("CREATE TABLE staging_data AS SELECT * FROM read_csv_auto('input.csv');")

    # 2. Complex operation 2: this step may fail because the file is not found.
    #    The path is quoted so DuckDB reads it as a file; unquoted, it would be
    #    parsed as a schema-qualified table name instead.
    con.execute("INSERT INTO final_table SELECT * FROM 'non_existent_file.csv';")

    # Commit only if all steps succeed
    con.execute("COMMIT;")
    print("Transaction succeeded.")
except Exception as e:
    print(f"Transaction failed: {e}")
    # Explicitly roll back so no partial changes from the failed transaction
    # are kept.
    con.execute("ROLLBACK;")
finally:
    # Always release the database file, even if the rollback itself raises.
    con.close()

Transaction failed: IO Error: No files found that match the pattern "input.csv"


# The Single, Portable .duckdb File

In [2]:
import duckdb

# The entire database lives in this single file
db_file_path = 'project_warehouse.duckdb'

# Execute schema and data operations. The `with` block closes the connection
# when it exits, including when one of the statements raises.
# NOTE: re-running this against an existing file fails at CREATE TABLE,
# because the `sales` table is already stored in the file.
with duckdb.connect(database=db_file_path) as you_conn:
    you_conn.execute("CREATE TABLE sales (id INTEGER, amount DOUBLE, region VARCHAR);")
    you_conn.execute("INSERT INTO sales VALUES (1, 100.50, 'East');")

# Share 'project_warehouse.duckdb' with your colleague.
# They can reopen it instantly with all data intact.
with duckdb.connect(database=db_file_path) as conn_by_colleague:
    results_reopen = conn_by_colleague.execute("SELECT COUNT(*) FROM sales;").fetchone()

# fetchone() returns a single row tuple; element 0 is the COUNT(*) value.
print(f"File Size Check: {results_reopen[0]} records found.")

File Size Check: 1 records found.


# Eliminating Dependency Hell with a Built-in Ecosystem

In [4]:
# Install and then load the httpfs extension through plain SQL statements;
# DuckDB takes care of the underlying networking dependencies.
# NOTE(review): this cell relies on an open `con` created in an earlier cell;
# confirm it has not been closed before running.
for extension_statement in ("INSTALL httpfs;", "LOAD httpfs;"):
    con.execute(extension_statement)

# S3 credentials are assumed to be configured already.

# Aggregate over a remote Parquet file addressed directly by its S3 URL.
remote_query = """
SELECT
    sum(trip_distance),
    count(*)
FROM
    read_parquet('s3://nyc-tlc/trip_data/yellow_tripdata_2023-01.parquet')
WHERE
    passenger_count > 2
LIMIT 10;
"""

# Run the query, pull every result row into Python, and show them.
remote_result = con.execute(remote_query)
remote_data = remote_result.fetchall()
print(f"Results from S3: {remote_data}")

# Integration with Pandas and Polars

In [5]:
import duckdb
import pandas as pd

# An in-memory database is enough for a quick demo; nothing is written to disk.
con = duckdb.connect(':memory:')

# 1. Build a 1000-row Pandas DataFrame: an integer id and value = 1.5 * id.
row_ids = range(1000)
df_pandas = pd.DataFrame({
    'id': row_ids,
    'value': [1.5 * row_id for row_id in row_ids],
})

# 2. Query the DataFrame directly with SQL. The FROM clause refers to the
#    Python variable `df_pandas` by name, so it does not have to be loaded
#    into a separate table first (and the variable must keep that name).
sql_query = """
SELECT
    sum(value) as total,
    count(id)  as total_number
FROM
    df_pandas
WHERE
    value > 500;
"""

# Execute the query and convert the result back into a Pandas DataFrame.
query_result = con.execute(sql_query)
result_df = query_result.df()

# Print the heading (preceded by a blank line) and then the result frame.
print("\nQuerying a Pandas DF with DuckDB SQL:", result_df, sep="\n")


Querying a Pandas DF with DuckDB SQL:
      total  total_number
0  665833.5           666


# Seamless File Integration for Data Lake Queries

```
-- Pure-SQL query over a local data lake: join the users master Parquet file
-- to the purchase JSON exports and keep purchases with more than 5 items.
SELECT
    users.user_id,
    purchases.purchase_date,
    purchases.item_count
FROM read_parquet('./local_users_master.parquet') AS users
INNER JOIN read_json_auto('./api_purchases_*.json') AS purchases -- glob pattern matches every export file
    ON users.user_id = purchases.user_id
WHERE purchases.item_count > 5;
```