[Reference](https://gaurav-adarshi.medium.com/duckdb-the-in-memory-analytics-database-revolutionizing-data-processing-1dd63a23a4c7)

In [1]:
pip install duckdb



In [2]:
import duckdb

# Creating a Database and Table

In [3]:
# Connect to DuckDB (creates an in-memory database)
con = duckdb.connect()
# Create a sample table
con.execute("""
CREATE TABLE employees (
    id INTEGER,
    name VARCHAR,
    department VARCHAR,
    salary INTEGER
)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7bc070102db0>

# Inserting Data

In [4]:
con.execute("""
INSERT INTO employees VALUES
(1, 'Alice', 'Engineering', 100000),
(2, 'Bob', 'HR', 60000),
(3, 'Charlie', 'Engineering', 120000),
(4, 'David', 'Marketing', 70000)
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7bc070102db0>

# Querying Data

In [5]:
# Simple SELECT query
result = con.execute("SELECT * FROM employees").fetchall()
print(result)

[(1, 'Alice', 'Engineering', 100000), (2, 'Bob', 'HR', 60000), (3, 'Charlie', 'Engineering', 120000), (4, 'David', 'Marketing', 70000)]


# Analytical Query

In [6]:
# Aggregation query to find the average salary by department
avg_salary = con.execute("""
SELECT department, AVG(salary) AS avg_salary
FROM employees
GROUP BY department
""").fetchall()
print(avg_salary)

[('HR', 60000.0), ('Marketing', 70000.0), ('Engineering', 110000.0)]


# Basic API Usage

In [7]:
import duckdb
duckdb.sql("SELECT 42").show()

┌───────┐
│  42   │
│ int32 │
├───────┤
│    42 │
└───────┘



In [8]:
import duckdb
r1 = duckdb.sql("SELECT 42 AS i")
duckdb.sql("SELECT i * 2 AS k FROM r1").show()

┌───────┐
│   k   │
│ int32 │
├───────┤
│    84 │
└───────┘



# Data Input

In [10]:
# import duckdb
# duckdb.read_csv("example.csv")                # read a CSV file into a Relation
# duckdb.sql("SELECT * FROM 'example.csv'")     # directly query a CSV file

# DataFrames

In [11]:
import duckdb
# directly query a Pandas DataFrame
import pandas as pd
pandas_df = pd.DataFrame({"a": [42]})
duckdb.sql("SELECT * FROM pandas_df")
# directly query a Polars DataFrame
import polars as pl
polars_df = pl.DataFrame({"a": [42]})
duckdb.sql("SELECT * FROM polars_df")
# directly query a pyarrow table
import pyarrow as pa
arrow_table = pa.Table.from_pydict({"a": [42]})
duckdb.sql("SELECT * FROM arrow_table")

┌───────┐
│   a   │
│ int64 │
├───────┤
│    42 │
└───────┘

# Result Conversion

In [12]:
import duckdb
duckdb.sql("SELECT 42").fetchall()   # Python objects
duckdb.sql("SELECT 42").df()         # Pandas DataFrame
duckdb.sql("SELECT 42").pl()         # Polars DataFrame
duckdb.sql("SELECT 42").arrow()      # Arrow Table
duckdb.sql("SELECT 42").fetchnumpy() # NumPy Arrays

{'42': array([42], dtype=int32)}

# Writing Data to Disk

In [13]:
import duckdb
duckdb.sql("SELECT 42").write_parquet("out.parquet") # Write to a Parquet file
duckdb.sql("SELECT 42").write_csv("out.csv")         # Write to a CSV file
duckdb.sql("COPY (SELECT 42) TO 'out.parquet'")      # Copy to a Parquet file