In [None]:
# --- Input files (payments dataset) ---
CSV_PATH, PARQUET_PATH = "data/payments.csv", "data/payments.parquet"

# --- Column roles ---
# The default payments schema includes, among others:
#   step, type, amount, nameOrig, oldbalanceOrg, nameDest, isFraud, ...
ID_COL = "nameOrig"      # key for the "single-row fetch" query shape; values may repeat
GROUP_COL = "type"       # column the aggregation groups by
MEASURE_COL = "amount"   # numeric column fed to AVG / SUM

# --- Timing loop ---
# WARMUPS = untimed calls made before measuring, RUNS = timed calls per query.
WARMUPS, RUNS = 2, 10

In [None]:

from pathlib import Path

# Resolve both dataset locations up front; later cells reuse these Path objects.
csv_path, pq_path = Path(CSV_PATH), Path(PARQUET_PATH)

# Fail fast, before any benchmark runs, if either input file is absent.
both_present = csv_path.exists() and pq_path.exists()
if not both_present:
    message_lines = [
        "Missing dataset files. Expected:",
        f"  {csv_path}",
        f"  {pq_path}",
        "Ask the instructor or place them into the data/ folder.",
    ]
    raise FileNotFoundError("\n".join(message_lines))

print("OK: dataset files found")

In [None]:
import importlib.util, sys, subprocess

# Make sure DuckDB is importable from this kernel, installing a pinned build if it is absent.
if importlib.util.find_spec("duckdb") is None:
    print("Installing duckdb...")
    # sys.executable is the interpreter running this kernel, so pip installs into
    # the same environment the notebook imports from.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "duckdb==1.1.3"])
    # The package appeared after the interpreter started; refresh the import
    # system's finder caches so a later `import duckdb` can see it.
    importlib.invalidate_caches()
else:
    print("duckdb already installed")

In [None]:
import time, statistics
import pandas as pd

def percentiles(samples_ms, ps=(0.50, 0.95, 0.99)):
    """Summarise timing samples as percentiles plus the arithmetic mean.

    Percentiles are linearly interpolated between the two closest ranks, so
    with few samples the tail values (p95, p99) stay distinct and can reach
    the slowest sample instead of all collapsing onto the same lower one.

    Args:
        samples_ms: iterable of numeric samples (milliseconds); must be non-empty.
        ps: percentile fractions, each within [0, 1].

    Returns:
        dict with one ``"p<percent>"`` entry per fraction (e.g. ``"p50"``,
        ``"p95"``, ``"p99"``) and an ``"avg_ms"`` entry holding the mean.

    Raises:
        ValueError: if there are no samples or a fraction is outside [0, 1].
    """
    xs = sorted(samples_ms)
    if not xs:
        raise ValueError("percentiles() needs at least one sample")

    last = len(xs) - 1
    out = {}
    for p in ps:
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"percentile fraction must be within [0, 1], got {p!r}")
        pos = last * p              # fractional rank within the sorted samples
        lo = int(pos)               # pos >= 0, so truncation is floor
        hi = min(lo + 1, last)
        # ':g' formatting avoids float-truncation mislabels such as 0.29 -> "p28".
        out[f"p{p * 100:g}"] = xs[lo] + (xs[hi] - xs[lo]) * (pos - lo)
    out["avg_ms"] = statistics.mean(xs)
    return out

def timeit_ms(fn, warmups=WARMUPS, runs=RUNS):
    """Time repeated calls of a zero-argument callable, in milliseconds.

    `fn` is first called `warmups` times without being measured, then `runs`
    more times with each call timed via `time.perf_counter`.

    Returns:
        Whatever `percentiles` produces for the collected per-call durations.
    """
    clock = time.perf_counter

    # Untimed warm-up calls.
    for _ in range(warmups):
        fn()

    def _elapsed_ms():
        # Wall-clock duration of a single fn() call, converted from s to ms.
        started = clock()
        fn()
        stopped = clock()
        return (stopped - started) * 1000.0

    return percentiles([_elapsed_ms() for _ in range(runs)])

# Shared accumulator: the benchmark cells append one dict per (engine, format, query).
results = list()

In [None]:
def bench_pandas(fmt, path):
    """Benchmark two pandas query shapes on the dataset stored at `path`.

    The file is loaded into a DataFrame once, before any timing starts, so the
    recorded numbers cover only the in-memory queries, not file parsing.

    Args:
        fmt: "csv" to read with `pd.read_csv`; anything else uses `pd.read_parquet`.
        path: location of the dataset file.

    Side effects:
        Replaces any rows previously recorded for ("pandas", fmt) in the shared
        `results` list with two fresh rows ("single_row" and "aggregate"), so
        re-running the cell does not duplicate entries in the final table.

    Raises:
        ValueError: if the dataset has no rows (no key value can be picked).
    """
    df = pd.read_csv(path) if fmt == "csv" else pd.read_parquet(path)
    if df.empty:
        raise ValueError(f"{path} has no rows; cannot pick a key for the single-row query")

    # pick one existing key value for the single-row fetch query shape
    key_value = df[ID_COL].iloc[0]

    def single_row():
        # Boolean-mask filter on the key column, keeping at most one match.
        _ = df.loc[df[ID_COL] == key_value].head(1)

    def aggregate():
        # Mean of the measure column per group.
        _ = df.groupby(GROUP_COL)[MEASURE_COL].mean()

    s1 = timeit_ms(single_row)
    s2 = timeit_ms(aggregate)

    fresh_rows = [
        {"engine": "pandas", "format": fmt, "query": "single_row", **s1},
        {"engine": "pandas", "format": fmt, "query": "aggregate", **s2},
    ]
    # Slice-assign so the shared list object is updated in place.
    results[:] = [
        row for row in results
        if not (row.get("engine") == "pandas" and row.get("format") == fmt)
    ]
    results.extend(fresh_rows)

# Run the pandas benchmark once per storage format.
bench_pandas(fmt="csv", path=csv_path)
bench_pandas(fmt="parquet", path=pq_path)
print("pandas done")

In [None]:
import duckdb

def bench_duckdb(fmt, path):
    """Benchmark two DuckDB query shapes on the dataset stored at `path`.

    The file is materialised into an in-memory table `t` before timing starts,
    so the recorded numbers cover only queries against that table.

    Args:
        fmt: "csv" to load via `read_csv_auto`; anything else uses `read_parquet`.
        path: `pathlib.Path` of the dataset file.

    Side effects:
        Replaces any rows previously recorded for ("duckdb", fmt) in the shared
        `results` list with two fresh rows ("single_row" and "aggregate"), so
        re-running the cell does not duplicate entries in the final table.

    Raises:
        ValueError: if the loaded table has no rows (no key value can be picked).
    """
    con = duckdb.connect(database=":memory:")
    try:
        # Double any single quote so the path stays a single SQL string literal.
        sql_path = path.as_posix().replace("'", "''")

        if fmt == "csv":
            con.execute(
                f"CREATE TABLE t AS SELECT * FROM read_csv_auto('{sql_path}', header=true)"
            )
        else:
            con.execute(f"CREATE TABLE t AS SELECT * FROM read_parquet('{sql_path}')")

        first = con.execute(f"SELECT {ID_COL} FROM t LIMIT 1").fetchone()
        if first is None:
            raise ValueError(f"{path} has no rows; cannot pick a key for the single-row query")
        key_value = first[0]

        q_single = f"SELECT * FROM t WHERE {ID_COL} = ? LIMIT 1"
        # Only the per-group average is computed, matching the aggregate used by
        # the pandas and Spark benchmarks so the three workloads are comparable.
        q_agg = f"SELECT {GROUP_COL}, AVG({MEASURE_COL}) AS avg_amount FROM t GROUP BY {GROUP_COL}"

        def single_row():
            con.execute(q_single, [key_value]).fetchall()

        def aggregate():
            con.execute(q_agg).fetchall()

        s1 = timeit_ms(single_row)
        s2 = timeit_ms(aggregate)
    finally:
        # Release the in-memory database even when loading or a query fails.
        con.close()

    fresh_rows = [
        {"engine": "duckdb", "format": fmt, "query": "single_row", **s1},
        {"engine": "duckdb", "format": fmt, "query": "aggregate", **s2},
    ]
    # Slice-assign so the shared list object is updated in place.
    results[:] = [
        row for row in results
        if not (row.get("engine") == "duckdb" and row.get("format") == fmt)
    ]
    results.extend(fresh_rows)

# Run the DuckDB benchmark once per storage format.
bench_duckdb(fmt="csv", path=csv_path)
bench_duckdb(fmt="parquet", path=pq_path)
print("duckdb done")

In [None]:
# Optional dependency probe: the Spark benchmark below only runs when this stays True.
spark_ok = True
try:
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import avg
except Exception as err:
    # Any failure while importing PySpark disables the Spark section instead of
    # aborting the notebook.
    spark_ok = False
    print(f"PySpark not available: {err}")

if spark_ok:
    # Local-mode session; getOrCreate() reuses an existing session if one is
    # already running, which keeps this cell safe to re-run.
    spark = (
        SparkSession.builder
        .appName("bdm_week01")
        .master("local[*]")
        .config("spark.sql.shuffle.partitions", "8")
        .getOrCreate()
    )

    def bench_spark(fmt, path):
        """Benchmark two Spark query shapes on the dataset stored at `path`.

        Args:
            fmt: "csv" to read with a header row and schema inference; anything
                else reads the path as Parquet.
            path: location of the dataset file (converted with `str`).

        Side effects:
            Replaces any rows previously recorded for ("spark", fmt) in the
            shared `results` list with two fresh rows ("single_row" and
            "aggregate"), so re-running the cell does not duplicate entries.

        Raises:
            ValueError: if the dataset has no rows (no key value can be picked).
        """
        # NOTE(review): df is not cached here, so each collect() below presumably
        # re-scans the source file — confirm that is the intended measurement.
        if fmt == "csv":
            df = (
                spark.read.option("header", "true")
                .option("inferSchema", "true")
                .csv(str(path))
            )
        else:
            df = spark.read.parquet(str(path))

        first = df.select(ID_COL).limit(1).collect()
        if not first:
            raise ValueError(f"{path} has no rows; cannot pick a key for the single-row query")
        key_value = first[0][0]

        def single_row():
            df.filter(df[ID_COL] == key_value).limit(1).collect()

        def aggregate():
            df.groupBy(GROUP_COL).agg(avg(MEASURE_COL).alias("avg_amount")).collect()

        s1 = timeit_ms(single_row)
        s2 = timeit_ms(aggregate)

        fresh_rows = [
            {"engine": "spark", "format": fmt, "query": "single_row", **s1},
            {"engine": "spark", "format": fmt, "query": "aggregate", **s2},
        ]
        # Slice-assign so the shared list object is updated in place.
        results[:] = [
            row for row in results
            if not (row.get("engine") == "spark" and row.get("format") == fmt)
        ]
        results.extend(fresh_rows)

    bench_spark("csv", csv_path)
    bench_spark("parquet", pq_path)
    print("spark done")

In [None]:
# Assemble the collected timings into one table: fixed column order, then
# sorted so the same query is grouped together across engines and formats.
column_order = ["engine", "format", "query", "avg_ms", "p50", "p95", "p99"]
sort_keys = ["query", "engine", "format"]

out = pd.DataFrame(results).loc[:, column_order].sort_values(by=sort_keys)
out