# Generating DB

To make our lives easier, we'll use [DuckDB's TPC-H extension](https://duckdb.org/docs/extensions/tpch.html) to generate everything in chunks. Let's start with a scale-factor-1 database (`sf=1`, roughly 1 GB of raw TPC-H data).

In [1]:
import datetime
import duckdb
import matplotlib.pyplot as plt
import matplotlib.tri as mtri
import numpy as np
import pandas as pd
import seaborn as sns
import time

from tqdm.notebook import tqdm

In [2]:
# Open the DuckDB database file; `con` is the connection every later cell queries through.
con = duckdb.connect("tpch_sf1.db")

In [3]:
# Generate the TPC-H tables with DuckDB's tpch extension, one chunk at a time.
SCALE_FACTOR = 1   # TPC-H scale factor passed to dbgen
N_CHUNKS = 10      # dbgen `children`; the loop must cover steps 0 .. N_CHUNKS - 1

con.execute("INSTALL tpch; LOAD tpch")

# The database lives on disk, so an earlier run may already have populated it.
# Skip generation in that case so that Restart & Run All does not feed the same
# chunks into the existing tables a second time.
# NOTE(review): a run interrupted part-way also leaves `lineitem` behind; delete
# the database file to force a clean regeneration.
tables_present = con.execute(
    "SELECT count(*) FROM information_schema.tables WHERE table_name = 'lineitem'"
).fetchone()[0]
if not tables_present:
    for idx in tqdm(range(N_CHUNKS)):
        con.execute(f"CALL dbgen(sf={SCALE_FACTOR}, children={N_CHUNKS}, step={idx})")

  0%|          | 0/10 [00:00<?, ?it/s]

In [4]:
# Turn off DuckDB's own progress bar for the queries that follow.
con.execute('SET enable_progress_bar = false')

<duckdb.duckdb.DuckDBPyConnection at 0x7fd36aa04cf0>

# TPCH Query 3

```sql
SELECT
    l_orderkey,
    sum(l_extendedprice * (1 - l_discount)) as revenue,
    o_orderdate,
    o_shippriority
FROM
    customer,
    orders,
    lineitem
WHERE
    c_mktsegment = 'BUILDING'
    AND c_custkey = o_custkey
    AND l_orderkey = o_orderkey
    AND o_orderdate < date '1995-03-15'
    AND l_shipdate > date '1995-03-15'
GROUP BY
    l_orderkey,
    o_orderdate,
    o_shippriority
ORDER BY
    revenue desc,
    o_orderdate
LIMIT 20;
```

In this instance we will vary the date constants in the `o_orderdate` and `l_shipdate` predicates.

In [5]:
# TPC-H Q3 with the two date constants replaced by the named parameters
# `$orderdate` and `$shipdate`; the sweep cell binds them through `params=`.
query_template = """
    SELECT
        l_orderkey,
        sum(l_extendedprice * (1 - l_discount)) as revenue,
        o_orderdate,
        o_shippriority
    FROM
        customer,
        orders,
        lineitem
    WHERE
        c_mktsegment = 'BUILDING'
        AND c_custkey = o_custkey
        AND l_orderkey = o_orderkey
        AND o_orderdate < $orderdate
        AND l_shipdate > $shipdate
    GROUP BY
        l_orderkey,
        o_orderdate,
        o_shippriority
    ORDER BY
        revenue desc,
        o_orderdate
    LIMIT 20;
"""

Query the database to find the minimum and maximum values of both `o_orderdate` and `l_shipdate`. We then sweep over each range in fixed steps of a chosen number of days.

In [6]:
def daterange(start_date: datetime.date, end_date: datetime.date, day_jumps=1):
    """Lazily yield dates from ``start_date`` up to, but not including, ``end_date``.

    Args:
        start_date: First date produced.
        end_date: Exclusive upper bound of the range.
        day_jumps: Number of days between consecutive dates.

    Returns:
        A generator of dates ``start_date``, ``start_date + day_jumps`` days, ...
        for every offset in ``range(0, (end_date - start_date).days, day_jumps)``.
    """
    span_days = (end_date - start_date).days
    offsets = range(0, span_days, day_jumps)
    return (start_date + datetime.timedelta(days=offset) for offset in offsets)

In [7]:
# Fetch both bounds of o_orderdate with a single query over `orders`
# instead of one query per bound.
min_orderdate, max_orderdate = con.sql(
    "SELECT MIN(o_orderdate), MAX(o_orderdate) FROM orders"
).fetchone()
(min_orderdate, max_orderdate)

(datetime.date(1992, 1, 1), datetime.date(1998, 8, 2))

In [8]:
# Fetch both bounds of l_shipdate with a single query over `lineitem`
# instead of one query per bound.
min_shipdate, max_shipdate = con.sql(
    "SELECT MIN(l_shipdate), MAX(l_shipdate) FROM lineitem"
).fetchone()
(min_shipdate, max_shipdate)

(datetime.date(1992, 1, 2), datetime.date(1998, 12, 1))

In [None]:
# Time TPC-H Q3 at every point of an (orderdate, shipdate) grid.
N_TRIALS = 3    # timed repetitions per grid point
DAY_STEP = 30   # grid spacing, in days, along both date axes

orderdates = list(daterange(min_orderdate, max_orderdate, DAY_STEP))
shipdates = list(daterange(min_shipdate, max_shipdate, DAY_STEP))

records = []
for orderdate in tqdm(orderdates):
    for shipdate in tqdm(shipdates, leave=False):
        params = {'orderdate': orderdate, 'shipdate': shipdate}
        elapsed_times = {}
        for trial in range(N_TRIALS):
            # perf_counter is the clock intended for measuring short intervals;
            # time.time() can jump if the system clock is adjusted.
            # NOTE(review): only the con.sql call is timed and the result is never
            # fetched -- confirm the query is fully executed at this point.
            start = time.perf_counter()
            res = con.sql(query_template, params=params)
            elapsed_times[f'elapsed_{trial}'] = time.perf_counter() - start
        # One row per grid point: the two predicate values plus one column per trial.
        records.append({**params, **elapsed_times})

table = pd.DataFrame(records)

# The plotting cells below read a single `elapsed` column; summarise the trials
# with their median so one slow (e.g. first, cold) run does not dominate.
trial_columns = [f'elapsed_{trial}' for trial in range(N_TRIALS)]
table['elapsed'] = table[trial_columns].median(axis=1)
table

  0%|          | 0/81 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

In [None]:
# Show the per-trial timings left over from the last grid point of the sweep.
elapsed_times

In [None]:
# table.to_csv('tpch_q3_sweep.csv')

In [None]:
# Lay the timings out on a shipdate-by-orderdate grid and draw them as a heatmap.
elapsed_grid = table.pivot(index="shipdate", columns="orderdate", values="elapsed")
colorbar_options = {'label': 'Query Time (s)'}
_ = sns.heatmap(elapsed_grid, cbar_kws=colorbar_options)

In [None]:
# 3D surface of query time over the (shipdate, orderdate) grid.
# Dates are converted to their ordinal numbers to serve as x/y coordinates.
shipdate_ordinals = table['shipdate'].apply(lambda day: day.toordinal())
orderdate_ordinals = table['orderdate'].apply(lambda day: day.toordinal())

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(projection='3d')
surf = ax.plot_trisurf(
    shipdate_ordinals,
    orderdate_ordinals,
    table['elapsed'],
    cmap=plt.cm.viridis,
    linewidth=0.2,
)

# Hide the tick labels (they would show raw ordinals) and name the axes instead.
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_xlabel('Shipdate')
ax.set_ylabel('Orderdate')
plt.show()

In [None]:
# Keep only grid points whose order date and ship date both fall strictly
# between 1994-01-01 and 1994-05-01.
window_start = datetime.date(1994, 1, 1)
window_end = datetime.date(1994, 5, 1)

orderdate_in_window = (table['orderdate'] < window_end) & (table['orderdate'] > window_start)
shipdate_in_window = (table['shipdate'] < window_end) & (table['shipdate'] > window_start)
table[orderdate_in_window & shipdate_in_window]