In [1]:
import duckdb
import time
import pandas as pd
from tqdm.auto import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
con = duckdb.connect(database="tpch_sf1.db")

In [88]:
#5D version
query_template = """select
	sum(l_extendedprice * l_discount) as revenue
from
	lineitem
where
	l_shipdate >= '$shipdate1'
	and l_shipdate < '$shipdate2'
	and l_discount >= '$discount1' 
    and l_discount < '$discount2'
	and l_quantity < '$quantity';"""


In [3]:
#3D version
query_template = """select
	sum(l_extendedprice * l_discount) as revenue
	from
	lineitem
	where
	l_shipdate >= '$shipdate'
	and l_discount >= '$discount'
	and l_quantity < '$quantity';"""

In [4]:
min_shipdate = con.sql("SELECT MIN(l_shipdate) FROM lineitem").fetchone()[0]
max_shipdate = con.sql("SELECT MAX(l_shipdate) FROM lineitem").fetchone()[0]
(min_shipdate, max_shipdate)

(datetime.date(1992, 1, 2), datetime.date(1998, 12, 1))

In [5]:
min_discount = float(con.sql("SELECT MIN(l_discount) FROM lineitem").fetchone()[0])
max_discount = float(con.sql("SELECT MAX(l_discount) FROM lineitem").fetchone()[0])
(min_discount, max_discount)

(0.0, 0.1)

In [6]:
min_quantity = float(con.sql("SELECT MIN(l_quantity) FROM lineitem").fetchone()[0])
max_quantity = float(con.sql("SELECT MAX(l_quantity) FROM lineitem").fetchone()[0])
(min_quantity, max_quantity)

(1.0, 50.0)

In [7]:
table = []

# Loop over the date range for `shipdate`
for shipdate in tqdm(list(pd.date_range(min_shipdate, max_shipdate, freq='30D'))):
    # Loop over the range for `discount`
    for discount in tqdm(list(np.arange(min_discount, max_discount + 0.01, 0.01)), leave=False):
        # Loop over the range for `quantity`
        for quantity in tqdm(list(np.arange(min_quantity, max_quantity + 1.00, 1.00)), leave=False):
            params = {'shipdate': shipdate, 'discount': discount, 'quantity': quantity}
            for key, val in params.items():
                query_template = query_template.replace(f"${key}", str(val))
            start = time.time()
            res = con.sql(query_template)
            elapsed = time.time() - start
            params['elapsed'] = elapsed
            table.append(params)

table = pd.DataFrame(table)
table

  0%|          | 0/85 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  1%|          | 1/85 [00:00<00:08,  9.73it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  2%|▏         | 2/85 [00:00<00:08,  9.76it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  5%|▍         | 4/85 [00:00<00:07, 11.00it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  7%|▋         | 6/85 [00:00<00:06, 11.46it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  9%|▉         | 8/85 [00:00<00:06, 12.00it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[

Unnamed: 0,shipdate,discount,quantity,elapsed
0,1992-01-02,0.0,1.0,0.000317
1,1992-01-02,0.0,2.0,0.000104
2,1992-01-02,0.0,3.0,0.000082
3,1992-01-02,0.0,4.0,0.000077
4,1992-01-02,0.0,5.0,0.000086
...,...,...,...,...
46745,1998-11-26,0.1,46.0,0.000070
46746,1998-11-26,0.1,47.0,0.000071
46747,1998-11-26,0.1,48.0,0.000071
46748,1998-11-26,0.1,49.0,0.000070
