# TPC-H

TPC-H is a decision support benchmark that offers business-oriented ad hoc queries.
More information can be found [here](http://www.tpc.org/tpch).

The queries are originally in SQL format and here they are implemented using the pandas API.

### Notes on running these queries:

By default, the queries run with Bodo, so the data is distributed in chunks across processes.

The current results are based on running on one **m5.8xlarge** instance (16 cores, 128GiB memory)

Dataset size is 2GB.

There's another dataset available on "s3://bodo-examples-data/tpch/s4/" which is 4GB.

To run the code:
1. Make sure you add your AWS account credentials to access the data. 
2. If you want to run a query in regular pandas:
    1. Comment out the magic expression (`%%px`) in the AWS credentials cell, the import cell, the data-loading cells you need, and the query cells you want to run.
    2. Comment out the Bodo decorator (`@bodo.jit`) on those data-loading functions and on the query that you'll run.
    3. Re-run the import cells.
    4. Re-run the functions in the [loading data](#loading_data) section to have the required dataset in one process.


In [1]:
%%px
import os

# Credentials are deliberately NOT hard-coded in this notebook: access keys
# committed to a shared file are readable by everyone who can see it. Any key
# pair that was previously committed here should be treated as compromised and
# rotated. Supply AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY through the
# environment of the process running this cell (or use another credential
# mechanism, e.g. an instance role) before running the loading cells.
_missing_aws_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.environ.get(name)
]
if _missing_aws_vars:
    # Warn instead of raising: credentials may legitimately come from a source
    # other than these two environment variables.
    print("Warning: not set in the environment: ", ", ".join(_missing_aws_vars))

os.environ["AWS_DEFAULT_REGION"] = "us-east-2"

In [2]:
%%px
import bodo
import time
import numpy as np
import pandas as pd

<a id="loading_data"></a>
## Loading data

In this section, we load the data required by the queries into pandas DataFrames.

In [3]:
%%px
@bodo.jit(distributed=["rel"], cache=True)
def load_lineitem(data_folder):
    """Read ``lineitem.tbl`` from ``data_folder`` into a DataFrame.

    The file is read as header-less, ``|``-separated text with explicit
    column names and dtypes; the columns at positions 10, 11 and 12 (ship,
    commit and receipt dates) are parsed as dates. The elapsed read time
    and ``rel.head()`` are printed before the DataFrame is returned.
    """
    start = time.time()
    column_names = [
        "L_ORDERKEY",
        "L_PARTKEY",
        "L_SUPPKEY",
        "L_LINENUMBER",
        "L_QUANTITY",
        "L_EXTENDEDPRICE",
        "L_DISCOUNT",
        "L_TAX",
        "L_RETURNFLAG",
        "L_LINESTATUS",
        "L_SHIPDATE",
        "L_COMMITDATE",
        "L_RECEIPTDATE",
        "L_SHIPINSTRUCT",
        "L_SHIPMODE",
        "L_COMMENT",
    ]
    column_types = {
        "L_ORDERKEY": np.int64,
        "L_PARTKEY": np.int64,
        "L_SUPPKEY": np.int64,
        "L_LINENUMBER": np.int64,
        "L_QUANTITY": np.float64,
        "L_EXTENDEDPRICE": np.float64,
        "L_DISCOUNT": np.float64,
        "L_TAX": np.float64,
        "L_RETURNFLAG": str,
        "L_LINESTATUS": str,
        "L_SHIPDATE": str,
        "L_COMMITDATE": str,
        "L_RECEIPTDATE": str,
        "L_SHIPINSTRUCT": str,
        "L_SHIPMODE": str,
        "L_COMMENT": str,
    }
    path = data_folder + "/lineitem.tbl"
    # NOTE: the name `rel` is referenced by the decorator's `distributed` list.
    rel = pd.read_csv(
        path,
        sep="|",
        header=None,
        names=column_names,
        dtype=column_types,
        parse_dates=[10, 11, 12],
    )
    elapsed_ms = (time.time() - start) * 1000
    print("Lineitem Reading time: ", elapsed_ms, " (ms)")
    print(rel.head())
    return rel


lineitem = load_lineitem("s3://bodo-examples-data/tpch/s2")

[stdout:0] 
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
Lineitem Reading time:  6526.566982269287  (ms)
   L_ORDERKEY  L_PARTKEY  L_SUPPKEY  L_LINENUMBER  L_QUANTITY  \
0           1     310379      15395             1        17.0   
1           1     134619      14620             2        36.0   
2           1     127400       7401             3         8.0   
3           1       4263       9264             4        28.0   
4           1      48054       3061             5        24.0   

   L_EXTENDEDPRICE  L_DISCOUNT  L_TAX L_RETURNFLAG L_LINESTATUS L_SHIPDATE  \
0         23619.12        0.04   0.02            N            O 1996-03-13   
1         59529.96        0.09   0.06            N            O 1996-04-12   
2         11419.20        0.10   0.02            N            O 1996-01-29   
3         32683.28        0.09   0.06            N            O 1996-04-21 

In [4]:
%%px

@bodo.jit(distributed=["rel"], cache=True)
def load_orders(data_folder):
    """Read ``orders.tbl`` from ``data_folder`` into a DataFrame.

    The file is read as header-less, ``|``-separated text with explicit
    column names and dtypes; the column at position 4 (``O_ORDERDATE``) is
    parsed as a date. The elapsed read time and ``rel.head()`` are printed
    before the DataFrame is returned.
    """
    t1 = time.time()
    file = data_folder + "/orders.tbl"
    cols_names = ['O_ORDERKEY', 'O_CUSTKEY', 'O_ORDERSTATUS',
            'O_TOTALPRICE', 'O_ORDERDATE', 'O_ORDERPRIORITY',
            'O_CLERK', 'O_SHIPPRIORITY', 'O_COMMENT']
    # O_ORDERDATE holds ISO date text and is listed in `parse_dates`, so it is
    # declared as `str` (as load_lineitem does for its date columns) rather
    # than as an integer type, which does not describe the raw field.
    cols = {'O_ORDERKEY' : np.int64, 'O_CUSTKEY' : np.int64, 'O_ORDERSTATUS' : str,
            'O_TOTALPRICE' : np.float64, 'O_ORDERDATE' : str, 'O_ORDERPRIORITY' : str,
            'O_CLERK' : str, 'O_SHIPPRIORITY' : np.int64, 'O_COMMENT' : str}
    # NOTE: the name `rel` is referenced by the decorator's `distributed` list.
    rel = pd.read_csv(file, sep='|', header=None,
        names=cols_names,
        dtype=cols,
        parse_dates=[4]
        )
    print("Orders Reading time: ", ((time.time() - t1) * 1000), " (ms)")
    print(rel.head())
    return rel


orders = load_orders("s3://bodo-examples-data/tpch/s2")

[stdout:0] 
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
Orders Reading time:  1330.5301666259766  (ms)
   O_ORDERKEY  O_CUSTKEY O_ORDERSTATUS  O_TOTALPRICE O_ORDERDATE  \
0           1      73801             O     181503.69  1996-01-02   
1           2     156004             O      49967.96  1996-12-01   
2           3     246628             F     227024.64  1993-10-14   
3           4     273553             O      36018.68  1995-10-11   
4           5      88970             F     112288.43  1994-07-30   

  O_ORDERPRIORITY          O_CLERK  O_SHIPPRIORITY  \
0           5-LOW  Clerk#000001902               0   
1        1-URGENT  Clerk#000001759               0   
2           5-LOW  Clerk#000001909               0   
3           5-LOW  Clerk#000000247               0   
4           5-LOW  Clerk#000001850               0   

                                           O_

In [7]:
%%px

@bodo.jit(distributed=["rel"], cache=True)
def load_customer(data_folder):
    """Read ``customer.tbl`` from ``data_folder`` into a DataFrame.

    The file is read as header-less, ``|``-separated text with explicit
    column names and dtypes. The elapsed read time and ``rel.head()`` are
    printed before the DataFrame is returned.
    """
    start = time.time()
    column_names = [
        "C_CUSTKEY",
        "C_NAME",
        "C_ADDRESS",
        "C_NATIONKEY",
        "C_PHONE",
        "C_ACCTBAL",
        "C_MKTSEGMENT",
        "C_COMMENT",
    ]
    column_types = {
        "C_CUSTKEY": np.int64,
        "C_NAME": str,
        "C_ADDRESS": str,
        "C_NATIONKEY": np.int64,
        "C_PHONE": str,
        "C_ACCTBAL": np.float64,
        "C_MKTSEGMENT": str,
        "C_COMMENT": str,
    }
    path = data_folder + "/customer.tbl"
    # NOTE: the name `rel` is referenced by the decorator's `distributed` list.
    rel = pd.read_csv(
        path,
        sep="|",
        header=None,
        names=column_names,
        dtype=column_types,
    )
    elapsed_ms = (time.time() - start) * 1000
    print("Customer Reading time: ", elapsed_ms, " (ms)")
    print(rel.head())
    return rel


customer = load_customer("s3://bodo-examples-data/tpch/s2")

[stdout:0] 
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
Customer Reading time:  1080.2128314971924  (ms)
   C_CUSTKEY              C_NAME                       C_ADDRESS  C_NATIONKEY  \
0          1  Customer#000000001               IVhzIApeRb ot,c,E           15   
1          2  Customer#000000002  XSTf4,NCwDVaWNe6tEgvwfmRchLXak           13   
2          3  Customer#000000003                    MG9kdTD2WBHm            1   
3          4  Customer#000000004                     XxVSJsLAGtn            4   
4          5  Customer#000000005    KvpyuHCplrB84WgAiGV6sYpZq7Tj            3   

           C_PHONE  C_ACCTBAL C_MKTSEGMENT  \
0  25-989-741-2988     711.56     BUILDING   
1  23-768-687-3665     121.65   AUTOMOBILE   
2  11-719-748-3364    7498.12   AUTOMOBILE   
3  14-128-190-5944    2866.83    MACHINERY   
4  13-750-942-6364     794.47    HOUSEHOLD   

             

In [8]:
%%px

@bodo.jit(distributed=["rel"], cache=True)
def load_nation(data_folder):
    """Read ``nation.tbl`` from ``data_folder`` into a DataFrame.

    The file is read as header-less, ``|``-separated text with explicit
    column names and dtypes. The elapsed read time and ``rel.head()`` are
    printed before the DataFrame is returned.
    """
    start = time.time()
    column_names = [
        "N_NATIONKEY",
        "N_NAME",
        "N_REGIONKEY",
        "N_COMMENT",
    ]
    column_types = {
        "N_NATIONKEY": np.int64,
        "N_NAME": str,
        "N_REGIONKEY": np.int64,
        "N_COMMENT": str,
    }
    path = data_folder + "/nation.tbl"
    # NOTE: the name `rel` is referenced by the decorator's `distributed` list.
    rel = pd.read_csv(
        path,
        sep="|",
        header=None,
        names=column_names,
        dtype=column_types,
    )
    elapsed_ms = (time.time() - start) * 1000
    print("Nation Reading time: ", elapsed_ms, " (ms)")
    print(rel.head())
    return rel


nation = load_nation("s3://bodo-examples-data/tpch/s2")

[stdout:0] 
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
Nation Reading time:  590.7480716705322  (ms)
   N_NATIONKEY     N_NAME  N_REGIONKEY  \
0            0    ALGERIA            0   
1            1  ARGENTINA            1   
2            2     BRAZIL            1   
3            3     CANADA            1   
4            4      EGYPT            4   

                                           N_COMMENT  
0   haggle. carefully final deposits detect slyly...  
1  al foxes promise slyly according to the regula...  
2  y alongside of the pending deposits. carefully...  
3  eas hang ironic, silent packages. slyly regula...  
4  y above the carefully unusual theodolites. fin...  


In [9]:
%%px

@bodo.jit(distributed=["rel"], cache=True)
def load_supplier(data_folder):
    """Read ``supplier.tbl`` from ``data_folder`` into a DataFrame.

    The file is read as header-less, ``|``-separated text with explicit
    column names and dtypes. The elapsed read time and ``rel.head()`` are
    printed before the DataFrame is returned.
    """
    start = time.time()
    column_names = [
        "S_SUPPKEY",
        "S_NAME",
        "S_ADDRESS",
        "S_NATIONKEY",
        "S_PHONE",
        "S_ACCTBAL",
        "S_COMMENT",
    ]
    column_types = {
        "S_SUPPKEY": np.int64,
        "S_NAME": str,
        "S_ADDRESS": str,
        "S_NATIONKEY": np.int64,
        "S_PHONE": str,
        "S_ACCTBAL": np.float64,
        "S_COMMENT": str,
    }
    path = data_folder + "/supplier.tbl"
    # NOTE: the name `rel` is referenced by the decorator's `distributed` list.
    rel = pd.read_csv(
        path,
        sep="|",
        header=None,
        names=column_names,
        dtype=column_types,
    )
    elapsed_ms = (time.time() - start) * 1000
    print("Supplier Reading time: ", elapsed_ms, " (ms)")
    print(rel.head())
    return rel


supplier = load_supplier("s3://bodo-examples-data/tpch/s2")

[stdout:0] 
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
Supplier Reading time:  303.7829399108887  (ms)
   S_SUPPKEY              S_NAME                            S_ADDRESS  \
0          1  Supplier#000000001   N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ   
1          2  Supplier#000000002                89eJ5ksX3ImxJQBvxObC,   
2          3  Supplier#000000003    q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3   
3          4  Supplier#000000004            Bk7ah4CK8SYQTepEmvMkkgMwg   
4          5  Supplier#000000005                    Gcdm2rJRzl5qlTVzc   

   S_NATIONKEY          S_PHONE  S_ACCTBAL  \
0           17  27-918-335-1736    5755.94   
1            5  15-679-861-2259    4032.68   
2            1  11-383-516-1199    4192.40   
3           15  25-843-787-7479    4641.08   
4           11  21-151-690-3663    -283.84   

                                           S_COMMENT  
0      

In [10]:
%%px

@bodo.jit(distributed=["rel"], cache=True)
def load_partsupp(data_folder):
    """Read ``partsupp.tbl`` from ``data_folder`` into a DataFrame.

    The file is read as header-less, ``|``-separated text with explicit
    column names and dtypes. The elapsed read time and ``rel.head()`` are
    printed before the DataFrame is returned.
    """
    start = time.time()
    column_names = [
        "PS_PARTKEY",
        "PS_SUPPKEY",
        "PS_AVAILQTY",
        "PS_SUPPLYCOST",
        "PS_COMMENT",
    ]
    column_types = {
        "PS_PARTKEY": np.int64,
        "PS_SUPPKEY": np.int64,
        "PS_AVAILQTY": np.int64,
        "PS_SUPPLYCOST": np.float64,
        "PS_COMMENT": str,
    }
    path = data_folder + "/partsupp.tbl"
    # NOTE: the name `rel` is referenced by the decorator's `distributed` list.
    rel = pd.read_csv(
        path,
        sep="|",
        header=None,
        names=column_names,
        dtype=column_types,
    )
    elapsed_ms = (time.time() - start) * 1000
    print("Partsupp Reading time: ", elapsed_ms, " (ms)")
    print(rel.head())
    return rel


partsupp = load_partsupp("s3://bodo-examples-data/tpch/s2")

[stdout:0] 
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
Partsupp Reading time:  1413.774013519287  (ms)
   PS_PARTKEY  PS_SUPPKEY  PS_AVAILQTY  PS_SUPPLYCOST  \
0           1           2         3325         771.64   
1           1        5002         8076         993.49   
2           1       10002         3956         337.09   
3           1       15002         4069         357.84   
4           2           3         8895         378.49   

                                          PS_COMMENT  
0  , even theodolites. regular, final theodolites...  
1  ven ideas. quickly even packages print. pendin...  
2  after the fluffily ironic deposits? blithely s...  
3  al, regular dependencies serve carefully after...  
4  nic accounts. final accounts sleep furiously a...  


In [11]:
%%px

@bodo.jit(distributed=["rel"], cache=True)
def load_part(data_folder):
    """Read ``part.tbl`` from ``data_folder`` into a DataFrame.

    The file is read as header-less, ``|``-separated text with explicit
    column names and dtypes. The elapsed read time and ``rel.head()`` are
    printed before the DataFrame is returned.
    """
    start = time.time()
    column_names = [
        "P_PARTKEY",
        "P_NAME",
        "P_MFGR",
        "P_BRAND",
        "P_TYPE",
        "P_SIZE",
        "P_CONTAINER",
        "P_RETAILPRICE",
        "P_COMMENT",
    ]
    column_types = {
        "P_PARTKEY": np.int64,
        "P_NAME": str,
        "P_MFGR": str,
        "P_BRAND": str,
        "P_TYPE": str,
        "P_SIZE": np.int64,
        "P_CONTAINER": str,
        "P_RETAILPRICE": np.float64,
        "P_COMMENT": str,
    }
    path = data_folder + "/part.tbl"
    # NOTE: the name `rel` is referenced by the decorator's `distributed` list.
    rel = pd.read_csv(
        path,
        sep="|",
        header=None,
        names=column_names,
        dtype=column_types,
    )
    elapsed_ms = (time.time() - start) * 1000
    print("Part Reading time: ", elapsed_ms, " (ms)")
    print(rel.head())
    return rel


part = load_part("s3://bodo-examples-data/tpch/s2")

[stdout:0] 
'coroutine' object is not subscriptable.
Will use the value defined in the AWS_DEFAULT_REGION environment variable (or us-east-1 if that is not provided either).
Part Reading time:  488.7721538543701  (ms)
   P_PARTKEY                                    P_NAME          P_MFGR  \
0          1  goldenrod lavender spring chocolate lace  Manufacturer#1   
1          2          blush thistle blue yellow saddle  Manufacturer#1   
2          3       spring green yellow purple cornsilk  Manufacturer#4   
3          4     cornflower chocolate smoke green pink  Manufacturer#3   
4          5             forest brown coral puff cream  Manufacturer#3   

    P_BRAND                   P_TYPE  P_SIZE P_CONTAINER  P_RETAILPRICE  \
0  Brand#13   PROMO BURNISHED COPPER       7   JUMBO PKG          901.0   
1  Brand#13      LARGE BRUSHED BRASS       1     LG CASE          902.0   
2  Brand#42  STANDARD POLISHED BRASS      21   WRAP CASE          903.0   
3  Brand#34       SMALL PLATED BRASS 

## Query Definitions

This section includes some of the TPC-H queries, implemented in Python using the pandas API.

### Q1: Pricing Summary Report Query
This query reports the amount of business that was billed, shipped, and returned.

Make sure you have run **`load_lineitem`** from [loading data section](#loading_data) before running this query.

In [12]:
%%px
@bodo.jit(distributed=["lineitem"], cache=True)
def q1(lineitem):
    """Pricing summary report (TPC-H Q1) expressed with pandas operations.

    Keeps line items with ``L_SHIPDATE <= "1998-09-02"``, derives the
    ``DISC_PRICE`` and ``CHARGE`` columns, aggregates per
    ``(L_RETURNFLAG, L_LINESTATUS)`` and sorts by those two keys. Prints
    the elapsed time and the first 10 result rows, and returns those rows.
    """
    start = time.time()
    shipped = lineitem[lineitem.L_SHIPDATE <= "1998-09-02"]
    # Discounted price is computed once and reused for the charge; the
    # arithmetic is the same left-to-right product as price*(1-disc)*(1+tax).
    discounted = shipped.L_EXTENDEDPRICE * (1 - shipped.L_DISCOUNT)
    shipped["DISC_PRICE"] = discounted
    shipped["CHARGE"] = discounted * (1 + shipped.L_TAX)
    grouped = shipped.groupby(["L_RETURNFLAG", "L_LINESTATUS"], as_index=False)
    aggregations = {
        "L_QUANTITY": ["sum", "mean"],
        "L_EXTENDEDPRICE": ["sum", "mean"],
        "DISC_PRICE": "sum",
        "CHARGE": "sum",
        "L_DISCOUNT": "mean",
        "L_ORDERKEY": "count",
    }
    summary = grouped.agg(aggregations)
    summary = summary.sort_values(["L_RETURNFLAG", "L_LINESTATUS"])
    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(summary.head(10))
    return summary.head(10)


q1_result = q1(lineitem)

[stdout:0] 
Execution time:  849.1220474243164  (ms)
  L_RETURNFLAG L_LINESTATUS   L_QUANTITY            L_EXTENDEDPRICE  \
                                     sum       mean             sum   
2            A            F   75478173.0  25.505699    1.131973e+11   
1            N            F    1966480.0  25.530081    2.946115e+09   
0            N            O  148642120.0  25.495192    2.229036e+11   
3            R            F   75577628.0  25.512150    1.133519e+11   

                   DISC_PRICE        CHARGE L_DISCOUNT L_ORDERKEY  
           mean           sum           sum       mean      count  
2  38251.814164  1.075364e+11  1.118389e+11   0.050004    2959267  
1  38248.316500  2.798797e+09  2.911030e+09   0.049996      77026  
0  38232.562546  2.117623e+11  2.202358e+11   0.049981    5830202  
3  38263.321544  1.076881e+11  1.119943e+11   0.049980    2962417  


### Q3: Shipping Priority Query
This query retrieves the 10 unshipped orders with the highest value.

Make sure you have run **`load_lineitem`, `load_orders`, and `load_customer`** from [loading data section](#loading_data) before running this query.

In [13]:
%%px

@bodo.jit(distributed=["lineitem", "orders", "customer"], cache=True)
def q3(lineitem, orders, customer):
    """Shipping priority query (TPC-H Q3) expressed with pandas operations.

    Joins HOUSEHOLD customers with their orders placed before 1995-03-04
    and the line items shipped after that date, sums the discounted price
    (``TMP``) per order and sorts by it in descending order. Prints the
    elapsed time and the first 10 result rows, and returns those rows.
    """
    cutoff = "1995-03-04"
    start = time.time()

    # Filter each input before joining.
    household = customer[customer.C_MKTSEGMENT == "HOUSEHOLD"]
    early_orders = orders[orders.O_ORDERDATE < cutoff]
    late_items = lineitem[lineitem.L_SHIPDATE > cutoff]

    cust_orders = household.merge(
        early_orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY"
    )
    joined = cust_orders.merge(
        late_items, left_on="O_ORDERKEY", right_on="L_ORDERKEY"
    )
    joined["TMP"] = joined.L_EXTENDEDPRICE * (1 - joined.L_DISCOUNT)

    group_keys = ["L_ORDERKEY", "O_ORDERDATE", "O_SHIPPRIORITY"]
    revenue = joined.groupby(group_keys, as_index=False)["TMP"].sum()
    ranked = revenue.sort_values(["TMP"], ascending=False)
    res = ranked[["L_ORDERKEY", "TMP", "O_ORDERDATE", "O_SHIPPRIORITY"]]

    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(res.head(10))
    return res.head(10)


q3_result = q3(lineitem, orders, customer)

[stdout:0] 
Execution time:  277.27818489074707  (ms)
       L_ORDERKEY          TMP O_ORDERDATE  O_SHIPPRIORITY
22440    11495873  417031.8957  1995-01-17               0
2103      4163074  416152.8027  1995-02-13               0
22319     6487431  412710.0508  1995-02-06               0
9123      5536290  407857.9678  1995-01-31               0
20749    10666915  397407.4185  1995-02-14               0
13060     4225253  391852.7649  1995-02-28               0
13076     6232772  391241.2513  1995-01-17               0
2012      4724865  387185.9724  1995-02-03               0
16296     3900355  386089.5768  1995-02-13               0
5790      2377474  383057.9763  1995-02-26               0


### Q4: Order Priority Checking Query
This query determines how well the order priority system is working and gives an assessment of customer satisfaction.

Make sure you have run **`load_lineitem` and `load_orders`** from [loading data section](#loading_data) before running this query.

In [14]:
%%px
@bodo.jit(distributed=["lineitem", "orders"], cache=True)
def q4(lineitem, orders):
    """Order priority checking query (TPC-H Q4) with pandas operations.

    Takes orders dated in [1993-08-01, 1993-11-01) whose key appears among
    the line items committed before they were received, then counts
    ``O_ORDERKEY`` per ``O_ORDERPRIORITY`` and sorts by priority. Prints the
    elapsed time and the first 10 result rows, and returns those rows.
    """
    window_end = "1993-11-01"
    window_start = "1993-08-01"
    start = time.time()

    late_items = lineitem[lineitem.L_COMMITDATE < lineitem.L_RECEIPTDATE]
    in_window = (orders.O_ORDERDATE >= window_start) & (
        orders.O_ORDERDATE < window_end
    )
    window_orders = orders[in_window]

    # Semi-join: keep orders that have at least one late line item.
    has_late_item = window_orders["O_ORDERKEY"].isin(late_items["L_ORDERKEY"])
    matched = window_orders[has_late_item]

    counts = matched.groupby("O_ORDERPRIORITY", as_index=False)["O_ORDERKEY"].count()
    total = counts.sort_values(["O_ORDERPRIORITY"])

    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(total.head(10))
    return total.head(10)


q4_result = q4(lineitem, orders)

[stdout:0] 
Execution time:  293.0300235748291  (ms)
   O_ORDERPRIORITY  O_ORDERKEY
3         1-URGENT       21039
1           2-HIGH       20986
4         3-MEDIUM       20918
0  4-NOT SPECIFIED       21056
2            5-LOW       21247


### Q6: Forecasting Revenue Change Query
This query quantifies the amount of revenue increase that would have resulted from eliminating certain company-wide discounts in a given percentage range in a given year.

Make sure you have run **`load_lineitem`** from [loading data section](#loading_data) before running this query.

In [15]:
%%px
@bodo.jit(distributed=["lineitem"], cache=True)
def q6(lineitem):
    """Forecasting revenue change query (TPC-H Q6) with pandas operations.

    Sums ``L_EXTENDEDPRICE * L_DISCOUNT`` over line items shipped in
    [1996-01-01, 1997-01-01) with a discount in [0.08, 0.1] and a quantity
    below 24. Prints the elapsed time and the sum, and returns the sum.
    """
    year_start = "1996-01-01"
    year_end = "1997-01-01"
    start = time.time()

    in_year = (lineitem.L_SHIPDATE >= year_start) & (lineitem.L_SHIPDATE < year_end)
    in_discount_band = (lineitem.L_DISCOUNT >= 0.08) & (lineitem.L_DISCOUNT <= 0.1)
    small_quantity = lineitem.L_QUANTITY < 24

    selected = lineitem[in_year & in_discount_band & small_quantity]
    total = (selected.L_EXTENDEDPRICE * selected.L_DISCOUNT).sum()

    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(total)
    return total


q6_result = q6(lineitem)

[stdout:0] 
Execution time:  1285.8550548553467  (ms)
369930968.9013001


### Q9: Product Type Profit Measure Query
This query determines how much profit is made on a given line of parts, broken out by supplier nation and year.

Make sure you have run **`load_lineitem`, `load_orders`, `load_part`, `load_nation`, `load_partsupp`, and `load_supplier`** from [loading data section](#loading_data) before running this query.

In [16]:
%%px
@bodo.jit(distributed=["lineitem", "orders", "part", "nation", "partsupp", "supplier"], cache=True)
def q9(lineitem, orders, part, nation, partsupp, supplier):
    """Product type profit measure query (TPC-H Q9) with pandas operations.

    Restricts to parts whose name contains "ghost", joins line items with
    part, supplier, nation, partsupp and orders, and sums the profit
    (``TMP``: discounted price minus supply cost times quantity) per
    nation name and order year. The result is sorted by nation ascending
    and year descending. Prints the elapsed time and the first 10 result
    rows, and returns those rows.
    """
    start = time.time()

    ghost_parts = part[part.P_NAME.str.contains("ghost")]

    items_parts = lineitem.merge(
        ghost_parts, left_on="L_PARTKEY", right_on="P_PARTKEY"
    )
    with_supplier = items_parts.merge(
        supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY"
    )
    with_nation = with_supplier.merge(
        nation, left_on="S_NATIONKEY", right_on="N_NATIONKEY"
    )
    with_partsupp = partsupp.merge(
        with_nation,
        left_on=["PS_PARTKEY", "PS_SUPPKEY"],
        right_on=["L_PARTKEY", "L_SUPPKEY"],
    )
    joined = with_partsupp.merge(
        orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY"
    )

    discounted_price = joined.L_EXTENDEDPRICE * (1 - joined.L_DISCOUNT)
    supply_cost = (1 * joined.PS_SUPPLYCOST) * joined.L_QUANTITY
    joined["TMP"] = discounted_price - supply_cost
    joined["O_YEAR"] = joined.O_ORDERDATE.dt.year

    profit = joined.groupby(["N_NAME", "O_YEAR"], as_index=False)["TMP"].sum()
    total = profit.sort_values(["N_NAME", "O_YEAR"], ascending=[True, False])

    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(total.head(10))
    return total.head(10)


q9_result = q9(lineitem, orders, part, nation, partsupp, supplier)

[stdout:0] 
Execution time:  704.9009799957275  (ms)
        N_NAME  O_YEAR           TMP
138    ALGERIA    1998  5.356921e+07
43     ALGERIA    1997  9.099153e+07
80     ALGERIA    1996  9.347106e+07
78     ALGERIA    1995  8.867859e+07
155    ALGERIA    1994  8.825740e+07
163    ALGERIA    1993  9.180006e+07
42     ALGERIA    1992  8.803464e+07
29   ARGENTINA    1998  5.681634e+07
39   ARGENTINA    1997  9.869196e+07
173  ARGENTINA    1996  1.025402e+08


### Q10: Returned Item Reporting Query
This query identifies customers who might be having problems with the parts that are shipped to them.

Make sure you have run **`load_lineitem`, `load_orders`, `load_customer`, and `load_nation`** from [loading data section](#loading_data) before running this query.

In [18]:
%%px
@bodo.jit(distributed=["lineitem", "orders", "customer", "nation"], cache=True)
def q10(lineitem, orders, customer, nation):
    """Returned item reporting query (TPC-H Q10) with pandas operations.

    Joins line items flagged "R" with orders dated in
    [1994-11-01, 1995-02-01), their customers and the customers' nations,
    sums the discounted price (``TMP``) per customer and sorts by it in
    descending order. Prints the elapsed time and the first 10 result
    rows, and returns those rows.
    """
    window_start = "1994-11-01"
    window_end = "1995-02-01"
    start = time.time()

    in_window = (orders.O_ORDERDATE >= window_start) & (
        orders.O_ORDERDATE < window_end
    )
    returned = lineitem.L_RETURNFLAG == "R"
    window_orders = orders[in_window]
    returned_items = lineitem[returned]

    items_orders = returned_items.merge(
        window_orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY"
    )
    with_customer = items_orders.merge(
        customer, left_on="O_CUSTKEY", right_on="C_CUSTKEY"
    )
    joined = with_customer.merge(
        nation, left_on="C_NATIONKEY", right_on="N_NATIONKEY"
    )
    joined["TMP"] = joined.L_EXTENDEDPRICE * (1.0 - joined.L_DISCOUNT)

    group_keys = [
        "C_CUSTKEY",
        "C_NAME",
        "C_ACCTBAL",
        "C_PHONE",
        "N_NAME",
        "C_ADDRESS",
        "C_COMMENT",
    ]
    revenue = joined.groupby(group_keys, as_index=False)["TMP"].sum()
    total = revenue.sort_values("TMP", ascending=False)

    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(total.head(10))
    return total.head(10)


q10_result = q10(lineitem, orders, customer, nation)

[stdout:0] 
Execution time:  3168.290853500366  (ms)
       C_CUSTKEY              C_NAME  C_ACCTBAL          C_PHONE  \
53446     202000  Customer#000202000    3155.77  30-860-645-7227   
58620     117250  Customer#000117250    8260.39  17-524-241-3788   
42537      67172  Customer#000067172    1747.27  21-578-917-6336   
68775      87863  Customer#000087863    3105.23  21-393-302-3317   
18890     284104  Customer#000284104    2668.71  32-167-107-4631   
6472      168475  Customer#000168475    3873.61  21-537-772-3811   
23509     217825  Customer#000217825    1753.84  23-765-943-1680   
52391     288742  Customer#000288742    6316.82  12-549-320-8899   
73763      50281  Customer#000050281    3306.39  25-210-337-8539   
43776     159304  Customer#000159304    -342.39  18-681-863-8034   

             N_NAME                             C_ADDRESS  \
53446  SAUDI ARABIA                        RITU1eYat8iNeD   
58620       GERMANY                2N bS9peD0b5Dr3tf Vbxq   
42537          

### Q12: Shipping Modes and Order Priority Query
This query determines whether selecting less expensive modes of shipping is negatively affecting the critical-priority orders by causing more parts to be received by customers after the committed date.

Make sure you have run **`load_lineitem` and `load_orders`** from [loading data section](#loading_data) before running this query.

In [19]:
%%px
@bodo.jit(distributed=["lineitem", "orders"], cache=True)
def q12(lineitem, orders):
    """Shipping modes and order priority query (TPC-H Q12) with pandas.

    Selects MAIL/SHIP line items that were shipped before the commit date,
    committed before the receipt date, and received in
    [1994-01-01, 1995-01-01); joins them with orders and, per ship mode,
    counts high-priority (``g1``) and other (``g2``) orders. Prints the
    elapsed time and the first 10 result rows, and returns those rows.
    """
    window_start = "1994-01-01"
    window_end = "1995-01-01"
    start = time.time()

    before_window_end = (
        (lineitem.L_RECEIPTDATE < window_end)
        & (lineitem.L_COMMITDATE < window_end)
        & (lineitem.L_SHIPDATE < window_end)
    )
    ordered_dates = (lineitem.L_SHIPDATE < lineitem.L_COMMITDATE) & (
        lineitem.L_COMMITDATE < lineitem.L_RECEIPTDATE
    )
    received_in_window = lineitem.L_RECEIPTDATE >= window_start
    mail_or_ship = (lineitem.L_SHIPMODE == "MAIL") | (lineitem.L_SHIPMODE == "SHIP")

    selected = lineitem[
        before_window_end & ordered_dates & received_in_window & mail_or_ship
    ]
    joined = selected.merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY")

    # The helper names g1/g2 appear as column names in the printed result,
    # so they are part of the output and must keep these names.
    def g1(x):
        # Number of entries that are 1-URGENT or 2-HIGH.
        return ((x == "1-URGENT") | (x == "2-HIGH")).sum()

    def g2(x):
        # Number of entries that are neither 1-URGENT nor 2-HIGH.
        return ((x != "1-URGENT") & (x != "2-HIGH")).sum()

    by_mode = joined.groupby("L_SHIPMODE", as_index=False)["O_ORDERPRIORITY"]
    total = by_mode.agg((g1, g2))
    total = total.sort_values("L_SHIPMODE")

    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(total.head(10))
    return total.head(10)


q12_result = q12(lineitem, orders)

[stdout:0] 
Execution time:  557.7070713043213  (ms)
  L_SHIPMODE     g1     g2
1       MAIL  12354  18548
0       SHIP  12430  18644


### Q14: Promotion Effect Query
This query monitors the market response to a promotion such as TV advertisements or a special campaign.

Make sure you have run **`load_lineitem`** and **`load_part`** from [loading data section](#loading_data) before running this query.

In [20]:
%%px
@bodo.jit(distributed=["lineitem", "part"], cache=True)
def q14(lineitem, part):
    """Promotion effect query (TPC-H Q14) with pandas operations.

    For line items shipped in [1994-03-01, 1994-04-01), joined with their
    parts, computes the percentage of discounted revenue (``TMP``) that
    comes from parts whose type starts with "PROMO". Prints the elapsed
    time and the percentage, and returns the percentage.
    """
    month_start = "1994-03-01"
    month_end = "1994-04-01"
    promo_prefix = "PROMO"
    start = time.time()

    in_month = (lineitem.L_SHIPDATE >= month_start) & (lineitem.L_SHIPDATE < month_end)
    month_items = lineitem[in_month]
    joined = month_items.merge(part, left_on="L_PARTKEY", right_on="P_PARTKEY")
    joined["TMP"] = joined.L_EXTENDEDPRICE * (1.0 - joined.L_DISCOUNT)

    promo_rows = joined[joined.P_TYPE.str.startswith(promo_prefix)]
    total = promo_rows.TMP.sum() * 100 / joined.TMP.sum()

    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(total)
    return total


q14_result = q14(lineitem, part)

[stdout:0] 
Execution time:  120.07999420166016  (ms)
16.545810644547494


### Q18: Large Volume Customer Query
This query ranks customers based on their having placed a large quantity order. Large quantity orders are defined as those orders whose total quantity is above a certain level.

Make sure you have run **`load_lineitem`, `load_orders`, and `load_customer`** from [loading data section](#loading_data) before running this query.

In [21]:
%%px
@bodo.jit(distributed=["lineitem", "orders", "customer"], cache=True)
def q18(lineitem, orders, customer):
    """Large volume customer query (TPC-H Q18) with pandas operations.

    Sums ``L_QUANTITY`` per order, keeps orders whose total exceeds 300,
    joins them with orders and customers, and sorts by total price
    descending and then order date ascending. Prints the elapsed time and
    the first 10 result rows, and returns those rows.
    """
    start = time.time()

    qty_per_order = lineitem.groupby("L_ORDERKEY", as_index=False)["L_QUANTITY"].sum()
    large_orders = qty_per_order[qty_per_order.L_QUANTITY > 300]

    with_orders = large_orders.merge(
        orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY"
    )
    with_customer = with_orders.merge(
        customer, left_on="O_CUSTKEY", right_on="C_CUSTKEY"
    )

    group_keys = ["C_NAME", "C_CUSTKEY", "O_ORDERKEY", "O_ORDERDATE", "O_TOTALPRICE"]
    summed = with_customer.groupby(group_keys, as_index=False)["L_QUANTITY"].sum()
    total = summed.sort_values(
        ["O_TOTALPRICE", "O_ORDERDATE"], ascending=[False, True]
    )

    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(total.head(10))
    return total.head(10)


q18_result = q18(lineitem, orders, customer)

[stdout:0] 
Execution time:  786.4959239959717  (ms)
                C_NAME  C_CUSTKEY  O_ORDERKEY O_ORDERDATE  O_TOTALPRICE  \
8   Customer#000256240     256240     4722021  1994-04-07     543948.47   
37  Customer#000192203     192203     5984582  1992-03-16     539085.20   
33  Customer#000273004     273004    11785570  1996-07-18     535097.55   
49  Customer#000198385     198385     8574884  1992-07-04     530902.09   
26  Customer#000186325     186325     9436480  1992-05-22     523925.49   
12  Customer#000048682      48682     1474818  1992-11-15     522718.60   
59  Customer#000027880      27880     2232932  1997-04-13     519887.44   
54  Customer#000082297      82297     8231942  1993-02-06     516398.50   
24  Customer#000176644     176644    10889601  1992-05-22     513824.71   
21  Customer#000258475     258475     7125602  1994-07-08     509018.85   

    L_QUANTITY  
8        323.0  
37       312.0  
33       303.0  
49       308.0  
26       311.0  
12       302.0  
59

### Q19: Discounted Revenue Query
This query reports the gross discounted revenue attributed to the sale of selected parts handled in a particular manner.

Make sure you have run **`load_lineitem`** and **`load_part`** from [loading data section](#loading_data) before running this query.

In [22]:
%%px
@bodo.jit(distributed=["lineitem", "part"], cache=True)
def q19(lineitem, part):
    """Discounted revenue query (TPC-H Q19) with pandas operations.

    Pre-filters line items (quantity bands, ship instruction, ship mode)
    and parts (size, brand, container), joins them on the part key, then
    applies the three combined brand/container/quantity/size conditions to
    the joined rows and sums ``L_EXTENDEDPRICE * (1 - L_DISCOUNT)``.
    Prints the elapsed time and the sum, and returns the sum.
    """
    Brand31 = "Brand#31"
    Brand43 = "Brand#43"
    SMBOX = "SM BOX"
    SMCASE = "SM CASE"
    SMPACK = "SM PACK"
    SMPKG = "SM PKG"
    MEDBAG = "MED BAG"
    MEDBOX = "MED BOX"
    MEDPACK = "MED PACK"
    MEDPKG = "MED PKG"
    LGBOX = "LG BOX"
    LGCASE = "LG CASE"
    LGPACK = "LG PACK"
    LGPKG = "LG PKG"
    DELIVERINPERSON = "DELIVER IN PERSON"
    AIR = "AIR"
    # The TPC-H Q19 text uses the literal 'AIR REG' (with a space); the
    # previous value "AIRREG" was a typo.
    # NOTE(review): generated TPC-H data is understood to spell this mode
    # "REG AIR", in which case neither literal matches any row and results
    # are unchanged -- confirm against the dataset.
    AIRREG = "AIR REG"
    t1 = time.time()
    # Cheap pre-filter on lineitem: union of the three quantity bands plus
    # the ship instruction / ship mode conditions shared by all branches.
    lsel = (
        (
            ((lineitem.L_QUANTITY <= 36) & (lineitem.L_QUANTITY >= 26))
            | ((lineitem.L_QUANTITY <= 25) & (lineitem.L_QUANTITY >= 15))
            | ((lineitem.L_QUANTITY <= 14) & (lineitem.L_QUANTITY >= 4))
        )
        & (lineitem.L_SHIPINSTRUCT == DELIVERINPERSON)
        & ((lineitem.L_SHIPMODE == AIR) | (lineitem.L_SHIPMODE == AIRREG))
    )
    # Pre-filter on part: any of the three size/brand/container branches.
    psel = (part.P_SIZE >= 1) & (
        (
            (part.P_SIZE <= 5)
            & (part.P_BRAND == Brand31)
            & (
                (part.P_CONTAINER == SMBOX)
                | (part.P_CONTAINER == SMCASE)
                | (part.P_CONTAINER == SMPACK)
                | (part.P_CONTAINER == SMPKG)
            )
        )
        | (
            (part.P_SIZE <= 10)
            & (part.P_BRAND == Brand43)
            & (
                (part.P_CONTAINER == MEDBAG)
                | (part.P_CONTAINER == MEDBOX)
                | (part.P_CONTAINER == MEDPACK)
                | (part.P_CONTAINER == MEDPKG)
            )
        )
        | (
            (part.P_SIZE <= 15)
            & (part.P_BRAND == Brand43)
            & (
                (part.P_CONTAINER == LGBOX)
                | (part.P_CONTAINER == LGCASE)
                | (part.P_CONTAINER == LGPACK)
                | (part.P_CONTAINER == LGPKG)
            )
        )
    )
    flineitem = lineitem[lsel]
    fpart = part[psel]
    jn = flineitem.merge(fpart, left_on="L_PARTKEY", right_on="P_PARTKEY")
    # After the join, require the part-side and lineitem-side conditions of
    # the SAME branch to hold together. `&` binds tighter than `|`, so this
    # is (branch 1) | (branch 2) | (branch 3).
    jnsel = (
        (jn.P_BRAND == Brand31)
        & (
            (jn.P_CONTAINER == SMBOX)
            | (jn.P_CONTAINER == SMCASE)
            | (jn.P_CONTAINER == SMPACK)
            | (jn.P_CONTAINER == SMPKG)
        )
        & (jn.L_QUANTITY >= 4)
        & (jn.L_QUANTITY <= 14)
        & (jn.P_SIZE <= 5)
        | (jn.P_BRAND == Brand43)
        & (
            (jn.P_CONTAINER == MEDBAG)
            | (jn.P_CONTAINER == MEDBOX)
            | (jn.P_CONTAINER == MEDPACK)
            | (jn.P_CONTAINER == MEDPKG)
        )
        & (jn.L_QUANTITY >= 15)
        & (jn.L_QUANTITY <= 25)
        & (jn.P_SIZE <= 10)
        | (jn.P_BRAND == Brand43)
        & (
            (jn.P_CONTAINER == LGBOX)
            | (jn.P_CONTAINER == LGCASE)
            | (jn.P_CONTAINER == LGPACK)
            | (jn.P_CONTAINER == LGPKG)
        )
        & (jn.L_QUANTITY >= 26)
        & (jn.L_QUANTITY <= 36)
        & (jn.P_SIZE <= 15)
    )
    jn = jn[jnsel]
    total = (jn.L_EXTENDEDPRICE * (1.0 - jn.L_DISCOUNT)).sum()
    print("Execution time: ", ((time.time() - t1) * 1000), " (ms)")
    print(total)
    return total


q19_result = q19(lineitem, part)

[stdout:0] 
Execution time:  2656.430959701538  (ms)
7178285.3134


### Q20: Potential Part Promotion Query
This query identifies suppliers in a particular nation having selected parts that may be candidates for a promotional offer.

Make sure you have run **`load_lineitem`, `load_part`, `load_nation`, `load_partsupp`, and `load_supplier`** from [loading data section](#loading_data) before running this query.

In [23]:
%%px
@bodo.jit(distributed=["lineitem", "part", "nation", "partsupp", "supplier"], cache=True)
def q20(lineitem, part, nation, partsupp, supplier):
    """Potential part promotion query (TPC-H Q20) with pandas operations.

    For parts whose name starts with "azure", sums the quantity shipped in
    [1996-01-01, 1997-01-01) per (part, supplier, available quantity) and
    keeps the rows where the available quantity exceeds half of that sum.
    The matching suppliers located in JORDAN are returned as distinct
    (``S_NAME``, ``S_ADDRESS``) pairs sorted by name. Prints the elapsed
    time and the first 10 result rows, and returns those rows.
    """
    year_start = "1996-01-01"
    year_end = "1997-01-01"
    start = time.time()

    # Filter each input before joining.
    azure_parts = part[part.P_NAME.str.startswith("azure")]
    jordan = nation[nation.N_NAME == "JORDAN"]
    in_year = (lineitem.L_SHIPDATE >= year_start) & (lineitem.L_SHIPDATE < year_end)
    year_items = lineitem[in_year]

    parts_supp = azure_parts.merge(
        partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY"
    )
    with_items = parts_supp.merge(
        year_items,
        left_on=["PS_PARTKEY", "PS_SUPPKEY"],
        right_on=["L_PARTKEY", "L_SUPPKEY"],
    )

    group_keys = ["PS_PARTKEY", "PS_SUPPKEY", "PS_AVAILQTY"]
    shipped_qty = with_items.groupby(group_keys, as_index=False)["L_QUANTITY"].sum()
    excess_stock = shipped_qty[
        shipped_qty.PS_AVAILQTY > (0.5 * shipped_qty.L_QUANTITY)
    ]

    with_supplier = excess_stock.merge(
        supplier, left_on="PS_SUPPKEY", right_on="S_SUPPKEY"
    )
    in_jordan = jordan.merge(
        with_supplier, left_on="N_NATIONKEY", right_on="S_NATIONKEY"
    )
    name_and_address = in_jordan[["S_NAME", "S_ADDRESS"]]
    total = name_and_address.sort_values("S_NAME").drop_duplicates()

    elapsed_ms = (time.time() - start) * 1000
    print("Execution time: ", elapsed_ms, " (ms)")
    print(total.head(10))
    return total.head(10)


q20_result = q20(lineitem, part, nation, partsupp, supplier)

[stdout:0] 
Execution time:  201.1561393737793  (ms)
                 S_NAME                               S_ADDRESS
12   Supplier#000000824  wJnn6YrLnzsQWLOZNdMSBz1utk9EFS6icrvQyy
47   Supplier#000001385   UUD4ymFUhLSjsYHaBwlWf,1zdlh1vFFQH6Tqo
55   Supplier#000001387         rhjiTEU33edkiQ5BFQsHLQ1h9Zv8Vmb
78   Supplier#000003125   0A 825 JYoZc3fVFu0i6cMLWyo724X Cu77XB
101  Supplier#000004778                   NFVxKobRQOHN B1O0U7U,
125  Supplier#000005087                   q0c6r9wYVQx31IeGBZKfe
122  Supplier#000005991      jFAkBBG3JDqoH8nIXUZGn OzkVVfW6YsNh
156  Supplier#000006252                       UwHj WvLeLCueyOUR
192  Supplier#000008373                             fKWgRbTLSos
211  Supplier#000009780                             uukEUY7b0iw
