In [None]:
%%px
import time
import numpy as np
import pandas as pd
import bodo

@bodo.jit
def q10(data_folder):
    """Load the four input tables from *data_folder*, run query Q10 and print it.

    The query keeps line items whose return flag is ``"R"`` and orders dated
    in ``[1994-11-01, 1995-02-01)``, joins them with customer and nation,
    sums ``L_EXTENDEDPRICE * (1 - L_DISCOUNT)`` per customer, and prints the
    20 rows with the largest sums. Read time and execution time are printed
    separately. Nothing is returned.
    """
    read_start = time.time()
    lineitem = load_lineitem(data_folder)
    orders = load_orders(data_folder)
    customer = load_customer(data_folder)
    nation = load_nation(data_folder)
    print("Reading time (s): ", time.time() - read_start)

    query_start = time.time()
    window_begin = pd.Timestamp("1994-11-01")
    window_end = pd.Timestamp("1995-02-01")

    # Orders placed inside the half-open date window [window_begin, window_end).
    in_window = (orders.O_ORDERDATE >= window_begin) & (
        orders.O_ORDERDATE < window_end
    )
    recent_orders = orders[in_window]

    # Only line items flagged as returned.
    returned_items = lineitem[lineitem.L_RETURNFLAG == "R"]

    # lineitem -> orders -> customer -> nation, joined on their key columns.
    joined = (
        returned_items.merge(
            recent_orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY"
        )
        .merge(customer, left_on="O_CUSTKEY", right_on="C_CUSTKEY")
        .merge(nation, left_on="C_NATIONKEY", right_on="N_NATIONKEY")
    )

    # Discounted price of each line item; summed per customer below.
    joined["TMP"] = joined.L_EXTENDEDPRICE * (1.0 - joined.L_DISCOUNT)
    per_customer = joined.groupby(
        [
            "C_CUSTKEY",
            "C_NAME",
            "C_ACCTBAL",
            "C_PHONE",
            "N_NAME",
            "C_ADDRESS",
            "C_COMMENT",
        ],
        as_index=False,
    )["TMP"].sum()
    ranked = per_customer.sort_values("TMP", ascending=False)
    print(ranked.head(20))
    print("Q10 Execution time (s): ", time.time() - query_start)


@bodo.jit
def load_lineitem(data_folder):
    """Read ``lineitem.parquet`` under *data_folder* and return the DataFrame.

    The ship, commit and receipt date columns are cast to ``datetime64[ns]``
    and the per-column counts are printed before the frame is returned.
    """
    table = pd.read_parquet(data_folder + "/lineitem.parquet")
    # BodoSQL expects datetime64 date columns; casting here keeps the
    # Bodo version consistent with it.
    table["L_SHIPDATE"] = table["L_SHIPDATE"].astype("datetime64[ns]")
    table["L_COMMITDATE"] = table["L_COMMITDATE"].astype("datetime64[ns]")
    table["L_RECEIPTDATE"] = table["L_RECEIPTDATE"].astype("datetime64[ns]")
    # The count is an action that makes sure all the data is loaded, so the
    # read time is comparable with Spark, where reads can only be isolated
    # by performing an action.
    print(table.count())
    return table

@bodo.jit
def load_orders(data_folder):
    """Read ``orders.parquet`` under *data_folder* and return the DataFrame.

    ``O_ORDERDATE`` is cast to ``datetime64[ns]`` and the per-column counts
    are printed before the frame is returned.
    """
    table = pd.read_parquet(data_folder + "/orders.parquet")
    # BodoSQL expects datetime64 date columns; casting here keeps the
    # Bodo version consistent with it.
    table["O_ORDERDATE"] = table["O_ORDERDATE"].astype("datetime64[ns]")
    # The count is an action that makes sure all the data is loaded, so the
    # read time is comparable with Spark, where reads can only be isolated
    # by performing an action.
    print(table.count())
    return table


@bodo.jit
def load_customer(data_folder):
    """Read ``customer.parquet`` under *data_folder* and return the DataFrame.

    The per-column counts are printed before the frame is returned.
    """
    table = pd.read_parquet(data_folder + "/customer.parquet")
    # The count is an action that makes sure all the data is loaded, so the
    # read time is comparable with Spark, where reads can only be isolated
    # by performing an action.
    print(table.count())
    return table


@bodo.jit(distributed=False)
def load_nation(data_folder):
    """Read ``nation.parquet`` under *data_folder* and return the DataFrame.

    The per-column counts are printed before the frame is returned. The
    frame must be returned: ``q10`` joins against the result of this
    function, so returning nothing would hand ``None`` to the merge.
    """
    # Nation is a very small file so set it to replicated
    data_path = data_folder + "/nation.parquet"
    df = pd.read_parquet(
        data_path,
    )
    # Count to load all the data. This is done for a fair
    # read data comparison with Spark since you must perform
    # an action to isolate reads in Spark
    print(df.count())
    # Previously missing, unlike the other three loaders.
    return df

In [None]:
%%px
# Add a path to your data and any S3 configuration here.
# `data_path` is the folder that holds the input tables: the loaders append
# "/lineitem.parquet", "/orders.parquet", "/customer.parquet" and
# "/nation.parquet" to it, so it should not end with a trailing slash.
data_path = ""
q10(data_path)