In [None]:
# Before running this notebook you need to install BodoSQL on the BCP.
# Please refer to our documentation for information on this process.

In [None]:
%%px
import time
import numpy as np
import pandas as pd
import bodo
import bodosql

@bodo.jit
def q10(data_folder):
    """Load the four input tables and run the Q10 query through BodoSQL.

    The tables are read with the ``load_*`` helpers, the read time is
    printed, and then the SQL below is executed and its result printed.

    The query joins customer, orders, lineitem and nation, keeps line items
    with ``l_returnflag = 'R'`` whose order date is in
    ['1993-10-01', '1994-01-01'), sums
    ``l_extendedprice * (1 - l_discount)`` per customer as ``revenue`` and
    returns the first 20 rows ordered by revenue (descending), then
    ``c_custkey``.

    Args:
        data_folder: Path prefix of the folder holding the ``.pq`` inputs.
    """
    start = time.time()
    lineitem_df = load_lineitem(data_folder)
    orders_df = load_orders(data_folder)
    customer_df = load_customer(data_folder)
    nation_df = load_nation(data_folder)
    print("Reading time (s): ", time.time() - start)

    # Register every DataFrame under the table name the SQL text refers to.
    sql_context = bodosql.BodoSQLContext(
        {
            "lineitem": lineitem_df,
            "orders": orders_df,
            "customer": customer_df,
            "nation": nation_df,
        }
    )
    result = sql_context.sql(
        """select
                c_custkey,
                c_name,
                sum(l_extendedprice * (1 - l_discount)) as revenue,
                c_acctbal,
                n_name,
                c_address,
                c_phone,
                c_comment
            from
                customer,
                orders,
                lineitem,
                nation
            where
                c_custkey = o_custkey
                and l_orderkey = o_orderkey
                and o_orderdate >= '1993-10-01'
                and o_orderdate < '1994-01-01'
                and l_returnflag = 'R'
                and c_nationkey = n_nationkey
            group by
                c_custkey,
                c_name,
                c_acctbal,
                c_phone,
                n_name,
                c_address,
                c_comment
            order by
                revenue desc,
                c_custkey
            limit 20
        """
    )
    print(result)
    # Measured from the same start as the read timer above, so this figure
    # covers loading plus query execution, not the query alone.
    print("Q10 Execution time (s): ", time.time() - start)


@bodo.jit
def load_lineitem(data_folder):
    """Read ``lineitem.pq`` from ``data_folder`` and return it as a DataFrame.

    The three date columns are cast to ``datetime64[ns]`` and the per-column
    counts are printed before the frame is returned.

    Args:
        data_folder: Path prefix of the folder holding ``lineitem.pq``.
    """
    lineitem = pd.read_parquet(data_folder + "/lineitem.pq")
    # BodoSQL wants date columns typed as datetime64, so cast each of them.
    lineitem["L_SHIPDATE"] = lineitem["L_SHIPDATE"].astype("datetime64[ns]")
    lineitem["L_COMMITDATE"] = lineitem["L_COMMITDATE"].astype("datetime64[ns]")
    lineitem["L_RECEIPTDATE"] = lineitem["L_RECEIPTDATE"].astype("datetime64[ns]")
    # Counting makes sure all of the data is loaded here. Spark needs an
    # action to isolate its reads, so this keeps the read-time comparison
    # between the two fair.
    print(lineitem.count())
    return lineitem

@bodo.jit
def load_orders(data_folder):
    """Read ``orders.pq`` from ``data_folder`` and return it as a DataFrame.

    ``O_ORDERDATE`` is cast to ``datetime64[ns]`` and the per-column counts
    are printed before the frame is returned.

    Args:
        data_folder: Path prefix of the folder holding ``orders.pq``.
    """
    orders = pd.read_parquet(data_folder + "/orders.pq")
    # BodoSQL wants date columns typed as datetime64, so cast the order date.
    orders["O_ORDERDATE"] = orders["O_ORDERDATE"].astype("datetime64[ns]")
    # Counting makes sure all of the data is loaded here. Spark needs an
    # action to isolate its reads, so this keeps the read-time comparison
    # between the two fair.
    print(orders.count())
    return orders


@bodo.jit
def load_customer(data_folder):
    """Read ``customer.pq`` from ``data_folder`` and return it as a DataFrame.

    The per-column counts are printed before the frame is returned.

    Args:
        data_folder: Path prefix of the folder holding ``customer.pq``.
    """
    customer = pd.read_parquet(data_folder + "/customer.pq")
    # Counting makes sure all of the data is loaded here. Spark needs an
    # action to isolate its reads, so this keeps the read-time comparison
    # between the two fair.
    print(customer.count())
    return customer


@bodo.jit(distributed=False)
def load_nation(data_folder):
    """Read ``nation.pq`` from ``data_folder`` and return it as a DataFrame.

    The nation file is very small, so the function is compiled with
    ``distributed=False`` to keep the table replicated. The per-column
    counts are printed before the frame is returned.

    Args:
        data_folder: Path prefix of the folder holding ``nation.pq``.
    """
    nation = pd.read_parquet(data_folder + "/nation.pq")
    # Counting makes sure all of the data is loaded here. Spark needs an
    # action to isolate its reads, so this keeps the read-time comparison
    # between the two fair.
    print(nation.count())
    return nation

In [None]:
%%px
# Add a path to your data and any S3 configuration here.
# data_path is the folder that contains lineitem.pq, orders.pq, customer.pq
# and nation.pq; the loaders append "/<table>.pq" to it.
data_path = ""
# Fail fast while the placeholder is still empty: otherwise the loaders would
# try to read "/lineitem.pq" etc. from the filesystem root and fail with a
# much less obvious error.
if not data_path:
    raise ValueError(
        "data_path is empty: set it to the folder containing the .pq files before running q10"
    )
q10(data_path)