In [1]:
import os

# Outside the Bodo Platform (workspace UUID variable missing, or set to the
# sentinel 'NA') start a local ipyparallel cluster with MPI engines.
if os.environ.get("BODO_PLATFORM_WORKSPACE_UUID", "NA") == "NA":
    print("You are not on Bodo Platform, running ipyparallel.. ")
    import ipyparallel as ipp
    import psutil

    # cpu_count(logical=False) returns None when the number of physical cores
    # cannot be determined; fall back to logical cores, then to a single
    # engine, so that min() never receives None.
    physical_cores = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1
    # Cap the cluster at 8 engines.
    n = min(physical_cores, 8)
    rc = ipp.Cluster(engines="mpi", n=n).start_and_connect_sync(activate=True)

You are not on Bodo Platform, running ipyparallel.. 
Starting 8 engines with <class 'ipyparallel.cluster.launcher.MPIEngineSetLauncher'>
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:07<00:00,  1.11engine/s]


In [2]:
# Toggle ipyparallel's %autopx mode on: while it is enabled, the cells that
# follow are sent to the engines instead of running in the local kernel.
%autopx

%autopx enabled


In [3]:
import bodo
import time
import numpy as np
import pandas as pd
import json
import os

%px: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:09<00:00,  1.20s/tasks]


You need to provide your AWS account and Iceberg credentials to access the data. Store them in a file named `credentials.json`, structured like the example shown below.
# content of credentials.json file
{
    "aws": {
        "aws_access_key_id": "xxxxxx",   
        "aws_secret_access_key": "xxxxxx",  
        "aws_default_region":"xxxxxx"
    },

    "iceberg": {
        "nessie_endpoint": "xxxxxx",
        "token": "xxxxxx"
    }
}

In [6]:
# Read the connection secrets from a local JSON file.
path_to_conn_creds = "credentials.json"
with open(path_to_conn_creds) as creds_file:
    creds = json.load(creds_file)

# Export the AWS settings as environment variables, one at a time and in the
# same order as the entries below.
aws_creds = creds["aws"]
for env_var, creds_key in (
    ("AWS_ACCESS_KEY_ID", "aws_access_key_id"),
    ("AWS_SECRET_ACCESS_KEY", "aws_secret_access_key"),
    ("AWS_DEFAULT_REGION", "aws_default_region"),
):
    os.environ[env_var] = aws_creds[creds_key]

# Keep the Iceberg catalog endpoint and token for building the connection URL.
iceberg_creds = creds["iceberg"]
Nessie_Endpoint = iceberg_creds["nessie_endpoint"]
Dremio_Token = iceberg_creds["token"]

In [7]:
# Connection string: the endpoint prefixed with "iceberg+", followed by the
# catalog type and bearer-token query parameters.
iceberg_url = (
    f"iceberg+{Nessie_Endpoint}"
    "?type=nessie"
    "&authentication.type=BEARER"
    f"&authentication.token={Dremio_Token}"
)

In [12]:
@bodo.jit
def run_queries(iceberg_url):
    """Load the SF1_LINEITEM_PQ_A table and run query Q01 on it.

    Args:
        iceberg_url: Connection string handed to ``pd.read_sql_table`` as its
            connection argument.

    Prints a banner, reads the table and delegates to ``q01``, which prints
    its own result and timing. Nothing is returned.
    """
    print("#" * 128)
    print("Started Q01 Execution...")
    print("#" * 128)

    # Load the data; the third positional argument of read_sql_table is the
    # schema, passed here as an empty string.
    lineitem = pd.read_sql_table("SF1_LINEITEM_PQ_A", iceberg_url, "")

    q01(lineitem)
    

In [13]:
@bodo.jit
def q01(lineitem):
    """Aggregate line items per (L_RETURNFLAG, L_LINESTATUS) and print the result.

    NOTE(review): this looks like TPC-H Query 1 (pricing summary report);
    confirm against the benchmark specification.

    Args:
        lineitem: DataFrame that provides at least the columns L_QUANTITY,
            L_EXTENDEDPRICE, L_DISCOUNT, L_TAX, L_RETURNFLAG, L_LINESTATUS,
            L_SHIPDATE and L_ORDERKEY.

    Prints the first rows of the sorted aggregate and the elapsed time in
    seconds. Nothing is returned.
    """
    t1 = time.time()
    date = pd.Timestamp("1998-09-02")

    # Keep only the columns the query needs.
    lineitem_filtered = lineitem.loc[
        :,
        [
            "L_QUANTITY",
            "L_EXTENDEDPRICE",
            "L_DISCOUNT",
            "L_TAX",
            "L_RETURNFLAG",
            "L_LINESTATUS",
            "L_SHIPDATE",
            "L_ORDERKEY",
        ],
    ]

    # Keep rows shipped on or before the cutoff date.
    sel = lineitem_filtered.L_SHIPDATE <= date
    lineitem_filtered = lineitem_filtered[sel]

    # Copies of quantity/price so the same source column can be both summed
    # and averaged under distinct output names.
    lineitem_filtered["AVG_QTY"] = lineitem_filtered.L_QUANTITY
    lineitem_filtered["AVG_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE
    lineitem_filtered["DISC_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE * (
        1 - lineitem_filtered.L_DISCOUNT
    )
    lineitem_filtered["CHARGE"] = (
        lineitem_filtered.L_EXTENDEDPRICE
        * (1 - lineitem_filtered.L_DISCOUNT)
        * (1 + lineitem_filtered.L_TAX)
    )

    # Select the aggregated columns with a list: indexing a groupby with a
    # bare tuple of column names is no longer accepted by pandas 2.x.
    gb = lineitem_filtered.groupby(["L_RETURNFLAG", "L_LINESTATUS"], as_index=False)[
        [
            "L_QUANTITY",
            "L_EXTENDEDPRICE",
            "DISC_PRICE",
            "CHARGE",
            "AVG_QTY",
            "AVG_PRICE",
            "L_DISCOUNT",
            "L_ORDERKEY",
        ]
    ]
    total = gb.agg(
        {
            "L_QUANTITY": "sum",
            "L_EXTENDEDPRICE": "sum",
            "DISC_PRICE": "sum",
            "CHARGE": "sum",
            "AVG_QTY": "mean",
            "AVG_PRICE": "mean",
            "L_DISCOUNT": "mean",
            "L_ORDERKEY": "count",
        }
    )
    total = total.sort_values(["L_RETURNFLAG", "L_LINESTATUS"])
    print(total.head())
    print("Q01 Execution time (s): ", time.time() - t1)
    

In [15]:
run_queries(iceberg_url)

[stdout:0] ################################################################################################################################
Started Q01 Execution...
################################################################################################################################
  L_RETURNFLAG L_LINESTATUS  L_QUANTITY  L_EXTENDEDPRICE    DISC_PRICE  \
2            A            F  37734107.0     5.658655e+10  5.375826e+10   
1            N            F    991417.0     1.487505e+09  1.413082e+09   
3            N            O  74476040.0     1.117017e+11  1.061182e+11   
0            R            F  37719753.0     5.656804e+10  5.374129e+10   

         CHARGE    AVG_QTY     AVG_PRICE  L_DISCOUNT  L_ORDERKEY  
2  5.590907e+10  25.522006  38273.129735    0.049985     1478493  
1  1.469649e+09  25.516472  38284.467761    0.050093       38854  
3  1.103670e+11  25.502227  38249.117989    0.049997     2920374  
0  5.588962e+10  25.505794  38250.854626    0.050009     1478870  


For best performance the number of row groups should be greater than the number of workers (144). For more details, refer to
https://docs.bodo.ai/latest/file_io/#parquet-section.



%px:   0%|          | 0/144 [00:00<?, ?tasks/s]