In [1]:
import pyarrow.dataset as ds
import polars as pl
from datetime import datetime
%load_ext memory_profiler
import time



In [2]:
# Open the trades Arrow file as a pyarrow dataset and wrap it in a lazy polars
# scan, allowing filters to be handed to pyarrow.
data = pl.scan_ds(
    ds.dataset("~/internal/data/trade_taq2019.arrow", format="arrow"),
    allow_pyarrow_filter=True,
)

In [3]:
def window(data):
    """Collect the AAPL rows of one trading session whose sale condition contains "O".

    Parameters
    ----------
    data
        Frame exposing ``Symbol``, ``Sale_Condition`` and ``Time`` columns.
        It must support ``.filter(...).collect()`` (presumably the lazy trades
        scan built earlier in the notebook -- confirm against the caller).

    Returns
    -------
    The collected rows for which all three predicates hold: ``Symbol`` equals
    ``"AAPL"``, ``Sale_Condition`` matches the pattern ``"O"``, and ``Time``
    lies between 2019-10-07 09:30 and 2019-10-07 16:00. Whether the end points
    are included follows ``is_between``'s default in the installed polars.
    """
    # NOTE: an unused ``start = time.process_time()`` was dropped here; nothing
    # ever read it, and the cell is profiled externally via ``%memit``.
    print("running window query...")
    data = data.filter(
        (pl.col("Symbol") == "AAPL")
        & (pl.col("Sale_Condition").str.contains("O"))
        & (pl.col("Time").is_between(datetime(2019, 10, 7, 9, 30), datetime(2019, 10, 7, 16, 0)))
    ).collect()

    return data

# Profile the window query with memory_profiler's %memit (prints peak memory and
# increment); the frame returned by window() is not assigned to anything.
%memit window(data)

running window query...
peak memory: 9814.97 MiB, increment: 9666.90 MiB


In [4]:
def agg(data):
    """Sum ``Trade_Volume`` per ``Symbol`` over one-minute windows on ``Time``.

    Parameters
    ----------
    data
        Frame exposing ``Time``, ``Symbol`` and ``Trade_Volume`` columns; it is
        collected at the end, so presumably a lazy frame -- confirm against
        the caller.

    Returns
    -------
    The collected aggregation, ordered by ``Time``, with the summed volume in
    a column named ``Total_vol``.
    """
    # Order by the index column before building the dynamic windows.
    print("Sorting on time first...")
    by_time = data.sort("Time")

    print("running query now")
    total_volume = pl.col("Trade_Volume").sum().alias("Total_vol")
    minute_windows = by_time.groupby_dynamic(
        index_column="Time",
        every="1m",
        period="1m",
        by="Symbol",
    )
    return minute_windows.agg([total_volume]).sort("Time").collect()

# Profile the one-minute volume aggregation with %memit; the result of agg()
# is not assigned to anything.
%memit agg(data)


Sorting on time first...
running query now
peak memory: 4686.54 MiB, increment: 2.28 MiB


In [5]:
def other(data):
    """Average ``Trade_Price`` and ``Trade_Volume`` per (``Symbol``, ``Exchange``) pair.

    Parameters
    ----------
    data
        Frame exposing ``Symbol``, ``Exchange``, ``Trade_Price`` and
        ``Trade_Volume`` columns; it is collected at the end, so presumably a
        lazy frame -- confirm against the caller.

    Returns
    -------
    The collected result with columns ``avg price`` and ``avg volume``,
    sorted by ``avg price`` with ``reverse=True``.
    """
    print("running query...")

    averages = [
        pl.col("Trade_Price").mean().alias("avg price"),
        pl.col("Trade_Volume").mean().alias("avg volume"),
    ]
    per_pair = data.groupby(["Symbol", "Exchange"]).agg(averages)
    ranked = per_pair.sort("avg price", reverse=True)

    return ranked.collect()
    
# Profile the per-symbol/exchange averages with %memit; the result of other()
# is not assigned to anything.
%memit other(data)


running query...
peak memory: 4446.96 MiB, increment: 3123.02 MiB


In [6]:
# Ingest the bigger dataset (quotes); from here on `data` refers to this scan.
data = pl.scan_ds(
    ds.dataset("~/internal/data/quote_taq2019.arrow", format="arrow")
)


In [7]:
def bagg(data):
    """Aggregate quotes per ``Symbol`` over one-hour windows on ``Time``.

    For every window this computes ``mid`` as half the mean of
    ``Bid_Price + Offer_Price``, plus the smallest and largest ``Time`` seen
    (``min_time`` / ``max_time``).

    Parameters
    ----------
    data
        Frame exposing ``Time``, ``Symbol``, ``Bid_Price`` and ``Offer_Price``
        columns; it is collected at the end, so presumably a lazy frame --
        confirm against the caller.

    Returns
    -------
    The collected aggregation, ordered by ``Time``.
    """
    print("sorting on time")
    by_time = data.sort("Time")

    print("running big query")
    bid_plus_offer = pl.col("Bid_Price") + pl.col("Offer_Price")
    aggregations = [
        (0.5 * bid_plus_offer.mean()).alias("mid"),
        pl.col("Time").min().alias("min_time"),
        pl.col("Time").max().alias("max_time"),
    ]
    hourly_windows = by_time.groupby_dynamic(
        index_column="Time",
        every="1h",
        period="1h",
        by="Symbol",
    )

    return hourly_windows.agg(aggregations).sort("Time").collect()

# Profile the hourly quote aggregation with %memit; the result of bagg() is
# not assigned to anything.
%memit bagg(data)

sorting on time
running big query
peak memory: 8290.13 MiB, increment: 5844.37 MiB
