In [1]:
import pyarrow.dataset as ds
import polars as pl
from datetime import datetime
%load_ext memory_profiler
import time

In [2]:
# Open the 2019 trade file as a pyarrow dataset (Arrow format). The query cells
# below pass this global `data` to their functions.
# NOTE: a later cell rebinds `data` to the quote file, so the trade queries only
# see this dataset when the notebook is run top to bottom.
data = ds.dataset("~/data/trade_taq2019.arrow", format = "arrow")

In [3]:
def window(
    data,
    symbol="AAPL",
    sale_condition="O",
    start=datetime(2019, 10, 7, 9, 30),
    end=datetime(2019, 10, 7, 16, 0),
):
    """Build a query selecting one symbol's trades inside a time window.

    The dataset is wrapped with ``pl.scan_ds`` and a single filter is attached.
    Nothing is collected here; the caller is expected to call ``.collect()`` on
    the returned query.

    Args:
        data: Dataset accepted by ``pl.scan_ds`` (this notebook passes a
            ``pyarrow.dataset`` object). It must provide the ``Symbol``,
            ``Sale_Condition`` and ``Time`` columns.
        symbol: Value the ``Symbol`` column must equal.
        sale_condition: Pattern handed to ``str.contains`` on
            ``Sale_Condition``.
            NOTE(review): polars treats this as a regex by default -- escape
            special characters if a literal match is wanted; confirm against
            the installed version.
        start: Lower bound passed to ``is_between`` on ``Time``.
        end: Upper bound passed to ``is_between`` on ``Time``.
            NOTE(review): whether the bounds themselves are included follows
            the installed polars default -- confirm.

    Returns:
        The filtered query object produced by ``.filter``.
    """
    # make the scanner
    print("data into scan...")
    # allow_pyarrow_filter=True presumably lets polars hand the predicate to
    # pyarrow instead of reading everything first -- see the polars docs.
    trades = pl.scan_ds(data, allow_pyarrow_filter=True)

    # QUERY (only built here; it runs when the caller collects)
    print("running window query...")
    results = trades.filter(
        (pl.col("Symbol") == symbol)
        & (pl.col("Sale_Condition").str.contains(sale_condition))
        & (pl.col("Time").is_between(start, end))
    )

    return results

# Build the filtered query from the global dataset; nothing is read until collect().
query = window(data)

# Execute the query; the bare `result` on the last line displays the frame.
result=query.collect()
result

data into scan...
running window query...


Time,Exchange,Symbol,Sale_Condition,Trade_Volume,Trade_Price,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Source_of_Trade,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator
datetime[ns],str,str,str,i64,f64,str,i64,i64,i64,str,str,datetime[ns],str,i64
2019-10-07 09:30:00.562307,"""Q""","""AAPL""","""@O X""",223839,226.26,,0,10250,356,"""N""",""" """,2019-10-07 09:30:00.562284,,1


In [4]:
def agg(data):
    """Build a query summing ``Trade_Volume`` per ``Symbol`` in 1h ``Time`` windows.

    The dataset is scanned with ``pl.scan_ds``, ordered by ``Time``, then
    grouped with ``groupby_dynamic`` (``every="1h"``, ``period="1h"``,
    ``by="Symbol"``). The summed volume is exposed as ``Total_vol``.

    Args:
        data: Dataset accepted by ``pl.scan_ds``; needs ``Time``, ``Symbol``
            and ``Trade_Volume`` columns.

    Returns:
        The aggregation query; the caller is expected to ``.collect()`` it.
    """
    lazy_trades = pl.scan_ds(data, allow_pyarrow_filter=True)

    # the dynamic grouping below works on a Time-ordered index column
    print("Sorting on time first...")
    trades_by_time = lazy_trades.sort("Time")

    print("running query now")
    total_volume = pl.col("Trade_Volume").sum().alias("Total_vol")
    hourly_windows = trades_by_time.groupby_dynamic(
        index_column="Time",
        every="1h",
        period="1h",
        by="Symbol",
    )
    return hourly_windows.agg([total_volume])

# Build the hourly volume query on the global dataset, then execute it.
query = agg(data)
result = query.collect()
# Bare expression so the notebook renders the collected frame.
result

Sorting on time first...
running query now


Symbol,Time,Total_vol
str,datetime[ns],i64
"""IWM""",2019-10-07 04:00:00,5925
"""IWM""",2019-10-07 05:00:00,1500
"""IWM""",2019-10-07 06:00:00,13780
"""IWM""",2019-10-07 07:00:00,5736
"""IWM""",2019-10-07 08:00:00,28465
"""IWM""",2019-10-07 09:00:00,2280623
"""IWM""",2019-10-07 10:00:00,2516881
"""IWM""",2019-10-07 11:00:00,2057932
"""IWM""",2019-10-07 12:00:00,618580
"""IWM""",2019-10-07 13:00:00,2507266


In [5]:
def newagg(data):
    """Build a query of mean price and mean volume per (``Symbol``, ``Exchange``).

    The output columns are named ``avg price`` and ``avg volume``, and the
    result is sorted on ``avg price`` with ``reverse=True``.

    Args:
        data: Dataset accepted by ``pl.scan_ds``; needs ``Symbol``,
            ``Exchange``, ``Trade_Price`` and ``Trade_Volume`` columns.

    Returns:
        The aggregation query; the caller is expected to ``.collect()`` it.
    """
    lazy_trades = pl.scan_ds(data, allow_pyarrow_filter=True)
    print("running query...")

    averages = [
        pl.col("Trade_Price").mean().alias("avg price"),
        pl.col("Trade_Volume").mean().alias("avg volume"),
    ]
    per_symbol_exchange = lazy_trades.groupby(["Symbol", "Exchange"])

    # highest average price first
    return per_symbol_exchange.agg(averages).sort("avg price", reverse=True)
    
# Build the per-(Symbol, Exchange) averages query on the global dataset, then execute it.
query = newagg(data)
result = query.collect()
# Bare expression so the notebook renders the collected frame.
result

running query...


Symbol,Exchange,avg price,avg volume
str,str,f64,f64
"""BRK A""","""K""",310979.198571,1.0
"""BRK A""","""N""",310927.54,1.114754
"""BRK A""","""P""",310875.0,1.0
"""BRK A""","""D""",310857.104533,1.444444
"""BRK A""","""Z""",310752.624,1.0
"""BRK A""","""V""",310735.25,1.0
"""SEB""","""Z""",4201.5,1.0
"""SEB""","""K""",4168.735455,1.090909
"""SEB""","""V""",4166.590833,1.5
"""SEB""","""T""",4165.869697,1.030303


In [6]:
# Data ingest of the bigger dataset (the quote file).
# NOTE: this rebinds the global `data`; re-running the earlier trade-query cells
# after this one would point them at the quote file instead of the trade file.
data = ds.dataset("~/data/quote_taq2019.arrow", format = "arrow")


In [7]:
def bagg(data):
    """Build a per-``Symbol`` query over 1h ``Time`` windows of the quote data.

    For every window the query produces:
      * ``mid``      -- 0.5 times the mean of ``Bid_Price + Offer_Price``
      * ``min_time`` -- smallest ``Time`` in the window
      * ``max_time`` -- largest ``Time`` in the window

    The input is ordered by ``Time`` before grouping, and the aggregated
    output is ordered by ``Time`` again.

    Args:
        data: Dataset accepted by ``pl.scan_ds``; needs ``Time``, ``Symbol``,
            ``Bid_Price`` and ``Offer_Price`` columns.

    Returns:
        The aggregation query; the caller is expected to ``.collect()`` it.
    """
    quotes = pl.scan_ds(data)
    print("sorting on time")
    quotes_by_time = quotes.sort("Time")

    # QUERY
    print("running big query")
    bid_plus_offer = pl.col("Bid_Price") + pl.col("Offer_Price")
    window_stats = [
        (0.5 * bid_plus_offer.mean()).alias("mid"),
        pl.col("Time").min().alias("min_time"),
        pl.col("Time").max().alias("max_time"),
    ]
    hourly_windows = quotes_by_time.groupby_dynamic(
        index_column="Time",
        every="1h",
        period="1h",
        by="Symbol",
    )
    return hourly_windows.agg(window_stats).sort("Time")

# Build the hourly quote query on the global dataset (the quote file at this point), then execute it.
query = bagg(data)
result = query.collect()
# Bare expression so the notebook renders the collected frame.
result

sorting on time
running big query


Symbol,Time,mid,min_time,max_time
str,datetime[ns],f64,datetime[ns],datetime[ns]
"""SOL""",2019-10-07 03:00:00,0.0,2019-10-07 03:51:01.777668,2019-10-07 03:51:01.777668
"""SOL""",2019-10-07 04:00:00,1.201667,2019-10-07 04:00:00.020381,2019-10-07 04:00:00.020381
"""SDY""",2019-10-07 04:00:00,98.29433,2019-10-07 04:00:00.004187,2019-10-07 04:59:55.547784
"""SPHQ""",2019-10-07 04:00:00,13.665,2019-10-07 04:00:00.004273,2019-10-07 04:00:00.004273
"""SPLG""",2019-10-07 04:00:00,14.2725,2019-10-07 04:00:00.004305,2019-10-07 04:53:23.293591
"""SLY""",2019-10-07 04:00:00,24.68125,2019-10-07 04:00:00.004333,2019-10-07 04:53:23.259196
"""SPMD""",2019-10-07 04:00:00,13.7075,2019-10-07 04:00:00.004339,2019-10-07 04:53:23.276241
"""SLX""",2019-10-07 04:00:00,11.94,2019-10-07 04:00:00.004368,2019-10-07 04:00:00.004368
"""SSO""",2019-10-07 04:00:00,124.136168,2019-10-07 04:00:00.004421,2019-10-07 04:59:58.212920
"""SPDW""",2019-10-07 04:00:00,13.4575,2019-10-07 04:00:00.004431,2019-10-07 04:00:00.004445
