In [None]:
import os
import sys
sys.path.append("../")

import pandas as pd
import numpy as np
import datetime as dt
import re

from src.utils.parser import parse_config
from src.utils.vault import get_secrets
from src.utils.processing import downcast
import snowflake.connector

# Resolve the settings file relative to the current working directory and parse it.
settings_path = os.path.abspath(os.path.join(os.getcwd(), "../src/settings.yml"))
config = parse_config(settings_path)

# Location of the parquet dataset read further down in the notebook.
parquet_file = "../data/raw"

In [None]:
# Start from the secrets stored under "snowflake" and overlay the settings
# from the parsed config (the config values win on key clashes).
data_config = get_secrets("snowflake")
snowflake_settings = config["snowflake"]["data"]
data_config.update(snowflake_settings)

# The merged mapping is passed as keyword arguments to the connector; its
# "database", "schema" and "table" entries are reused when building the query.
snowflake_ctx = snowflake.connector.connect(**data_config)

In [None]:
# Timestamp of the previous processing run. The hard-coded value lies far in the
# past, so the lower bound below is effectively open (assuming no data predates
# 2000 - TODO confirm); the commented line reads it from stored data instead.
last_processed = "2000-1-1 15:35:04.518 UTC"
# last_processed = pd.read_parquet("../data/raw", columns=["processed_on"]).max()[0]
last_processed = dt.datetime.strptime(last_processed, "%Y-%m-%d %H:%M:%S.%f %Z").date()
# Rewind to the Monday of that week (date.weekday() is 0 on Mondays) so the
# aggregation always starts on an ISO-week boundary.
from_date = last_processed - dt.timedelta(days=last_processed.weekday())

# SQL LIKE pattern selecting the channels to aggregate ("%" is the wildcard).
channel_like = "register://electricity/0/activepower/%?avg=15"
# channel_like = "register://electricity/0/activepower/l_?avg=15"


# Weekly MIN/MAX per box and channel, restricted to complete ISO weeks.
# DAYOFWEEKISO is 1 (Monday) .. 7 (Sunday), so the exclusive upper bound must be
# CURRENT_DATE + (1 - DAYOFWEEKISO) = Monday of the current week. Subtracting
# the full DAYOFWEEKISO value lands on the previous Sunday and silently drops
# that Sunday from the most recent week.
query = f"""
SELECT
    t0.BOXID AS BOXID,
    t0.CHANNELID AS CHANNELID,
    YEAROFWEEKISO(t0.DATUMTIJD) AS YEAR,
    WEEKISO(t0.DATUMTIJD) AS WEEK,
    CURRENT_TIMESTAMP AS PROCESSED_ON,
    MAX(t0.WAARDE) AS MAX_VALUE,
    MIN(t0.WAARDE) AS MIN_VALUE
FROM {data_config["database"]}.{data_config["schema"]}.{data_config["table"]} t0
    WHERE t0.CHANNELID LIKE '{channel_like}'
     AND t0.DATUMTIJD >= DATE('{from_date}')
     AND t0.DATUMTIJD < DATEADD(DAY, 1 - DAYOFWEEKISO(CURRENT_DATE), CURRENT_DATE)
--     AND t0.BOXID IN ('075.547-1', '069.509-1')
GROUP BY t0.BOXID, t0.CHANNELID, YEAROFWEEKISO(t0.DATUMTIJD), WEEKISO(t0.DATUMTIJD)
"""

In [None]:
%%time
df_query = pd.read_sql(sql=query, con=snowflake_ctx)

In [None]:
# Turn the SQL LIKE pattern into a regex that captures the channel label
# ("sumli", "l1", "l2" or "l3") sitting where the "%" wildcard is.
# re.escape quotes the literal parts; the previous "\?" and "\/" literals were
# invalid string escapes, and the class "[1,2,3]" also matched a literal comma.
label_group = "(sumli|l[123])"
pattern = label_group.join(re.escape(part) for part in channel_like.split("%"))

# expand=False returns a Series for the single capture group, which is what a
# single-column assignment expects; non-matching channels become NaN.
df_query["L"] = df_query["CHANNELID"].str.extract(pattern, expand=False)

In [None]:
# Run the project's `downcast` helper over every column (DataFrame.apply works
# column-wise by default). Presumably it shrinks numeric dtypes and converts
# repetitive values to categoricals - see src.utils.processing to confirm.
df_query = df_query.apply(
    downcast,
    try_numeric=True,
    category=True,
)

In [None]:
# Print column dtypes, non-null counts and memory usage of the query result.
df_query.info()

In [None]:
# df_query.to_parquet(parquet_file, partition_cols=["BOXID", "L"])

In [None]:
# Columns that are only needed to filter the rows and are dropped afterwards.
helper_cols = ["TOP", "BOTTOM", "L"]
cols = ["BOXID", "VALUE", "YEAR", "WEEK", *helper_cols]

# NOTE(review): VALUE / TOP / BOTTOM are not produced by the query in this
# notebook - presumably the stored dataset has a different schema; confirm.
df_read = (
    pd.read_parquet(parquet_file, columns=cols)
    # keep only rows flagged TOP == 1 for the "sumli" channel label
    .query("TOP == 1 & L == 'sumli'")
    .drop(columns=helper_cols)
)
    

In [None]:
# Print column dtypes, non-null counts and memory usage of the filtered parquet read.
df_read.info()

## Get parquet metadata (for the sake of speed)

In [None]:
import pyarrow.parquet as pq

# Inspect the metadata of a single partition file of the dataset.
sample_partition_path = "../data/raw/BOXID=001.622-1/L=l1/7c3663dfe57547cb8fc54c33857b6caf.parquet"
metadata = pq.read_metadata(sample_partition_path)

# Alternative: open the file as a ParquetFile and drill into a column chunk.
# parquet_file = pq.ParquetFile("../data/raw/BOXID=001.622-1/L=l1/7c3663dfe57547cb8fc54c33857b6caf.parquet")
# metadata = parquet_file.metadata
# metadata.row_group(0).column(6)

## See the parquet engine documentation for ways to read `_common_metadata`
https://arrow.apache.org/docs/python/parquet.html?highlight=pyarrow%20parquet%20partition