In [None]:
import os
import sys
sys.path.append("../")

import pandas as pd
import numpy as np
import datetime as dt
import re

from src.utils.parser import parse_config
from src.utils.vault import get_secrets
from src.utils.processing import downcast
import snowflake.connector

# Resolve the project settings file relative to the notebook's working
# directory and parse it into the `config` mapping used by the cells below.
settings_path = os.path.abspath(os.path.join(os.getcwd(), "../src/settings.yml"))
config = parse_config(settings_path)

In [None]:
# Merge the secrets fetched for "snowflake" into the Snowflake section of the
# settings. The merge is done in place, so config["snowflake"] also carries the
# secret values from here on.
config["snowflake"].update(get_secrets("snowflake"))
snowflake_config = config["snowflake"]

# Open the Snowflake connection using the merged settings as keyword arguments.
snowflake_ctx = snowflake.connector.connect(**snowflake_config)

In [None]:
# How many of the highest and of the lowest readings to keep per group in the
# query below.
n_extremes = 5

# Timestamp of the previous processing run; the hard-coded value can be swapped
# for the commented-out lookup in the already written parquet data.
last_processed = "2000-1-1 15:35:04.518 UTC"
# last_processed = pd.read_parquet("../data/raw", columns=["processed_on"]).max()[0]
timestamp_format = "%Y-%m-%d %H:%M:%S.%f %Z"
last_processed = dt.datetime.strptime(last_processed, timestamp_format).date()

# Step back to the Monday of that week (date.weekday() is 0 on Mondays) so the
# query starts on a week boundary.
days_since_monday = dt.timedelta(days=last_processed.weekday())
from_date = last_processed - days_since_monday

# SQL LIKE pattern selecting the channels to query ("%" is the LIKE wildcard).
channel_like = "register://electricity/0/activepower/%?avg=15"
# channel_like = "register://electricity/0/activepower/l_?avg=15"



# Build the Snowflake query that extracts the weekly extremes.
#
# Inner query (t1): readings of the channels matching `channel_like`, tagged
#   with their ISO year and ISO week, restricted to
#   from_date <= DATUMTIJD < Monday of the current week.
#   DAYOFWEEKISO is 1 on Monday and 7 on Sunday, so subtracting
#   (DAYOFWEEKISO - 1) days from CURRENT_DATE yields this week's Monday. The
#   previous bound subtracted DAYOFWEEKISO days, i.e. ended on Sunday and
#   silently dropped the last day of the most recent complete week.
# Middle query (t2): ranks every reading within its
#   (boxid, channelid, year, week) group; `top` counts 1, 2, ... from the
#   highest value, `bottom` counts -1, -2, ... from the lowest value.
# Outer query: keeps the `n_extremes` highest and lowest readings per group.
#
# NOTE: the statement is assembled with string formatting; all interpolated
# values come from the settings file and the constants defined in this
# notebook. Do not feed untrusted input into it.
query = f"""
SELECT *
FROM
(
    SELECT
        t1.boxid,
        t1.channelid,
        t1.value,
        t1.year,
        t1.week,
        ROW_NUMBER() OVER (PARTITION BY (t1.boxid, t1.channelid, t1.year, t1.week) ORDER BY (t1.value) DESC) AS top,
        -ROW_NUMBER() OVER (PARTITION BY (t1.boxid, t1.channelid, t1.year, t1.week) ORDER BY (t1.value) ASC) AS bottom,
        CURRENT_TIMESTAMP AS processed_on
     FROM (
        SELECT
            t0.BOXID AS boxid,
            t0.CHANNELID AS channelid,
            t0.WAARDE AS value,
            YEAROFWEEKISO(t0.DATUMTIJD) AS year,
            WEEKISO(t0.DATUMTIJD) AS week
        FROM {config["snowflake"]["database"]}.{config["snowflake"]["schema"]}.{config["snowflake"]["table"]} t0
        WHERE t0.CHANNELID LIKE '{channel_like}'
         AND t0.DATUMTIJD >= DATE('{from_date}')
         AND t0.DATUMTIJD < DATEADD('DAY', 1 - DAYOFWEEKISO(CURRENT_DATE), CURRENT_DATE)
--         AND t0.BOXID IN ('075.547-1', '069.509-1')
--         LIMIT 100
         ) t1
) t2
WHERE t2.top <= {n_extremes} OR t2.bottom >=-{n_extremes}
"""

In [None]:
%%time
df_query = pd.read_sql(sql=query, con=snowflake_ctx)

In [None]:
# Turn the SQL LIKE pattern into a regular expression: every literal piece is
# escaped with re.escape (the hand-written "\?" and "\/" were invalid string
# escapes and covered only two characters), and each "%" wildcard becomes a
# capture group for the channel variants of interest. The character class is
# "[123]"; the earlier "[1,2,3]" also matched a literal comma.
wildcard_group = "(sumli|l[123])"
pattern = wildcard_group.join(re.escape(part) for part in channel_like.split("%"))

# Store the captured variant ("sumli", "l1", "l2" or "l3") in column "L".
# expand=False returns a Series for the single capture group, which is the
# natural right-hand side for a single-column assignment.
df_query["L"] = df_query["CHANNELID"].str.extract(pattern, expand=False)

In [None]:
# Run the project's `downcast` helper on every column of the query result,
# passing try_numeric=True and category=True for each column.
df_query = df_query.apply(
    lambda column: downcast(column, try_numeric=True, category=True)
)

In [None]:
# Print a summary of df_query: columns, dtypes, non-null counts and memory usage.
df_query.info()

In [None]:
# Persist the query result as a parquet dataset under ../data/raw/, split into
# one directory level per BOXID and, below that, per L value.
partition_columns = ["BOXID", "L"]
df_query.to_parquet("../data/raw/", partition_cols=partition_columns)

In [None]:
# Read back only the columns needed from the stored dataset.
cols = ["BOXID", "VALUE", "YEAR", "WEEK", "TOP", "BOTTOM", "L"]
df_read = pd.read_parquet("../data/raw", columns=cols)

# Keep the rows ranked first from the top (TOP == 1) on the "sumli" channel,
# then drop the helper columns that were only needed for this selection.
is_top_sumli = (df_read["TOP"] == 1) & (df_read["L"] == "sumli")
df_read = df_read[is_top_sumli].drop(columns=["TOP", "BOTTOM", "L"])
    

In [None]:
# Print a summary of df_read: columns, dtypes, non-null counts and memory usage.
df_read.info()