In [1]:
import os
import io
import numpy as np
import matplotlib.pyplot as plt
import duckdb
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrowfs_adlgen2 as pa_adl

from dotenv import load_dotenv
from pathlib import Path

from azure.identity import DefaultAzureCredential, ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient
from azure.keyvault.secrets import SecretClient
from azure.core.exceptions import ResourceNotFoundError

from utilities.visualizers import view_signal_feature
from utilities.feature_extractors import extract_spectogam_stats

%load_ext autoreload
%autoreload 2

# local

In [2]:
# DATA_DIR = "../include/data"

In [3]:
# # local
# SILVER_FOLDER_NAME = "silver"
# SUB_FOLDER_NAME = "stage-02"
# SILVER_DATA_DIR = os.path.join("{DATA_DIR}", "{FOLDER_NAME}", "{SUB_FOLDER_NAME}").replace("\\", "/")
# SILVER_DATA_DIR

In [4]:
# subjects_features_paths = [
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR, 
#             FOLDER_NAME=SILVER_FOLDER_NAME, 
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ),
#         file
#     ).replace("\\", "/")
#     for file in
#     os.listdir(SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR, 
#         FOLDER_NAME=SILVER_FOLDER_NAME, 
#         SUB_FOLDER_NAME=SUB_FOLDER_NAME
#     ))
# ]

In [5]:
# subjects_features_paths

In [6]:
# subjects_features_paths[0].replace("_signals.parquet", "", 1)

In [7]:
# for file in subjects_features_paths:
#     os.rename(file, file.replace("_signals.parquet", "", 1))

# load credentials for cloud

In [8]:
# Development-only: read credentials from a local .env file.
# The file is looked up in the parent of the current working directory.
env_dir = Path('../').resolve()
load_dotenv(env_dir / '.env')

True

In [9]:
# Storage account settings come from the environment; each is None when unset.
storage_account_name = os.getenv("STORAGE_ACCOUNT_NAME")
credential = os.getenv("STORAGE_ACCOUNT_KEY")  # the account key doubles as the credential
conn_str = os.getenv("STORAGE_ACCOUNT_CONN_STR")

In [10]:
# cloud
# URL templates: {FOLDER_NAME} is the container and {SUB_FOLDER_NAME} the
# stage folder inside it; both placeholders are filled in later via .format().
URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net"
SILVER_FOLDER_NAME = "sgppipelinesa-silver"
SUB_FOLDER_NAME = "stage-02"
SILVER_DATA_DIR = "/".join([URL, "{SUB_FOLDER_NAME}"])
SILVER_DATA_DIR

'abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net/{SUB_FOLDER_NAME}'

In [16]:
# cloud
# Service client for the storage account. `credential` holds the value read
# from the STORAGE_ACCOUNT_KEY environment variable.
account_url = f"https://{storage_account_name}.dfs.core.windows.net"
datalake_service_client = DataLakeServiceClient(account_url=account_url, credential=credential)

# File-system (container) client for the "<account>-silver" container.
silver_container_client = datalake_service_client.get_file_system_client(f"{storage_account_name}-silver")

# Prefix every path listed under SUB_FOLDER_NAME with the abfss:// URL of the
# silver container so the files can be addressed directly.
# NOTE(review): get_paths is called without a `recursive` argument - confirm
# whether nested entries can show up in this listing.
silver_base_url = URL.format(FOLDER_NAME=SILVER_FOLDER_NAME)
subjects_features_paths = [
    os.path.join(silver_base_url, entry.name).replace("\\", "/")
    for entry in silver_container_client.get_paths(path=SUB_FOLDER_NAME)
]
subjects_features_paths

['abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/1028-20100710-hne_signals.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/1337ad-20170321-ajg_signals.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/1337ad-20170321-tkg_signals.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/1snoke-20120412-hge_signals.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/23yipikaye-20100807-ujm_signals.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/Aaron-20080318-kdl_signals.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/Anniepoo-20140308-bft_signals.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/Anniepoo-20140308-cqj_signals.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/Anniepoo-2014030

# Load all the single parquet files into one giant table (we won't need pyarrow for this step anymore, as we no longer need to convert between tables, numpy arrays, and pyarrow dataframes). This is now easier to load because each file only contains the relevant rows of features from each subject's signals after windowing. We will also load the labels of the different data splits so we can use them as the basis for deciding which subjects in this giant table belong to the train, val, and test sets.

In [17]:
# In-memory DuckDB connection used by every query below.
conn = duckdb.connect(database=":memory:")

In [18]:
# for cloud only
# Install and load DuckDB's azure extension, then register the storage
# account connection string as the secret `az_sgp`.
if not conn_str:
    # Without this guard a missing variable would be interpolated as the
    # literal text 'None' and only fail later, when the first file is read.
    raise ValueError("STORAGE_ACCOUNT_CONN_STR is not set; cannot create the DuckDB azure secret")

conn.sql("INSTALL azure")
conn.sql("LOAD azure")

# Double any single quote so the value stays a well-formed SQL string literal.
escaped_conn_str = conn_str.replace("'", "''")
conn.sql(f"""
    CREATE OR REPLACE SECRET az_sgp (
        TYPE azure,
        CONNECTION_STRING '{escaped_conn_str}'
    );
""")

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

In [19]:
# Load every per-subject parquet file into the temporary table `features`.
# The file list is rendered explicitly as a SQL list of quoted string literals
# rather than relying on Python's list repr happening to be valid SQL.
if not subjects_features_paths:
    raise ValueError("subjects_features_paths is empty; there is nothing to load into `features`")

paths_sql_list = ", ".join(
    "'" + parquet_path.replace("'", "''") + "'"
    for parquet_path in subjects_features_paths
)
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE features AS (
        SELECT * FROM read_parquet([{paths_sql_list}], union_by_name=True, filename=False)
    )
""")

# Originally we had 6318 subjects, but on closer inspection we now only have 4350 + 967 + 946 = 6263 subjects. The missing 55 were discarded in ingest_labels because they did not have labels attached to them.

In [21]:
SILVER_DATA_DIR

'abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net/{SUB_FOLDER_NAME}'

In [27]:
# cloud
# The label files sit in the "stage-01" sub folder of the silver container.
# (The original call also carried two commented-out extra segments: "**" and "*.parquet".)
stage_01_dir = SILVER_DATA_DIR.format(FOLDER_NAME=SILVER_FOLDER_NAME, SUB_FOLDER_NAME="stage-01")
train_labels_path = "/".join([stage_01_dir, "train_labels.parquet"])

# # local
# train_labels_path = os.path.join(
#     SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME="stage-01"
#     ),
#     "train",
#     "labels.parquet",
#     "**",
#     "*.parquet"
# ).replace("\\", "/")

train_labels_path

'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-01/train_labels.parquet'

In [28]:
# Read the parquet file at `train_labels_path` into the temporary DuckDB
# table `train_labels` (replacing it if it already exists).
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE train_labels AS (
        SELECT * FROM read_parquet('{train_labels_path}', union_by_name=True, filename=False)
    )
""")

In [29]:
# Number of rows in train_labels.
conn.sql("""
    SELECT COUNT(*) FROM train_labels
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│            7 │
└──────────────┘

In [30]:
# cloud
# The label files sit in the "stage-01" sub folder of the silver container.
# (The original call also carried two commented-out extra segments: "**" and "*.parquet".)
stage_01_dir = SILVER_DATA_DIR.format(FOLDER_NAME=SILVER_FOLDER_NAME, SUB_FOLDER_NAME="stage-01")
val_labels_path = "/".join([stage_01_dir, "val_labels.parquet"])

# # local
# val_labels_path = os.path.join(
#     SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME="stage-01"
#     ),
#     "val",
#     "labels.parquet",
#     "**",
#     "*.parquet"
# ).replace("\\", "/")

val_labels_path

'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-01/val_labels.parquet'

In [31]:
# Read the parquet file at `val_labels_path` into the temporary DuckDB
# table `val_labels` (replacing it if it already exists).
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE val_labels AS (
        SELECT * FROM read_parquet('{val_labels_path}', union_by_name=True, filename=False)
    )
""")

In [32]:
# cloud
# The label files sit in the "stage-01" sub folder of the silver container.
# (The original call also carried two commented-out extra segments: "**" and "*.parquet".)
stage_01_dir = SILVER_DATA_DIR.format(FOLDER_NAME=SILVER_FOLDER_NAME, SUB_FOLDER_NAME="stage-01")
test_labels_path = "/".join([stage_01_dir, "test_labels.parquet"])

# # local
# test_labels_path = os.path.join(
#     SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME="stage-01"
#     ),
#     "test",
#     "labels.parquet",
#     "**",
#     "*.parquet"
# ).replace("\\", "/")

test_labels_path

'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-01/test_labels.parquet'

In [33]:
# Read the parquet file at `test_labels_path` into the temporary DuckDB
# table `test_labels` (replacing it if it already exists).
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE test_labels AS (
        SELECT * FROM read_parquet('{test_labels_path}', union_by_name=True, filename=False)
    )
""")

In [34]:
# Preview the train_labels table.
conn.sql("""
    SELECT * FROM train_labels
""")

┌─────────────────────────┬─────────┬─────────┬───────┬───────────┐
│        subjectId        │  value  │  split  │ rowId │ partition │
│         varchar         │ varchar │ varchar │ int64 │   int64   │
├─────────────────────────┼─────────┼─────────┼───────┼───────────┤
│ 1028-20100710-hne       │ male    │ train   │     0 │         0 │
│ 1337ad-20170321-ajg     │ female  │ train   │     1 │         1 │
│ 1337ad-20170321-tkg     │ female  │ train   │     2 │         2 │
│ 1snoke-20120412-hge     │ male    │ train   │     3 │         3 │
│ 23yipikaye-20100807-ujm │ male    │ train   │     4 │         4 │
│ Anniepoo-20140308-bft   │ female  │ train   │     5 │         5 │
│ Anniepoo-20140308-cqj   │ female  │ train   │     6 │         6 │
└─────────────────────────┴─────────┴─────────┴───────┴───────────┘

In [35]:
# Preview the features table.
conn.sql("""
    SELECT * FROM features
""")

┌────────────────────┬────────────────────┬─────────────────────┬────────────────────┬────────────────────────┬───────────────┬───────────────┬──────────────┬──────────────┬───────────────────────┬─────────────────────┬──────────────────┬──────────────────┬─────────────┬────────────────────────┬──────────────────────┬─────────────────────────┬────────────────────┬────────────────────┬────────────────────┬──────────────┬────────────────────┬───────────────┬─────────────────┬───────────────────────┬───────────────────┬──────────────┬──────────────┬────────────────┬──────────────┬──────────────┬──────────────────────┬──────────────────────┬────────────────────────────┬──────────────────┬───────────────┬────────────────────┬──────────────────┬────────────────────┬─────────────────────┬──────────────────────┬─────────────────┬─────────────────┬───────────────────┬─────────────────┬─────────────────┬─────────────────────────┬─────────────────────────┬───────────────────────────────┬─────────

# Since the prior feature engineering step produced null values for each subject, we need to impute these values in our features table, partitioned by subject.

In [36]:
# DESCRIBE the features table and keep only the column names; fetchall()
# returns them as a list of 1-tuples.
rows = conn.sql("""
    SELECT column_name
    FROM (DESCRIBE features)
""").fetchall()
# Show the first five entries.
rows[:5]

[('subjectId',),
 ('freq_kurt',),
 ('freq_skew',),
 ('freq_entropy',),
 ('freq_mean',)]

In [37]:
# Feature columns are all described columns whose name does not contain "subjectId".
feat_cols = [row[-1] for row in rows if "subjectId" not in row[-1]]
len(feat_cols)

81

In [38]:
# Opening of the imputation statement: creates (or replaces) the temporary
# table features_imp and selects subjectId first. The per-feature expressions
# and the FROM clause are appended to `query` by the following cells.
query = """
    CREATE OR REPLACE TEMPORARY TABLE features_imp AS (SELECT
        subjectId,
"""

In [39]:
# Append one imputation expression per feature column: a null value is
# replaced by the average of that column within the same subjectId partition,
# and the result is aliased as <feature>_imp. Every expression except the last
# one is followed by a comma.
# NOTE: `query` is extended in place, so re-running this cell without first
# re-running the cell that initialises `query` appends the columns again.
n_features = len(feat_cols)
for i, feat_col in enumerate(feat_cols):
    imputed_expr = f"COALESCE({feat_col}, AVG({feat_col}) OVER(PARTITION BY subjectId)) AS {feat_col}_imp"
    is_last = i == (n_features - 1)
    if is_last:
        query += "\n            " + imputed_expr + "\n        "
    else:
        query += "\n        " + imputed_expr + ",\n    "

In [40]:
print(i)

80


In [41]:
# Finish the statement: add the FROM clause and close the parenthesis that
# was opened after `AS` in the first fragment.
query += """
    FROM features)
"""

In [42]:
query

'\n    CREATE OR REPLACE TEMPORARY TABLE features_imp AS (SELECT\n        subjectId,\n\n        COALESCE(freq_kurt, AVG(freq_kurt) OVER(PARTITION BY subjectId)) AS freq_kurt_imp,\n    \n        COALESCE(freq_skew, AVG(freq_skew) OVER(PARTITION BY subjectId)) AS freq_skew_imp,\n    \n        COALESCE(freq_entropy, AVG(freq_entropy) OVER(PARTITION BY subjectId)) AS freq_entropy_imp,\n    \n        COALESCE(freq_mean, AVG(freq_mean) OVER(PARTITION BY subjectId)) AS freq_mean_imp,\n    \n        COALESCE(freq_median, AVG(freq_median) OVER(PARTITION BY subjectId)) AS freq_median_imp,\n    \n        COALESCE(freq_mode, AVG(freq_mode) OVER(PARTITION BY subjectId)) AS freq_mode_imp,\n    \n        COALESCE(freq_min, AVG(freq_min) OVER(PARTITION BY subjectId)) AS freq_min_imp,\n    \n        COALESCE(freq_max, AVG(freq_max) OVER(PARTITION BY subjectId)) AS freq_max_imp,\n    \n        COALESCE(freq_var, AVG(freq_var) OVER(PARTITION BY subjectId)) AS freq_var_imp,\n    \n        COALESCE(freq_st

In [43]:
# Execute the assembled statement, which creates the features_imp table.
conn.sql(query)

# Drop columns whose number of infinite values exceeds a threshold (5 by default); columns at or below the threshold are kept, and their infinite values should be imputed instead.

In [44]:
# Start of the query that counts infinite values per imputed column; the
# per-column expressions and the FROM clause are appended by the next cells.
query = """
    SELECT
"""

In [45]:
# Append one counter per feature column: ISINF(...) is cast to an integer and
# summed, giving the number of infinite values in <feature>_imp. Every
# expression except the last one is followed by a comma.
# NOTE: `query` is extended in place, so re-running this cell without first
# re-running the cell that initialises `query` appends the columns again.
n_features = len(feat_cols)
for i, feat_col in enumerate(feat_cols):
    inf_count_expr = f"SUM(CAST(ISINF({feat_col}_imp) AS INTEGER)) AS {feat_col}_imp"
    is_last = i == (n_features - 1)
    if is_last:
        query += "\n            " + inf_count_expr + "\n        "
    else:
        query += "\n        " + inf_count_expr + ",\n    "

In [46]:
# Finish the counting query by reading from the imputed features table.
query += """
    FROM features_imp
"""

In [47]:
query

'\n    SELECT\n\n        SUM(CAST(ISINF(freq_kurt_imp) AS INTEGER)) AS freq_kurt_imp,\n    \n        SUM(CAST(ISINF(freq_skew_imp) AS INTEGER)) AS freq_skew_imp,\n    \n        SUM(CAST(ISINF(freq_entropy_imp) AS INTEGER)) AS freq_entropy_imp,\n    \n        SUM(CAST(ISINF(freq_mean_imp) AS INTEGER)) AS freq_mean_imp,\n    \n        SUM(CAST(ISINF(freq_median_imp) AS INTEGER)) AS freq_median_imp,\n    \n        SUM(CAST(ISINF(freq_mode_imp) AS INTEGER)) AS freq_mode_imp,\n    \n        SUM(CAST(ISINF(freq_min_imp) AS INTEGER)) AS freq_min_imp,\n    \n        SUM(CAST(ISINF(freq_max_imp) AS INTEGER)) AS freq_max_imp,\n    \n        SUM(CAST(ISINF(freq_var_imp) AS INTEGER)) AS freq_var_imp,\n    \n        SUM(CAST(ISINF(freq_stddev_imp) AS INTEGER)) AS freq_stddev_imp,\n    \n        SUM(CAST(ISINF(freq_first_quart_imp) AS INTEGER)) AS freq_first_quart_imp,\n    \n        SUM(CAST(ISINF(freq_third_quart_imp) AS INTEGER)) AS freq_third_quart_imp,\n    \n        SUM(CAST(ISINF(freq_range_i

In [48]:
# Run the counting query and pull the result into a dataframe with one
# column per <feature>_imp.
inf_cnts_relation = conn.sql(query)
inf_cnts = inf_cnts_relation.fetchdf()
inf_cnts

Unnamed: 0,freq_kurt_imp,freq_skew_imp,freq_entropy_imp,freq_mean_imp,freq_median_imp,freq_mode_imp,freq_min_imp,freq_max_imp,freq_var_imp,freq_stddev_imp,...,spec_cont_max_imp,spec_cont_range_imp,spec_cont_var_imp,spec_cont_std_imp,spec_cont_first_quart_imp,spec_cont_third_quart_imp,spec_cont_inter_quart_range_imp,spec_cont_entropy_imp,spec_cont_kurt_imp,spec_cont_skew_imp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Infinity count recorded for mfcc_entropy_imp (last row of the counts frame).
int(inf_cnts["mfcc_entropy_imp"].iloc[-1])

402

In [50]:
def identify_inf_cols_to_remove(df, threshold=5):
    """
    Return the names of the columns whose infinity count exceeds `threshold`.

    It is assumed that df is single row and multidimensional (i.e. has
    multiple columns), with each cell holding the number of infinite values
    counted for that column. Only the last row of df is read.

    Parameters
    ----------
    df : dataframe of per-column infinity counts
    threshold : a column is flagged only when its count is strictly
        greater than this value (default 5)

    Returns
    -------
    list of the flagged column names, in df's column order. Each flagged
    column is also printed together with its count. An empty df yields
    an empty list.
    """
    to_remove = []

    # no rows means there are no counts to compare against the threshold
    if len(df) == 0:
        return to_remove

    for column in df.columns:
        # read the count from the dataframe that was passed in, not from a
        # notebook-level global
        inf_cnt = int(df[column].iloc[-1])
        if inf_cnt > threshold:
            print(f"column {column}: {inf_cnt}")
            to_remove.append(column)

    return to_remove

In [51]:
# Columns whose infinity count exceeds the default threshold of 5.
cols_to_remove = identify_inf_cols_to_remove(inf_cnts)
cols_to_remove

column mfcc_entropy_imp: 402


['mfcc_entropy_imp']

In [52]:
# cols_to_remove = ["test", "test2"]

In [53]:
# One "DROP COLUMN <name>" clause per flagged column, separated by newlines.
drop_clauses = [f"DROP COLUMN {name}" for name in cols_to_remove]
query_part = "\n".join(drop_clauses)
print(query_part)

DROP COLUMN mfcc_entropy_imp


In [54]:
# Build one complete ALTER TABLE statement per column flagged in
# `cols_to_remove`. Stacking several newline-separated DROP COLUMN clauses
# under a single ALTER TABLE is not valid SQL, so each column gets its own
# statement, terminated by a semicolon.
# An empty `cols_to_remove` yields an empty string instead of a dangling
# "ALTER TABLE features_imp" - presumably conn.sql treats that as a no-op;
# TODO confirm.
query = "\n".join(
    f"ALTER TABLE features_imp DROP COLUMN {col_to_remove};"
    for col_to_remove in cols_to_remove
)

In [55]:
print(query)


    ALTER TABLE features_imp 
    DROP COLUMN mfcc_entropy_imp



In [56]:
# Execute the ALTER TABLE statement built above against features_imp.
conn.sql(query)

In [57]:
# Preview the features_imp table after the column drop.
conn.sql("""
    SELECT * FROM features_imp
""")

┌───────────────────────┬────────────────────┬─────────────────────┬────────────────────┬────────────────────────┬────────────────────┬────────────────────┬────────────────────┬───────────────────┬───────────────────────┬─────────────────────┬──────────────────────┬──────────────────────┬───────────────────┬────────────────────────────┬─────────────────────┬────────────────────────┬────────────────────┬────────────────────┬────────────────────┬──────────────────────┬────────────────────┬────────────────────┬─────────────────────┬──────────────────────┬───────────────────────┬──────────────────────┬────────────────────┬────────────────────┬──────────────────┬────────────────────┬──────────────────────────┬──────────────────────────┬────────────────────────────────┬──────────────────────┬────────────────────┬────────────────────┬──────────────────────┬────────────────────────┬──────────────────────┬──────────────────────────┬─────────────────────┬─────────────────────┬───────────────────

# Split the imputed feature table into train, val, and test sets, and at the same time replace the categorical values of the label column with numerical ones.

In [58]:
# Build the training table: inner-join the imputed features to the train
# labels on subjectId, drop the subjectId column and the textual label, and
# add a numeric label (0 when the label is 'male', 1 for any other value).
conn.sql("""    
    CREATE OR REPLACE TEMPORARY TABLE train_data AS (
        WITH feature_label AS (
            SELECT 
                f.*,
                l.value AS label
            FROM train_labels l
            INNER JOIN features_imp f
            ON f.subjectId = l.subjectId
        )
        
        SELECT 
            * EXCLUDE (label, subjectId),
            CASE 
                WHEN label = 'male' THEN 0
                ELSE 1
            END AS label
        FROM feature_label
    )
""")

In [59]:
# Number of rows in train_data.
conn.sql("""
    SELECT COUNT(*) FROM train_data
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│          256 │
└──────────────┘

# If, for example, the last 3 windows of freq_kurt are null for every subject, then across all 4350 subjects there would be 3 × 4350 = 13050 nulls in total, which we ought to impute during the final preprocessing.

In [60]:
# Build the validation table: inner-join the imputed features to the val
# labels on subjectId, drop the subjectId column and the textual label, and
# add a numeric label (0 when the label is 'male', 1 for any other value).
conn.sql("""    
    CREATE OR REPLACE TEMPORARY TABLE val_data AS (
        WITH feature_label AS (
            SELECT 
                f.*,
                l.value AS label
            FROM val_labels l
            INNER JOIN features_imp f
            ON f.subjectId = l.subjectId
        )
        
        SELECT 
            * EXCLUDE (label, subjectId),
            CASE 
                WHEN label = 'male' THEN 0
                ELSE 1
            END AS label
        FROM feature_label
    )
""")

In [61]:
# Number of rows in val_data.
conn.sql("""
    SELECT COUNT(*) FROM val_data
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│           50 │
└──────────────┘

In [62]:
# Build the test table: inner-join the imputed features to the test
# labels on subjectId, drop the subjectId column and the textual label, and
# add a numeric label (0 when the label is 'male', 1 for any other value).
conn.sql("""    
    CREATE OR REPLACE TEMPORARY TABLE test_data AS (
        WITH feature_label AS (
            SELECT 
                f.*,
                l.value AS label
            FROM test_labels l
            INNER JOIN features_imp f
            ON f.subjectId = l.subjectId
        )
        
        SELECT 
            * EXCLUDE (label, subjectId),
            CASE 
                WHEN label = 'male' THEN 0
                ELSE 1
            END AS label
        FROM feature_label
    )
""")

In [63]:
# Number of rows in test_data.
conn.sql("""
    SELECT COUNT(*) FROM test_data
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│           96 │
└──────────────┘

# We check how imbalanced each split of our dataset is

In [64]:
# Rows per numeric label value in train_data.
conn.sql("""
    SELECT COUNT(label) AS label_cnt, label FROM train_data
    GROUP BY label
""")

┌───────────┬───────┐
│ label_cnt │ label │
│   int64   │ int32 │
├───────────┼───────┤
│       110 │     0 │
│       146 │     1 │
└───────────┴───────┘

In [65]:
# Rows per numeric label value in val_data.
conn.sql("""
    SELECT COUNT(label) AS label_cnt, label FROM val_data
    GROUP BY label
""")

┌───────────┬───────┐
│ label_cnt │ label │
│   int64   │ int32 │
├───────────┼───────┤
│        23 │     0 │
│        27 │     1 │
└───────────┴───────┘

In [66]:
# Rows per numeric label value in test_data.
conn.sql("""
    SELECT COUNT(label) AS label_cnt, label FROM test_data
    GROUP BY label
""")

┌───────────┬───────┐
│ label_cnt │ label │
│   int64   │ int32 │
├───────────┼───────┤
│        33 │     0 │
│        63 │     1 │
└───────────┴───────┘

# Once the data is split and recombined with our labels, we save each split into its own parquet file again

In [68]:
# # local
# SAVE_DIR = SILVER_DATA_DIR.format(
#     DATA_DIR=DATA_DIR,
#     FOLDER_NAME=SILVER_FOLDER_NAME,
#     SUB_FOLDER_NAME="stage-03"
# )
# os.makedirs(SAVE_DIR, exist_ok=True)
# SAVE_DIR

# cloud
# Destination directory: the same URL template, pointed at the "stage-03" sub folder.
save_dir_fields = {"FOLDER_NAME": SILVER_FOLDER_NAME, "SUB_FOLDER_NAME": "stage-03"}
SAVE_DIR = SILVER_DATA_DIR.format(**save_dir_fields)
SAVE_DIR

'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-03'

In [None]:
# conn.sql(f"""
#     COPY (SELECT * FROM train_data) TO '{SAVE_DIR}/train_data.parquet' (FORMAT parquet)
# """)

In [None]:
# conn.sql(f"""
#     COPY (SELECT * FROM val_data) TO '{SAVE_DIR}/val_data.parquet' (FORMAT parquet)
# """)

In [None]:
# conn.sql(f"""
#     COPY (SELECT * FROM test_data) TO '{SAVE_DIR}/test_data.parquet' (FORMAT parquet)
# """)

In [69]:
# Materialise each split table as a pyarrow Table, in train / val / test order.
train_data_table, val_data_table, test_data_table = (
    conn.sql(f"SELECT * FROM {split}_data").to_arrow_table()
    for split in ("train", "val", "test")
)

In [70]:
# Create a pyarrowfs_adlgen2 account handler for the storage account using the
# `credential` variable, and wrap it in a pyarrow PyFileSystem so that
# pyarrow.parquet can write through it.
handler = pa_adl.AccountHandler.from_account_name(storage_account_name, credential=credential)
fs = pa.fs.PyFileSystem(handler)

In [73]:
# Write each split to "<container>/stage-03/<split>_data.parquet" through the
# Azure-backed filesystem, in train / val / test order.
for split_name, split_table in (
    ("train", train_data_table),
    ("val", val_data_table),
    ("test", test_data_table),
):
    pq.write_table(split_table, f"sgppipelinesa-silver/stage-03/{split_name}_data.parquet", filesystem=fs)