In [1]:
import os
import io
import numpy as np
import matplotlib.pyplot as plt
import duckdb
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrowfs_adlgen2 as pa_adl

from dotenv import load_dotenv
from pathlib import Path

from azure.identity import DefaultAzureCredential, ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient
from azure.keyvault.secrets import SecretClient
from azure.core.exceptions import ResourceNotFoundError

%load_ext autoreload
%autoreload 2

# local

In [None]:
# DATA_DIR = "../../include/data"

In [None]:
# # local
# SILVER_FOLDER_NAME = "silver"
# SUB_FOLDER_NAME = "stage-02"
# SILVER_DATA_DIR = os.path.join("{DATA_DIR}", "{FOLDER_NAME}", "{SUB_FOLDER_NAME}").replace("\\", "/")
# SILVER_DATA_DIR

'{DATA_DIR}/{FOLDER_NAME}/{SUB_FOLDER_NAME}'

In [None]:
# subjects_features_paths = [
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR, 
#             FOLDER_NAME=SILVER_FOLDER_NAME, 
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ),
#         file
#     ).replace("\\", "/")
#     for file in
#     os.listdir(SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR, 
#         FOLDER_NAME=SILVER_FOLDER_NAME, 
#         SUB_FOLDER_NAME=SUB_FOLDER_NAME
#     ))
# ]

In [None]:
# subjects_features_paths

['../../include/data/silver/stage-02/1028-20100710-hne_features.parquet',
 '../../include/data/silver/stage-02/1337ad-20170321-ajg_features.parquet',
 '../../include/data/silver/stage-02/1337ad-20170321-tkg_features.parquet',
 '../../include/data/silver/stage-02/1snoke-20120412-hge_features.parquet',
 '../../include/data/silver/stage-02/23yipikaye-20100807-ujm_features.parquet',
 '../../include/data/silver/stage-02/Aaron-20080318-kdl_features.parquet',
 '../../include/data/silver/stage-02/Anniepoo-20140308-bft_features.parquet',
 '../../include/data/silver/stage-02/Anniepoo-20140308-cqj_features.parquet',
 '../../include/data/silver/stage-02/Anniepoo-20140308-fcp_features.parquet',
 '../../include/data/silver/stage-02/Anniepoo-20140308-hns_features.parquet',
 '../../include/data/silver/stage-02/Anniepoo-20140308-nky_features.parquet',
 '../../include/data/silver/stage-02/Coren-20141121-pxp_features.parquet']

In [None]:
# subjects_features_paths[0].replace("_signals.parquet", "", 1)

'../../include/data/silver/stage-02/1028-20100710-hne_features.parquet'

In [None]:
# for file in subjects_features_paths:
#     os.rename(file, file.replace("_signals.parquet", "", 1))

# load credentials for cloud

In [11]:
# Retrieve credentials from environment variables
# this is strictly used only in development
# load env variables
env_dir = Path('../../').resolve()
load_dotenv(os.path.join(env_dir, '.env'))

True

In [12]:
storage_account_name = os.environ.get("STORAGE_ACCOUNT_NAME")
credential = os.environ.get("STORAGE_ACCOUNT_KEY")
conn_str = os.environ.get("STORAGE_ACCOUNT_CONN_STR")

In [14]:
# cloud
URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net"
SILVER_FOLDER_NAME = "sgppipelinesa-silver"
SUB_FOLDER_NAME = "stage-02"
SILVER_DATA_DIR = os.path.join(URL, "{SUB_FOLDER_NAME}").replace("\\", "/")
SILVER_DATA_DIR

'abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net/{SUB_FOLDER_NAME}'

In [15]:
# cloud
# create client with generated sas token
datalake_service_client = DataLakeServiceClient(
    account_url=f"https://{storage_account_name}.dfs.core.windows.net", 
    credential=credential
)

# retrieves file system client/container client 
# to retrieve datalake client
silver_container_client = datalake_service_client.get_file_system_client(f"{storage_account_name}-silver")

# we only get the directories in the first level of 
# the container, if it has a "/" then it means it is not
# an immediate folder in the container. This only really
# gets the subject folders 
subjects_features_paths = [
    os.path.join(
        URL.format(
            FOLDER_NAME=SILVER_FOLDER_NAME
        ), 
        path.name
    ).replace("\\", "/")
    for path in silver_container_client.get_paths(path=SUB_FOLDER_NAME)
]
subjects_features_paths

['abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/1028-20100710-hne_features.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/1337ad-20170321-ajg_features.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/1337ad-20170321-tkg_features.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/1snoke-20120412-hge_features.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/23yipikaye-20100807-ujm_features.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/Aaron-20080318-kdl_features.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/Anniepoo-20140308-bft_features.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/Anniepoo-20140308-cqj_features.parquet',
 'abfss://sgppipelinesa-silver@sgppipelinesa.dfs.core.windows.net/stage-02/Anniepoo

# load all the single parquet files into one giant table (we won't need pyarrow for this one anymore as we won't need to convert between tables, numpy arrays, and pyarrow dataframes). This will now be easier to load as these only contain the relevant row of features from each subjects signals after windowing. We will also be loading the labels of different data splits so we can use it as basis to what subjects in this giant dataframe will be part of the train, val, and test set  

In [8]:
conn = duckdb.connect()

In [13]:
# for cloud only
# installing dependencies and creating secrets object
conn.sql(f"""INSTALL azure""")
conn.sql(f"""LOAD azure""")
conn.sql(f"""
    CREATE OR REPLACE SECRET az_sgp (
        TYPE azure,
        CONNECTION_STRING '{conn_str}'
    );
""")
# the is required if this notebook is run in linux environment
# like airflow container
conn.sql("SET azure_transport_option_type = 'curl'")

In [14]:
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE features AS (
        SELECT * FROM read_parquet({subjects_features_paths}, union_by_name=True, filename=False)
    )
""")

# originally we had 6318 subjects but if observed keenly we can see that we only have 4350 + 967 + 946 or just 6263 subjects now, as we are missing 55 because in ingest_labels we had to discard some subjects who did not have labels attached to them  

In [15]:
SILVER_DATA_DIR

'{DATA_DIR}/{FOLDER_NAME}/{SUB_FOLDER_NAME}'

In [None]:
# cloud
train_labels_path = os.path.join(
    SILVER_DATA_DIR.format(
        FOLDER_NAME=SILVER_FOLDER_NAME,
        SUB_FOLDER_NAME="stage-01"
    ),
    "train_labels.parquet",
    # "**",
    # "*.parquet"
).replace("\\", "/")

# # local
# train_labels_path = os.path.join(
#     SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME="stage-01"
#     ),
#     "train_labels.parquet"
# ).replace("\\", "/")

# train_labels_path

'../../include/data/silver/stage-01/train_labels.parquet'

In [19]:
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE train_labels AS (
        SELECT * FROM read_parquet('{train_labels_path}', union_by_name=True, filename=False)
    )
""")

In [20]:
conn.sql("""
    SELECT COUNT(*) FROM train_labels
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│            7 │
└──────────────┘

In [None]:
# cloud
val_labels_path = os.path.join(
    SILVER_DATA_DIR.format(
        FOLDER_NAME=SILVER_FOLDER_NAME,
        SUB_FOLDER_NAME="stage-01"
    ),
    "val_labels.parquet",
    # "**",
    # "*.parquet"
).replace("\\", "/")

# # local
# val_labels_path = os.path.join(
#     SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME="stage-01"
#     ),
#     "val_labels.parquet"
# ).replace("\\", "/")

# val_labels_path

'../../include/data/silver/stage-01/val_labels.parquet'

In [22]:
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE val_labels AS (
        SELECT * FROM read_parquet('{val_labels_path}', union_by_name=True, filename=False)
    )
""")

In [None]:
# cloud
test_labels_path = os.path.join(
    SILVER_DATA_DIR.format(
        FOLDER_NAME=SILVER_FOLDER_NAME,
        SUB_FOLDER_NAME="stage-01"
    ),
    "test_labels.parquet",
    # "**",
    # "*.parquet"
).replace("\\", "/")

# # local
# test_labels_path = os.path.join(
#     SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME="stage-01"
#     ),
#     "test_labels.parquet"
# ).replace("\\", "/")

# test_labels_path

'../../include/data/silver/stage-01/test_labels.parquet'

In [24]:
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE test_labels AS (
        SELECT * FROM read_parquet('{test_labels_path}', union_by_name=True, filename=False)
    )
""")

In [29]:
conn.sql("""
    SELECT * FROM test_labels
""")

┌───────────────────────┬─────────┬─────────┬───────┬───────────┐
│       subjectId       │  value  │  split  │ rowId │ partition │
│        varchar        │ varchar │ varchar │ int64 │   int64   │
├───────────────────────┼─────────┼─────────┼───────┼───────────┤
│ Anniepoo-20140308-hns │ female  │ test    │     0 │         0 │
│ Anniepoo-20140308-nky │ female  │ test    │     1 │         1 │
│ Coren-20141121-pxp    │ male    │ test    │     2 │         2 │
└───────────────────────┴─────────┴─────────┴───────┴───────────┘

In [32]:
conn.sql("""
    -- SELECT * FROM features
    SELECT mfcc_entropy FROM features
""")

┌──────────────┐
│ mfcc_entropy │
│    float     │
├──────────────┤
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│           ·  │
│           ·  │
│           ·  │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
├──────────────┤
│   402 rows   │
│  (20 shown)  │
└──────────────┘

# since prior feature engineering produced null values for each subject we will need to impute these values in our features table partitioned across each subject

In [33]:
rows = conn.sql("""
    SELECT column_name
    FROM (DESCRIBE features)
""").fetchall()
rows[:5]

[('subjectId',),
 ('freq_kurt',),
 ('freq_skew',),
 ('freq_entropy',),
 ('freq_mean',)]

In [34]:
feat_cols = list(filter(lambda feat_col: not "subjectId" in feat_col, [row[-1] for row in rows]))
len(feat_cols)

81

# <s>Drop columns with infinity values with threshold greater than 5 at the most, if below 5 then impute</s> Instead turn infinity rows into null values

In [85]:
query = """
    CREATE OR REPLACE TEMPORARY TABLE features_nulled AS (
        SELECT
            subjectId,
"""

In [None]:
n_features = len(feat_cols)
for i, feat_col in enumerate(feat_cols):
    if i == (n_features - 1): 
        query += f"""
            CASE
                WHEN {feat_col} = 'Infinity' THEN NULL
                WHEN {feat_col} = '-Infinity' THEN NULL
                ELSE {feat_col}
            END AS {feat_col}_nulled
        """
        break
    query += f"""
        CASE
            WHEN {feat_col} = 'Infinity' THEN NULL
            WHEN {feat_col} = '-Infinity' THEN NULL
            ELSE {feat_col}
        END AS {feat_col}_nulled,
    """


In [87]:
query += """
        FROM features
    );
"""

In [88]:
query

"\n    CREATE OR REPLACE TEMPORARY TABLE features_nulled AS (\n        SELECT\n            subjectId,\n\n        CASE\n            WHEN freq_kurt = 'Infinity' THEN NULL\n            WHEN freq_kurt = '-Infinity' THEN NULL\n            ELSE freq_kurt\n        END AS freq_kurt_nulled,\n    \n        CASE\n            WHEN freq_skew = 'Infinity' THEN NULL\n            WHEN freq_skew = '-Infinity' THEN NULL\n            ELSE freq_skew\n        END AS freq_skew_nulled,\n    \n        CASE\n            WHEN freq_entropy = 'Infinity' THEN NULL\n            WHEN freq_entropy = '-Infinity' THEN NULL\n            ELSE freq_entropy\n        END AS freq_entropy_nulled,\n    \n        CASE\n            WHEN freq_mean = 'Infinity' THEN NULL\n            WHEN freq_mean = '-Infinity' THEN NULL\n            ELSE freq_mean\n        END AS freq_mean_nulled,\n    \n        CASE\n            WHEN freq_median = 'Infinity' THEN NULL\n            WHEN freq_median = '-Infinity' THEN NULL\n            ELSE freq_

In [None]:
print(query)

In [97]:
conn.sql(query)

In [98]:
conn.sql("""
    SELECT mfcc_entropy_nulled FROM features_nulled
""")

┌─────────────────────┐
│ mfcc_entropy_nulled │
│        float        │
├─────────────────────┤
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                  ·  │
│                  ·  │
│                  ·  │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
│                NULL │
├─────────────────────┤
│ 402 rows (20 shown) │
└─────────────────────┘

# Imputation of null values

In [104]:
query = """
    CREATE OR REPLACE TEMPORARY TABLE features_imp AS (SELECT
        subjectId,
"""

In [105]:
n_features = len(feat_cols)
for i, feat_col in enumerate(feat_cols):
    if i == (n_features - 1): 
        query += f"""
            COALESCE({feat_col}_nulled, AVG({feat_col}_nulled) OVER(PARTITION BY subjectId)) AS {feat_col}_imp
        """
        break
    query += f"""
        COALESCE({feat_col}_nulled, AVG({feat_col}_nulled) OVER(PARTITION BY subjectId)) AS {feat_col}_imp,
    """

In [106]:
print(i)

80


In [107]:
query += """
        FROM features_nulled
    );
"""

In [108]:
query

'\n    CREATE OR REPLACE TEMPORARY TABLE features_imp AS (SELECT\n        subjectId,\n\n        COALESCE(freq_kurt_nulled, AVG(freq_kurt_nulled) OVER(PARTITION BY subjectId)) AS freq_kurt_imp,\n    \n        COALESCE(freq_skew_nulled, AVG(freq_skew_nulled) OVER(PARTITION BY subjectId)) AS freq_skew_imp,\n    \n        COALESCE(freq_entropy_nulled, AVG(freq_entropy_nulled) OVER(PARTITION BY subjectId)) AS freq_entropy_imp,\n    \n        COALESCE(freq_mean_nulled, AVG(freq_mean_nulled) OVER(PARTITION BY subjectId)) AS freq_mean_imp,\n    \n        COALESCE(freq_median_nulled, AVG(freq_median_nulled) OVER(PARTITION BY subjectId)) AS freq_median_imp,\n    \n        COALESCE(freq_mode_nulled, AVG(freq_mode_nulled) OVER(PARTITION BY subjectId)) AS freq_mode_imp,\n    \n        COALESCE(freq_min_nulled, AVG(freq_min_nulled) OVER(PARTITION BY subjectId)) AS freq_min_imp,\n    \n        COALESCE(freq_max_nulled, AVG(freq_max_nulled) OVER(PARTITION BY subjectId)) AS freq_max_imp,\n    \n       

In [109]:
print(query)


    CREATE OR REPLACE TEMPORARY TABLE features_imp AS (SELECT
        subjectId,

        COALESCE(freq_kurt_nulled, AVG(freq_kurt_nulled) OVER(PARTITION BY subjectId)) AS freq_kurt_imp,
    
        COALESCE(freq_skew_nulled, AVG(freq_skew_nulled) OVER(PARTITION BY subjectId)) AS freq_skew_imp,
    
        COALESCE(freq_entropy_nulled, AVG(freq_entropy_nulled) OVER(PARTITION BY subjectId)) AS freq_entropy_imp,
    
        COALESCE(freq_mean_nulled, AVG(freq_mean_nulled) OVER(PARTITION BY subjectId)) AS freq_mean_imp,
    
        COALESCE(freq_median_nulled, AVG(freq_median_nulled) OVER(PARTITION BY subjectId)) AS freq_median_imp,
    
        COALESCE(freq_mode_nulled, AVG(freq_mode_nulled) OVER(PARTITION BY subjectId)) AS freq_mode_imp,
    
        COALESCE(freq_min_nulled, AVG(freq_min_nulled) OVER(PARTITION BY subjectId)) AS freq_min_imp,
    
        COALESCE(freq_max_nulled, AVG(freq_max_nulled) OVER(PARTITION BY subjectId)) AS freq_max_imp,
    
        COALESCE(freq_var_nu

In [110]:
conn.sql(query)

In [111]:
conn.sql("""
    SELECT mfcc_entropy_imp FROM features_imp 
""")

┌──────────────────┐
│ mfcc_entropy_imp │
│      double      │
├──────────────────┤
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│               ·  │
│               ·  │
│               ·  │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
│             NULL │
├──────────────────┤
│     402 rows     │
│    (20 shown)    │
└──────────────────┘

In [112]:
conn.sql("""
    SELECT mfcc_entropy FROM features WHERE mfcc_entropy = '-Infinity' OR mfcc_entropy = 'Infinity'         
""")

┌──────────────┐
│ mfcc_entropy │
│    float     │
├──────────────┤
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│           ·  │
│           ·  │
│           ·  │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
│         -inf │
├──────────────┤
│   402 rows   │
│  (20 shown)  │
└──────────────┘

In [None]:
# int(inf_cnts["mfcc_entropy_imp"].to_list()[-1])

ValueError: cannot convert float NaN to integer

In [116]:
query = """
    SELECT
"""

In [117]:
n_features = len(feat_cols)
for i, feat_col in enumerate(feat_cols):
    if i == (n_features - 1): 
        query += f"""
            SUM(CAST(ISINF({feat_col}_imp) AS INTEGER)) AS {feat_col}_imp
        """
        break
    query += f"""
        SUM(CAST(ISINF({feat_col}_imp) AS INTEGER)) AS {feat_col}_imp,
    """

In [118]:
query += """
    FROM features_imp
"""

In [119]:
print(query)


    SELECT

        SUM(CAST(ISINF(freq_kurt_imp) AS INTEGER)) AS freq_kurt_imp,
    
        SUM(CAST(ISINF(freq_skew_imp) AS INTEGER)) AS freq_skew_imp,
    
        SUM(CAST(ISINF(freq_entropy_imp) AS INTEGER)) AS freq_entropy_imp,
    
        SUM(CAST(ISINF(freq_mean_imp) AS INTEGER)) AS freq_mean_imp,
    
        SUM(CAST(ISINF(freq_median_imp) AS INTEGER)) AS freq_median_imp,
    
        SUM(CAST(ISINF(freq_mode_imp) AS INTEGER)) AS freq_mode_imp,
    
        SUM(CAST(ISINF(freq_min_imp) AS INTEGER)) AS freq_min_imp,
    
        SUM(CAST(ISINF(freq_max_imp) AS INTEGER)) AS freq_max_imp,
    
        SUM(CAST(ISINF(freq_var_imp) AS INTEGER)) AS freq_var_imp,
    
        SUM(CAST(ISINF(freq_stddev_imp) AS INTEGER)) AS freq_stddev_imp,
    
        SUM(CAST(ISINF(freq_first_quart_imp) AS INTEGER)) AS freq_first_quart_imp,
    
        SUM(CAST(ISINF(freq_third_quart_imp) AS INTEGER)) AS freq_third_quart_imp,
    
        SUM(CAST(ISINF(freq_range_imp) AS INTEGER)) AS freq_ran

In [121]:
inf_cnts = conn.sql(query).fetchdf()
inf_cnts

Unnamed: 0,freq_kurt_imp,freq_skew_imp,freq_entropy_imp,freq_mean_imp,freq_median_imp,freq_mode_imp,freq_min_imp,freq_max_imp,freq_var_imp,freq_stddev_imp,...,spec_cont_max_imp,spec_cont_range_imp,spec_cont_var_imp,spec_cont_std_imp,spec_cont_first_quart_imp,spec_cont_third_quart_imp,spec_cont_inter_quart_range_imp,spec_cont_entropy_imp,spec_cont_kurt_imp,spec_cont_skew_imp
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
inf_cnts.transpose()[0].to_dict()

{'freq_kurt_imp': 0.0,
 'freq_skew_imp': 0.0,
 'freq_entropy_imp': 0.0,
 'freq_mean_imp': 0.0,
 'freq_median_imp': 0.0,
 'freq_mode_imp': 0.0,
 'freq_min_imp': 0.0,
 'freq_max_imp': 0.0,
 'freq_var_imp': 0.0,
 'freq_stddev_imp': 0.0,
 'freq_first_quart_imp': 0.0,
 'freq_third_quart_imp': 0.0,
 'freq_range_imp': 0.0,
 'freq_inter_quart_range_imp': 0.0,
 'zcr_imp': 0.0,
 'poly_feat_1_imp': 0.0,
 'poly_feat_2_imp': 0.0,
 'spec_cent_imp': 0.0,
 'spec_bw_imp': 0.0,
 'spec_flat_imp': 0.0,
 'spec_roll_imp': 0.0,
 'mel_spec_mean_imp': 0.0,
 'mel_spec_median_imp': 0.0,
 'mel_spec_mode_imp': 0.0,
 'mel_spec_mode_cnt_imp': 0.0,
 'mel_spec_min_imp': 0.0,
 'mel_spec_max_imp': 0.0,
 'mel_spec_range_imp': 0.0,
 'mel_spec_var_imp': 0.0,
 'mel_spec_std_imp': 0.0,
 'mel_spec_first_quart_imp': 0.0,
 'mel_spec_third_quart_imp': 0.0,
 'mel_spec_inter_quart_range_imp': 0.0,
 'mel_spec_entropy_imp': 0.0,
 'mel_spec_kurt_imp': 0.0,
 'mel_spec_skew_imp': 0.0,
 'mel_spec_db_mean_imp': 0.0,
 'mel_spec_db_median_

In [None]:
def identify_inf_cols_to_remove(df, threshold=5):
    """
    it is assumed that df is single row and multidimensional 
    or with multiple columns
    """
    to_remove = []
    for column in df.columns:
        inf_cnt = int(inf_cnts[column].to_list()[-1])
        if inf_cnt > threshold:
            print(f"column {column}: {inf_cnt}")
            to_remove.append(column)

    return to_remove

In [None]:
cols_to_remove = identify_inf_cols_to_remove(inf_cnts)
# cols_to_remove = ["test", "test2"]
cols_to_remove

In [None]:
# query_part = "\n".join([f"DROP COLUMN {col_to_remove}" for col_to_remove in cols_to_remove])# cols_to_remove = ["test", "test2"]
# print(query_part)

In [None]:
# query = f"""
#     ALTER TABLE features_imp 
#     {query_part}
# """

because turning infs to null values when all values/rows in that column are infs are averaged and still used for imputation still produce null values. What if there are cases when rows have now values that aren't inf? Then that means we'd turn the inf values to null, and during imputation whatever non null values are in the column that iwll be used to calculate the average and then impute the rest of the null values,

but what if there aren't any values that are non inf? Then that means we'd be calculating null averages over and over?

with non inf:
`inf, 1, 2, 3 --nullification--> null, 1, 2, 3 --imputation--> 2, 1, 2, 3`

with inf only:
`inf, inf, inf, inf --nullification--> null, null, null, null --imputation--> null, null, null, null`

with inf only what if?
`inf, inf, inf, inf --nullification--> null, null, null, null --imputation--> 0, 0, 0, 0`

with non inf only what if?
`inf, 1, 2, 3, 0, 0, 0 --nullification--> null, 1, 2, 3, 0, 0, 0 --imputation using 0--> 0, 1, 2, 3, 0, 0, 0`

the problem with the above is we are filling in null values with 0 when in fact it could be filled with  a userful value like the average of the numbers in the column, problem with using 0 for imputation is it it's not like a pure 0 feature that's part and parcel of the instance/row which directly corresponds to a label say male or female. Because what if for that row that naturally has that 0 feature it corresponds to male and mostly male? And we then impute 0 for that null value when it fact corresponds to a differetn label say female?? Then this would lead to incaccuracies in our model, given that 0 most usually is associated iwth male, we cannot associate a 0 that's been used for imputation for the label female

so mas maganda talaga ideally is instead of null, we use zeros only if all values in the column are null and instead use imputation from average if there are non infs the column before they are turned into null values

# Seeing nullified and imputed features

In [132]:
conn.sql("""
    SELECT * FROM features_imp
""")

┌───────────────────────┬─────────────────────┬──────────────────────┬────────────────────┬────────────────────────┬─────────────────────┬────────────────────┬────────────────────┬───────────────────┬───────────────────────┬─────────────────────┬──────────────────────┬──────────────────────┬───────────────────┬────────────────────────────┬─────────────────────┬────────────────────────┬────────────────────┬────────────────────┬────────────────────┬──────────────────────┬────────────────────┬────────────────────┬─────────────────────┬──────────────────────┬───────────────────────┬──────────────────────┬────────────────────┬────────────────────┬──────────────────┬────────────────────┬──────────────────────────┬──────────────────────────┬────────────────────────────────┬──────────────────────┬────────────────────┬────────────────────┬──────────────────────┬────────────────────────┬──────────────────────┬──────────────────────────┬─────────────────────┬─────────────────────┬────────────────

# split the imputed feature table into train, val, and test sets and with it replace categorical values of label column to a numerical one 

In [133]:
conn.sql("""    
    CREATE OR REPLACE TEMPORARY TABLE train_data AS (
        WITH feature_label AS (
            SELECT 
                f.*,
                l.value AS label
            FROM train_labels l
            INNER JOIN features_imp f
            ON f.subjectId = l.subjectId
        )
        
        SELECT 
            * EXCLUDE (label, subjectId),
            CASE 
                WHEN label = 'male' THEN 0
                ELSE 1
            END AS label
        FROM feature_label
    )
""")

In [134]:
conn.sql("""
    SELECT COUNT(*) FROM train_data
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│          256 │
└──────────────┘

# if for example the last 3 windows of freq_kurt is null then for all 4350 subjects there would be 13050 nulls in total which we ought to impute during the final preprocessing

In [135]:
conn.sql("""    
    CREATE OR REPLACE TEMPORARY TABLE val_data AS (
        WITH feature_label AS (
            SELECT 
                f.*,
                l.value AS label
            FROM val_labels l
            INNER JOIN features_imp f
            ON f.subjectId = l.subjectId
        )
        
        SELECT 
            * EXCLUDE (label, subjectId),
            CASE 
                WHEN label = 'male' THEN 0
                ELSE 1
            END AS label
        FROM feature_label
    )
""")

In [136]:
conn.sql("""
    SELECT COUNT(*) FROM val_data
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│           50 │
└──────────────┘

In [137]:
conn.sql("""    
    CREATE OR REPLACE TEMPORARY TABLE test_data AS (
        WITH feature_label AS (
            SELECT 
                f.*,
                l.value AS label
            FROM test_labels l
            INNER JOIN features_imp f
            ON f.subjectId = l.subjectId
        )
        
        SELECT 
            * EXCLUDE (label, subjectId),
            CASE 
                WHEN label = 'male' THEN 0
                ELSE 1
            END AS label
        FROM feature_label
    )
""")

In [138]:
conn.sql("""
    SELECT COUNT(*) FROM test_data
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│           96 │
└──────────────┘

# We check how much our each split of our dataset is imbalanced

In [139]:
conn.sql("""
    SELECT COUNT(label) AS label_cnt, label FROM train_data
    GROUP BY label
""")

┌───────────┬───────┐
│ label_cnt │ label │
│   int64   │ int32 │
├───────────┼───────┤
│       110 │     0 │
│       146 │     1 │
└───────────┴───────┘

In [140]:
conn.sql("""
    SELECT COUNT(label) AS label_cnt, label FROM val_data
    GROUP BY label
""")

┌───────────┬───────┐
│ label_cnt │ label │
│   int64   │ int32 │
├───────────┼───────┤
│        23 │     0 │
│        27 │     1 │
└───────────┴───────┘

In [141]:
conn.sql("""
    SELECT COUNT(label) AS label_cnt, label FROM test_data
    GROUP BY label
""")

┌───────────┬───────┐
│ label_cnt │ label │
│   int64   │ int32 │
├───────────┼───────┤
│        33 │     0 │
│        63 │     1 │
└───────────┴───────┘

# once data is split and now recombined to our labels we save each into individual parquet files again  

In [146]:
# # local
# SAVE_DIR = SILVER_DATA_DIR.format(
#     DATA_DIR=DATA_DIR,
#     FOLDER_NAME=SILVER_FOLDER_NAME,
#     SUB_FOLDER_NAME="stage-03"
# )
# os.makedirs(SAVE_DIR, exist_ok=True)
# SAVE_DIR

# cloud
SAVE_DIR = SILVER_DATA_DIR.format(
    FOLDER_NAME=SILVER_FOLDER_NAME,
    SUB_FOLDER_NAME="stage-03"
)
SAVE_DIR

KeyError: 'DATA_DIR'

In [None]:
# conn.sql(f"""
#     COPY (SELECT * FROM train_data) TO '{SAVE_DIR}/train_data.parquet' (FORMAT parquet)
# """)

In [None]:
# conn.sql(f"""
#     COPY (SELECT * FROM val_data) TO '{SAVE_DIR}/val_data.parquet' (FORMAT parquet)
# """)

In [None]:
# conn.sql(f"""
#     COPY (SELECT * FROM test_data) TO '{SAVE_DIR}/test_data.parquet' (FORMAT parquet)
# """)

In [None]:
train_data_table = conn.sql("SELECT * FROM train_data").to_arrow_table()
val_data_table = conn.sql("SELECT * FROM val_data").to_arrow_table()
test_data_table = conn.sql("SELECT * FROM test_data").to_arrow_table()

In [None]:
handler = pa_adl.AccountHandler.from_account_name(storage_account_name, credential=credential)
fs = pa.fs.PyFileSystem(handler)

In [None]:
pq.write_table(train_data_table, "sgppipelinesa-silver/stage-03/train_data.parquet", filesystem=fs)
pq.write_table(val_data_table, "sgppipelinesa-silver/stage-03/val_data.parquet", filesystem=fs)
pq.write_table(test_data_table, "sgppipelinesa-silver/stage-03/test_data.parquet", filesystem=fs)