In [27]:
import os
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrowfs_adlgen2 as pa_adl
import pyarrow.dataset as ds
import numpy as np
import librosa
import duckdb
import matplotlib.pyplot as plt

from dotenv import load_dotenv
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

from azure.identity import DefaultAzureCredential, ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient
from azure.keyvault.secrets import SecretClient
from azure.core.exceptions import ResourceNotFoundError

from utilities.visualizers import view_signal_feature
from utilities.feature_extractors import extract_spectogam_stats

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# relative path of the folder that holds the locally stored data; it fills the
# "{DATA_DIR}" placeholder of the (commented-out) local path templates
DATA_DIR = "../include/data"

In [29]:
# cloud
# URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net"
# NOTE: URL and SILVER_DATA_DIR are *templates*. The "{FOLDER_NAME}" and
# "{SUB_FOLDER_NAME}" placeholders are deliberately left unformatted here so
# that they can be filled in later with `str.format`.
SILVER_FOLDER_NAME = "sgppipelinesa-silver"
SUB_FOLDER_NAME = "stage-01"
URL = "{FOLDER_NAME}"

# always join with forward slashes so the template looks the same on every OS
SILVER_DATA_DIR = "/".join((URL, "{SUB_FOLDER_NAME}"))
SILVER_DATA_DIR

# # local
# SILVER_FOLDER_NAME = "silver"
# SUB_FOLDER_NAME = "stage-01"
# SILVER_DATA_DIR = os.path.join("{DATA_DIR}", "{FOLDER_NAME}", "{SUB_FOLDER_NAME}").replace("\\", "/")
# SILVER_DATA_DIR

'{FOLDER_NAME}/{SUB_FOLDER_NAME}'

# Computing features for a single subject's audio signals
This is to get familiar with exactly which features we need to compute for all the other subjects' audio signals.

In [30]:
# # anonymous-20080904-qzg_signals
# # _caustic_-20170306-smy_signals
# subject_table = pq.read_table(
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             FOLDER_NAME=SILVER_FOLDER_NAME,
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ), 
#         "anonymous-20080904-qzg_signals.parquet"
#     )
# )
# subject_table

In [31]:
# # connect to an in-memory database
# conn = duckdb.connect()

In [32]:
# conn.sql("""
#     SELECT COUNT(*) FROM subject_table
# """)

In [33]:
# # hertz
# hertz = 16000
# window_time = 3
# hop_time = 1

# # we calculate the window size of each segment or the
# # amount of samples it has to have based on the frequency
# samples_per_win_size = int(window_time * hertz)
# samples_per_hop_size = int(hop_time * hertz)

# Computing statistical features

In [34]:
# conn.sql(f"""
#     CREATE OR REPLACE TEMPORARY TABLE subject_features AS (
#         SELECT
#             signals, 
#             subjectId, 
#             rowId,
#             KURTOSIS(signals) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_kurt,
#             SKEWNESS(signals) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_skew,
#             ENTROPY(signals)OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_entropy,
#             AVG(signals) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_mean,
#             MEDIAN(signals) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_median,
#             MODE(signals) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_mode,
#             MIN(signals) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_min,
#             MAX(signals) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_max,
#             VAR_SAMP(signals) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_var,
#             STDDEV_SAMP(signals) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_stddev,
#             QUANTILE_CONT(signals, 0.25) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_first_quart,
#             QUANTILE_CONT(signals, 0.75) OVER(PARTITION BY subjectId ORDER BY rowId ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING) AS freq_third_quart
#         FROM subject_table
#         WHERE (rowId % {samples_per_hop_size}) = 0
#         ORDER BY rowId
#     )
# """)

In [35]:
# conn.sql("""
#     CREATE OR REPLACE TEMPORARY TABLE subject_features AS (
#         SELECT 
#             *,
#             (freq_max - freq_min) AS freq_range,
#             (freq_third_quart - freq_first_quart) AS freq_inter_quart_range
#         FROM subject_features
#     )
# """)

In [36]:
# conn.sql("""
#     SELECT * FROM subject_features
# """)

#### 544000 + a window of 48000 is 592000, which is greater than 553472, so for this last window we consider only the indices 544000 to 553471, which is just 9472 rows of data

In [37]:
# conn.sql("""
#     SELECT COUNT(*) FROM subject_table
# """)

# imputing missing or null values created from feature engineering

In [38]:
# conn.sql("""
#     SELECT 
#         COALESCE(
#             freq_kurt, 
#             (SELECT AVG(freq_kurt) FROM subject_features)
#         ) AS freq_kurt_imp,
#         COALESCE(
#             freq_skew, 
#             (SELECT AVG(freq_skew) FROM subject_features)
#         ) AS freq_skew_imp,
#         COALESCE(
#             freq_entropy, 
#             (SELECT AVG(freq_entropy) FROM subject_features)
#         ) AS freq_entropy_imp
#     FROM subject_features
# """)

# Computing spectral features

In [39]:
# frames = conn.sql("""
#     SELECT COUNT(rowId) FROM subject_features
# """).fetchall()[-1][-1]

In [40]:
# frames

In [41]:
# time = librosa.frames_to_time(frames, sr=hertz, hop_length=samples_per_hop_size)
# time

In [42]:
# subject_table["signals"].to_numpy()

In [43]:
# x_signals = subject_table["signals"].to_numpy()
# x_signals

In [44]:
# # calculate other features
# zcr = librosa.feature.zero_crossing_rate(y=x_signals, frame_length=samples_per_win_size, hop_length=samples_per_hop_size)
# zcr

In [45]:
# zcr.shape[1]

In [46]:
# new_zcr = zcr.reshape(-1)
# new_zcr.shape

In [47]:
# view_signal_feature(new_zcr, "zero crossing rate")

#### Sometimes the calculation of the spectral features yields one or more extra data points compared to the number of statistical features computed from the same window length and hop length. To make sure the statistical and spectral features have the same number of values, we keep only the first n values of each spectral feature, where n is the number of rows of the statistical features. For example, if 35 rows of statistical features were successfully calculated and we have 36 values of a spectral feature, we keep only the first 35 values of the spectral feature and discard the rest.

In [48]:
# zcr_n_values_to_rem = np.abs(zcr.shape[1] - time)
# zcr_n_values_to_rem.astype(int)

In [49]:
# # get slice of those in range with time only
# new_zcr = zcr.reshape(-1)[:frames]
# new_zcr.shape

In [50]:
# new_zcr.shape

In [51]:
# # these are 2 features all in all we don't need to aggregate it
# # into a (1, 35) array
# poly_feats = librosa.feature.poly_features(y=x_signals, sr=hertz, n_fft=samples_per_win_size, hop_length=samples_per_hop_size)
# poly_feats.shape

In [52]:
# new_poly_feats = poly_feats[:, :frames]
# new_poly_feats.shape

In [53]:
# # y is the audio signals we must pass
# # sr is the sampling rate of our audio signals
# # n_fft is the window size of the fast fourier transform
# # hop_length is number of samples between successive frames
# # n_mels is the number of Mel bands to generate
# mel_spec = librosa.feature.melspectrogram(y=x_signals, sr=hertz, n_fft=samples_per_win_size, hop_length=samples_per_hop_size, n_mels=90)
# mel_spec.shape

In [54]:
# mel_spec_mean, \
# mel_spec_median, \
# mel_spec_mode, \
# mel_spec_mode_cnt, \
# mel_spec_min, \
# mel_spec_max, \
# mel_spec_range, \
# mel_spec_var, \
# mel_spec_std, \
# mel_spec_first_quart, \
# mel_spec_third_quart, \
# mel_spec_inter_quart_range, \
# mel_spec_entropy, \
# mel_spec_kurt, \
# mel_spec_skew = extract_spectogam_stats(mel_spec)

In [55]:
# mel_spec_mode

In [56]:
# mel_spec_mean.shape, mel_spec_median.shape, mel_spec_mode.shape

In [57]:
# mel_spec_entropy.shape, mel_spec_kurt.shape, mel_spec_skew.shape

In [58]:
# librosa.display.specshow(mel_spec)

In [59]:
# mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
# mel_spec_db

In [60]:
# mel_spec_db_mean, \
# mel_spec_db_median, \
# mel_spec_db_mode, \
# mel_spec_db_mode_cnt, \
# mel_spec_db_min, \
# mel_spec_db_max, \
# mel_spec_db_range, \
# mel_spec_db_var, \
# mel_spec_db_std, \
# mel_spec_db_first_quart, \
# mel_spec_db_third_quart, \
# mel_spec_db_inter_quart_range, \
# mel_spec_db_entropy, \
# mel_spec_db_kurt, \
# mel_spec_db_skew = extract_spectogam_stats(mel_spec_db)

In [61]:
# librosa.display.specshow(mel_spec_db)

In [62]:
# spec_cent = librosa.feature.spectral_centroid(y=x_signals, sr=hertz, n_fft=samples_per_win_size, hop_length=samples_per_hop_size)
# spec_cent.shape

In [63]:
# new_spec_cent = spec_cent.reshape(-1)
# new_spec_cent.shape

In [64]:
# view_signal_feature(new_spec_cent, "spectral centroid")

In [65]:
# mfcc = librosa.feature.mfcc(y=x_signals, sr=hertz, n_fft=samples_per_win_size, hop_length=samples_per_hop_size, n_mfcc=90)
# mfcc.shape

In [66]:
# mfcc_mean, \
# mfcc_median, \
# mfcc_mode, \
# mfcc_mode_cnt, \
# mfcc_min, \
# mfcc_max, \
# mfcc_range, \
# mfcc_var, \
# mfcc_std, \
# mfcc_first_quart, \
# mfcc_third_quart, \
# mfcc_inter_quart_range, \
# mfcc_entropy, \
# mfcc_kurt, \
# mfcc_skew = extract_spectogam_stats(mfcc)

In [67]:
# librosa.display.specshow(mfcc)

In [68]:
# spec_bw = librosa.feature.spectral_bandwidth(y=x_signals, sr=hertz, n_fft=samples_per_win_size, hop_length=samples_per_hop_size)
# spec_bw.shape

In [69]:
# new_spec_bw = spec_bw.reshape(-1)
# new_spec_bw.shape

In [70]:
# view_signal_feature(new_spec_bw, "spectral bandwidth")

In [71]:
# spec_cont = librosa.feature.spectral_contrast(y=x_signals, sr=hertz, n_fft=samples_per_win_size, hop_length=samples_per_hop_size)
# spec_cont.shape

In [72]:
# spec_cont_mean, \
# spec_cont_median, \
# spec_cont_mode, \
# spec_cont_mode_cnt, \
# spec_cont_min, \
# spec_cont_max, \
# spec_cont_range, \
# spec_cont_var, \
# spec_cont_std, \
# spec_cont_first_quart, \
# spec_cont_third_quart, \
# spec_cont_inter_quart_range, \
# spec_cont_entropy, \
# spec_cont_kurt, \
# spec_cont_skew = extract_spectogam_stats(spec_cont)

In [73]:
# librosa.display.specshow(spec_cont)

In [74]:
# spec_flat = librosa.feature.spectral_flatness(y=x_signals, n_fft=samples_per_win_size, hop_length=samples_per_hop_size)
# spec_flat.shape

In [75]:
# new_spec_flat = spec_flat.reshape(-1)
# new_spec_flat.shape

In [76]:
# view_signal_feature(new_spec_flat, "spectral flatness")

In [77]:
# spec_roll = librosa.feature.spectral_rolloff(y=x_signals, sr=hertz, n_fft=samples_per_win_size, hop_length=samples_per_hop_size)
# spec_roll.shape

In [78]:
# new_spec_roll = spec_roll.reshape(-1)
# new_spec_roll.shape

In [79]:
# view_signal_feature(new_spec_roll, "spectral rolloff")

In [80]:
# subject_features = conn.sql("""
#     SELECT * FROM subject_features
# """).to_arrow_table()

In [81]:
# subject_features

In [82]:
# subject_features = subject_features.append_column("zcr", [new_zcr])
# subject_features = subject_features.append_column("poly_feat_1", [poly_feats[0, :]])
# subject_features = subject_features.append_column("poly_feat_2", [poly_feats[1, :]])
# subject_features = subject_features.append_column("spec_cent", [new_spec_cent])
# subject_features = subject_features.append_column("spec_bw", [new_spec_bw])
# subject_features = subject_features.append_column("spec_flat", [new_spec_flat])
# subject_features = subject_features.append_column("spec_roll", [new_spec_roll])

# subject_features = subject_features.append_column("mel_spec_mean", [mel_spec_mean])
# subject_features = subject_features.append_column("mel_spec_median", [mel_spec_median])
# subject_features = subject_features.append_column("mel_spec_mode", [mel_spec_mode])
# subject_features = subject_features.append_column("mel_spec_mode_cnt", [mel_spec_mode_cnt])
# subject_features = subject_features.append_column("mel_spec_min", [mel_spec_min])
# subject_features = subject_features.append_column("mel_spec_max", [mel_spec_max])
# subject_features = subject_features.append_column("mel_spec_range", [mel_spec_range])
# subject_features = subject_features.append_column("mel_spec_var", [mel_spec_var])
# subject_features = subject_features.append_column("mel_spec_std", [mel_spec_std])
# subject_features = subject_features.append_column("mel_spec_first_quart", [mel_spec_first_quart])
# subject_features = subject_features.append_column("mel_spec_third_quart", [mel_spec_third_quart])
# subject_features = subject_features.append_column("mel_spec_inter_quart_range", [mel_spec_inter_quart_range])
# subject_features = subject_features.append_column("mel_spec_entropy", [mel_spec_entropy])
# subject_features = subject_features.append_column("mel_spec_kurt", [mel_spec_kurt])
# subject_features = subject_features.append_column("mel_spec_skew", [mel_spec_skew])

# subject_features = subject_features.append_column("mel_spec_db_mean", [mel_spec_db_mean])
# subject_features = subject_features.append_column("mel_spec_db_median", [mel_spec_db_median])
# subject_features = subject_features.append_column("mel_spec_db_mode", [mel_spec_db_mode])
# subject_features = subject_features.append_column("mel_spec_db_mode_cnt", [mel_spec_db_mode_cnt])
# subject_features = subject_features.append_column("mel_spec_db_min", [mel_spec_db_min])
# subject_features = subject_features.append_column("mel_spec_db_max", [mel_spec_db_max])
# subject_features = subject_features.append_column("mel_spec_db_range", [mel_spec_db_range])
# subject_features = subject_features.append_column("mel_spec_db_var", [mel_spec_db_var])
# subject_features = subject_features.append_column("mel_spec_db_std", [mel_spec_db_std])
# subject_features = subject_features.append_column("mel_spec_db_first_quart", [mel_spec_db_first_quart])
# subject_features = subject_features.append_column("mel_spec_db_third_quart", [mel_spec_db_third_quart])
# subject_features = subject_features.append_column("mel_spec_db_inter_quart_range", [mel_spec_db_inter_quart_range])
# subject_features = subject_features.append_column("mel_spec_db_entropy", [mel_spec_db_entropy])
# subject_features = subject_features.append_column("mel_spec_db_kurt", [mel_spec_db_kurt])
# subject_features = subject_features.append_column("mel_spec_db_skew", [mel_spec_db_skew])

# subject_features = subject_features.append_column("mfcc_mean", [mfcc_mean])
# subject_features = subject_features.append_column("mfcc_median", [mfcc_median])
# subject_features = subject_features.append_column("mfcc_mode", [mfcc_mode])
# subject_features = subject_features.append_column("mfcc_mode_cnt", [mfcc_mode_cnt])
# subject_features = subject_features.append_column("mfcc_min", [mfcc_min])
# subject_features = subject_features.append_column("mfcc_max", [mfcc_max])
# subject_features = subject_features.append_column("mfcc_range", [mfcc_range])
# subject_features = subject_features.append_column("mfcc_var", [mfcc_var])
# subject_features = subject_features.append_column("mfcc_std", [mfcc_std])
# subject_features = subject_features.append_column("mfcc_first_quart", [mfcc_first_quart])
# subject_features = subject_features.append_column("mfcc_third_quart", [mfcc_third_quart])
# subject_features = subject_features.append_column("mfcc_inter_quart_range", [mfcc_inter_quart_range])
# subject_features = subject_features.append_column("mfcc_entropy", [mfcc_entropy])
# subject_features = subject_features.append_column("mfcc_kurt", [mfcc_kurt])
# subject_features = subject_features.append_column("mfcc_skew", [mfcc_skew])

# subject_features = subject_features.append_column("spec_cont_mean", [spec_cont_mean])
# subject_features = subject_features.append_column("spec_cont_median", [spec_cont_median])
# subject_features = subject_features.append_column("spec_cont_mode", [spec_cont_mode])
# subject_features = subject_features.append_column("spec_cont_mode_cnt", [spec_cont_mode_cnt])
# subject_features = subject_features.append_column("spec_cont_min", [spec_cont_min])
# subject_features = subject_features.append_column("spec_cont_max", [spec_cont_max])
# subject_features = subject_features.append_column("spec_cont_range", [spec_cont_range])
# subject_features = subject_features.append_column("spec_cont_var", [spec_cont_var])
# subject_features = subject_features.append_column("spec_cont_std", [spec_cont_std])
# subject_features = subject_features.append_column("spec_cont_first_quart", [spec_cont_first_quart])
# subject_features = subject_features.append_column("spec_cont_third_quart", [spec_cont_third_quart])
# subject_features = subject_features.append_column("spec_cont_inter_quart_range", [spec_cont_inter_quart_range])
# subject_features = subject_features.append_column("spec_cont_entropy", [spec_cont_entropy])
# subject_features = subject_features.append_column("spec_cont_kurt", [spec_cont_kurt])
# subject_features = subject_features.append_column("spec_cont_skew", [spec_cont_skew])

In [83]:
# subject_features

In [84]:
# subject_features.shape

# We have computed more than enough of the features we need; now we will load all of the subjects' audio signal parquet files into one giant table

In [85]:
# # local
# signal_file_paths = [
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             SILVER_FOLDER_NAME=SILVER_FOLDER_NAME,
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ), 
#         signal_file_path
#     ).replace("\\", "/") 
#     for signal_file_path in os.listdir(SILVER_DATA_DIR.format(
#         DATA_DIR=DATA_DIR,
#         SILVER_FOLDER_NAME=SILVER_FOLDER_NAME,
#         SUB_FOLDER_NAME=SUB_FOLDER_NAME
#     )) 
#     if (not "labels" in signal_file_path) and (".parquet" in signal_file_path)
# ]
# len(signal_file_paths)

In [86]:
# # local
# signal_file_paths_test = [
#     os.path.join(
#         SILVER_DATA_DIR.format(
#             DATA_DIR=DATA_DIR,
#             SILVER_FOLDER_NAME=SILVER_FOLDER_NAME,
#             SUB_FOLDER_NAME=SUB_FOLDER_NAME
#         ),
#         signal_file_path_test
#     ).replace("\\", "/")
#     for signal_file_path_test in ["anonymous-20080904-qzg_signals.parquet", "_caustic_-20170306-smy_signals.parquet"]
# ]
# signal_file_paths_test

In [87]:
# Development only: the credentials are kept in a `.env` file that lives one
# directory above this notebook. Loading it copies its entries into the
# process environment so that later cells can read them with `os.environ`.
env_dir = Path('../').resolve()
load_dotenv(env_dir / '.env')

True

In [88]:
# storage account settings taken from the environment; each one is
# None when the corresponding variable is not set
storage_account_name = os.getenv("STORAGE_ACCOUNT_NAME")
credential = os.getenv("STORAGE_ACCOUNT_KEY")
conn_str = os.getenv("STORAGE_ACCOUNT_CONN_STR")

In [89]:
SILVER_DATA_DIR

'{FOLDER_NAME}/{SUB_FOLDER_NAME}'

In [94]:
# cloud
# client for the storage account's data lake (dfs) endpoint, authenticated
# with the credential that was loaded from STORAGE_ACCOUNT_KEY
datalake_service_client = DataLakeServiceClient(
    credential=credential,
    account_url=f"https://{storage_account_name}.dfs.core.windows.net",
)

# file system (container) client of the "<storage account>-silver" container
silver_container_client = datalake_service_client.get_file_system_client(
    f"{storage_account_name}-silver"
)

# from the entries listed under SUB_FOLDER_NAME keep the parquet files whose
# name does not contain "labels" (i.e. the subjects' signal files) and prefix
# each one with the container name, always using forward slashes
signal_file_paths = [
    os.path.join(SILVER_FOLDER_NAME, entry.name).replace("\\", "/")
    for entry in silver_container_client.get_paths(path=SUB_FOLDER_NAME)
    if ".parquet" in entry.name and "labels" not in entry.name
]
signal_file_paths

['sgppipelinesa-silver/stage-01/1028-20100710-hne_signals.parquet',
 'sgppipelinesa-silver/stage-01/1337ad-20170321-ajg_signals.parquet',
 'sgppipelinesa-silver/stage-01/1337ad-20170321-tkg_signals.parquet',
 'sgppipelinesa-silver/stage-01/1snoke-20120412-hge_signals.parquet',
 'sgppipelinesa-silver/stage-01/23yipikaye-20100807-ujm_signals.parquet',
 'sgppipelinesa-silver/stage-01/Aaron-20080318-kdl_signals.parquet',
 'sgppipelinesa-silver/stage-01/Anniepoo-20140308-bft_signals.parquet',
 'sgppipelinesa-silver/stage-01/Anniepoo-20140308-cqj_signals.parquet',
 'sgppipelinesa-silver/stage-01/Anniepoo-20140308-fcp_signals.parquet',
 'sgppipelinesa-silver/stage-01/Anniepoo-20140308-hns_signals.parquet',
 'sgppipelinesa-silver/stage-01/Anniepoo-20140308-nky_signals.parquet',
 'sgppipelinesa-silver/stage-01/Coren-20141121-pxp_signals.parquet']

In [None]:
def compute_statistical_feats(subject_table, samples_per_win_size, samples_per_hop_size):
    """
    computes the statistical features of a subject's signal over
    sliding frames, using an in-memory duckdb database

    a frame starts at every row whose `rowId` is a multiple of
    `samples_per_hop_size` and covers that row plus the rows that
    follow it (same `subjectId`, ordered by `rowId`) up to a total
    of `samples_per_win_size` rows. Frames that run past the end of
    the signal simply contain fewer rows.

    args:
        subject_table - table (e.g. a pyarrow table) that exposes
        the columns `subjectId`, `rowId` and `signals`
        samples_per_win_size - number of rows (samples) per frame,
        must be at least 1
        samples_per_hop_size - distance in `rowId` between the
        starts of two successive frames, must be at least 1

    returns:
        a pyarrow table with one row per frame, ordered by the
        `rowId` of the frame start, holding `subjectId` followed by
        freq_kurt, freq_skew, freq_entropy, freq_mean, freq_median,
        freq_mode, freq_min, freq_max, freq_var, freq_stddev,
        freq_first_quart, freq_third_quart, freq_range and
        freq_inter_quart_range

    raises:
        ValueError - if the window or hop size is smaller than 1
    """
    win_size = int(samples_per_win_size)
    hop_size = int(samples_per_hop_size)
    if win_size < 1 or hop_size < 1:
        raise ValueError(
            "samples_per_win_size and samples_per_hop_size must both be >= 1, "
            f"got {samples_per_win_size!r} and {samples_per_hop_size!r}"
        )

    # (aggregate expression, output column) in the order of the output columns
    aggregates = [
        ("KURTOSIS(s.signals)", "freq_kurt"),
        ("SKEWNESS(s.signals)", "freq_skew"),
        ("ENTROPY(s.signals)", "freq_entropy"),
        ("AVG(s.signals)", "freq_mean"),
        ("MEDIAN(s.signals)", "freq_median"),
        ("MODE(s.signals)", "freq_mode"),
        ("MIN(s.signals)", "freq_min"),
        ("MAX(s.signals)", "freq_max"),
        ("VAR_SAMP(s.signals)", "freq_var"),
        ("STDDEV_SAMP(s.signals)", "freq_stddev"),
        ("QUANTILE_CONT(s.signals, 0.25)", "freq_first_quart"),
        ("QUANTILE_CONT(s.signals, 0.75)", "freq_third_quart"),
    ]
    aggregate_cols = ",\n                ".join(
        f"{expression} AS {alias}" for expression, alias in aggregates
    )
    feature_cols = ",\n            ".join(alias for _, alias in aggregates)

    # The hop filter may only be used to pick the frame *starts*. A WHERE
    # clause is applied before window functions are evaluated, so filtering
    # and windowing in the same SELECT would build every frame out of the
    # hop-decimated rows instead of the consecutive raw samples. Each frame
    # start is therefore joined back to the complete, position-numbered signal
    # and aggregated per frame, which also avoids computing a window for the
    # rows that are thrown away anyway.
    query = f"""
        WITH ordered AS (
            SELECT
                subjectId,
                rowId,
                signals,
                ROW_NUMBER() OVER(PARTITION BY subjectId ORDER BY rowId) AS row_pos
            FROM subject_table
        ),
        frame_starts AS (
            SELECT
                subjectId,
                rowId AS start_row_id,
                row_pos AS start_pos
            FROM ordered
            WHERE (rowId % {hop_size}) = 0
        ),
        frame_stats AS (
            SELECT
                f.subjectId AS subjectId,
                f.start_row_id AS start_row_id,
                {aggregate_cols}
            FROM frame_starts AS f
            INNER JOIN ordered AS s
                ON s.subjectId = f.subjectId
                AND s.row_pos >= f.start_pos
                AND s.row_pos < f.start_pos + {win_size}
            GROUP BY f.subjectId, f.start_row_id, f.start_pos
        )
        SELECT
            subjectId,
            {feature_cols},
            (freq_max - freq_min) AS freq_range,
            (freq_third_quart - freq_first_quart) AS freq_inter_quart_range
        FROM frame_stats
        ORDER BY start_row_id
    """

    # connect to an in-memory database and always release it again
    conn = duckdb.connect()
    try:
        # expose the table under an explicit name rather than relying on
        # duckdb discovering the python variable of the calling frame
        conn.register("subject_table", subject_table)

        # the arrow table is fully materialized before the connection closes
        return conn.sql(query).to_arrow_table()
    finally:
        conn.close()

# suffixes of the columns created from the values that extract_spectogam_stats
# returns, listed in the order in which this notebook unpacks those values
_SPECTROGRAM_STAT_SUFFIXES = (
    "mean",
    "median",
    "mode",
    "mode_cnt",
    "min",
    "max",
    "range",
    "var",
    "std",
    "first_quart",
    "third_quart",
    "inter_quart_range",
    "entropy",
    "kurt",
    "skew",
)


def _append_spectrogram_stats(subject_features, prefix, spectrogram):
    """
    appends one `<prefix>_<suffix>` column to `subject_features` for
    every statistic that extract_spectogam_stats returns for
    `spectrogram` and returns the resulting table
    """
    stats = tuple(extract_spectogam_stats(spectrogram))

    # keep the strictness of a plain tuple unpacking: a different number of
    # statistics must fail loudly instead of silently dropping columns
    if len(stats) != len(_SPECTROGRAM_STAT_SUFFIXES):
        raise ValueError(
            f"expected {len(_SPECTROGRAM_STAT_SUFFIXES)} statistics for "
            f"'{prefix}', got {len(stats)}"
        )

    for suffix, values in zip(_SPECTROGRAM_STAT_SUFFIXES, stats):
        subject_features = subject_features.append_column(f"{prefix}_{suffix}", [values])

    return subject_features


def compute_spectral_features(
    subject_features,
    x_signals,
    hertz,
    samples_per_win_size,
    samples_per_hop_size,
    n_frames,
    n_mels=90,
    n_mfcc=90,
):
    """
    computes the librosa based features of a subject's signal and
    appends them as new columns to the subject's feature table

    every feature is computed frame by frame with a frame size of
    `samples_per_win_size` and a hop of `samples_per_hop_size`, and
    only its first `n_frames` frames are kept so that its length
    can match the rows already present in `subject_features`

    args:
        subject_features - pyarrow table the columns are appended to
        x_signals - the subject's audio signal passed to librosa as `y`
        hertz - sampling rate of the signal
        samples_per_win_size - frame length / fft window size in samples
        samples_per_hop_size - number of samples between successive frames
        n_frames - number of frames to keep of every feature
        n_mels - number of mel bands of the mel spectrogram
        n_mfcc - number of mfccs to compute

    returns:
        a new pyarrow table with, in this order, the columns zcr,
        poly_feat_1, poly_feat_2, spec_cent, spec_bw, spec_flat and
        spec_roll, followed by the 15 statistics columns of each of
        mel_spec, mel_spec_db, mfcc and spec_cont

    raises:
        ValueError - if extract_spectogam_stats does not return
        exactly 15 values
    """
    stft_kwargs = {"n_fft": samples_per_win_size, "hop_length": samples_per_hop_size}

    def first_frames(feature):
        # librosa can return more frames than there are statistical rows
        return feature[:, :n_frames]

    zcr = first_frames(librosa.feature.zero_crossing_rate(
        y=x_signals,
        frame_length=samples_per_win_size,
        hop_length=samples_per_hop_size
    ))
    poly_feats = first_frames(librosa.feature.poly_features(y=x_signals, sr=hertz, **stft_kwargs))

    # multi-row features; these get summarized by extract_spectogam_stats
    mel_spec = first_frames(librosa.feature.melspectrogram(
        y=x_signals,
        sr=hertz,
        n_mels=n_mels,
        **stft_kwargs
    ))
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    mfcc = first_frames(librosa.feature.mfcc(
        y=x_signals,
        sr=hertz,
        n_mfcc=n_mfcc,
        **stft_kwargs
    ))
    spec_cont = first_frames(librosa.feature.spectral_contrast(y=x_signals, sr=hertz, **stft_kwargs))

    # single-row features
    spec_cent = first_frames(librosa.feature.spectral_centroid(y=x_signals, sr=hertz, **stft_kwargs))
    spec_bw = first_frames(librosa.feature.spectral_bandwidth(y=x_signals, sr=hertz, **stft_kwargs))
    spec_flat = first_frames(librosa.feature.spectral_flatness(y=x_signals, **stft_kwargs))
    spec_roll = first_frames(librosa.feature.spectral_rolloff(y=x_signals, sr=hertz, **stft_kwargs))

    # add the features that already hold one value per frame as columns
    per_frame_columns = [
        ("zcr", zcr.reshape(-1)),
        ("poly_feat_1", poly_feats[0, :]),
        ("poly_feat_2", poly_feats[1, :]),
        ("spec_cent", spec_cent.reshape(-1)),
        ("spec_bw", spec_bw.reshape(-1)),
        ("spec_flat", spec_flat.reshape(-1)),
        ("spec_roll", spec_roll.reshape(-1)),
    ]
    for column_name, values in per_frame_columns:
        subject_features = subject_features.append_column(column_name, [values])

    # add the statistics of the multi-row features as columns
    spectrograms = [
        ("mel_spec", mel_spec),
        ("mel_spec_db", mel_spec_db),
        ("mfcc", mfcc),
        ("spec_cont", spec_cont),
    ]
    for prefix, spectrogram in spectrograms:
        subject_features = _append_spectrogram_stats(subject_features, prefix, spectrogram)

    return subject_features


def test(signal_file_paths, 
    data_dir, 
    hertz: int=16000, 
    window_time: int=3, 
    hop_time: int=1, 
    storage_account_name: str=None, 
    credential: str=None,
):
    """
    concurrently extracts the features of each signal
    the first half of feature extraction involves using
    sql for computing statistical features of the subjects
    signals

    the second half involves using librosa for computing
    for instance spectral features of the sujects signals
    as there is not supported function to compute audio
    specific signal features like spectograms, chromagrams,
    tempogram, etc.
    """
    # make directory where subject features wil lbe saved
    os.makedirs(data_dir, exist_ok=True)
    
    # we calculate the window size of each segment or the
    # amount of samples it has to have based on the frequency
    samples_per_win_size = int(window_time * hertz)
    samples_per_hop_size = int(hop_time * hertz)

    if storage_account_name and credential:
        handler = pa_adl.AccountHandler.from_account_name(storage_account_name, credential=credential)
        fs = pa.fs.PyFileSystem(handler)

    def helper(signal_file_path):
        try:
            # extract subject name from file path
            subject_id = signal_file_path.split("/")[-1]
            subject_id = subject_id.replace("_signals.parquet", "")

            # read subjects table and it ssignals
            if storage_account_name and credential:
                # cloud
                subject_table = pq.read_table(signal_file_path, filesystem=fs)
            else:
                # local
                subject_table = pq.read_table(signal_file_path)
                
            x_signals = subject_table["signals"].to_numpy()

            # calculate statistical features
            subject_features = compute_statistical_feats(
                subject_table, 
                samples_per_win_size, 
                samples_per_hop_size
            )

            # get the number of frames used using a window of 48000
            # and hop length of 16000
            n_frames = subject_features.shape[0]

            # compute spectographic and chromagraphic features
            subject_features = compute_spectral_features(
                subject_features, 
                x_signals, 
                hertz, 
                samples_per_win_size, 
                samples_per_hop_size, 
                n_frames
            )
            
            save_path = os.path.join(data_dir, f"{subject_id}_signals.parquet").replace("\\", "/")
            if storage_account_name and credential:
                pq.write_table(subject_features, save_path,filesystem=fs)
            else:
                # write table to a 2nd sub stage in silver staging layer 
                pq.write_table(subject_features, save_path)

        except Exception as e:
            print(f"error {e} occured on subject id: {subject_id}")
            
            # return the last subjects features
            return subject_id, e

    # concurrently calculate statistical features and spectral
    # features of each subject using DuckDB SQL and librosa
    with ThreadPoolExecutor(max_workers=5) as exe:
        subjects_features_list = list(exe.map(helper, signal_file_paths))

    return subjects_features_list

In [109]:
# # local
# SAVE_DIR = SILVER_DATA_DIR.format(
#     DATA_DIR=DATA_DIR,
#     FOLDER_NAME=SILVER_FOLDER_NAME,
#     SUB_FOLDER_NAME="stage-02"
# )
# SAVE_DIR

# cloud: fill the path template's placeholders so the features
# land in the "stage-02" sub folder of the silver folder
SAVE_DIR = SILVER_DATA_DIR.format_map({
    "FOLDER_NAME": SILVER_FOLDER_NAME,
    "SUB_FOLDER_NAME": "stage-02",
})
SAVE_DIR

'sgppipelinesa-silver/stage-02'

In [110]:
# run the feature extraction for every subject in cloud mode; the result
# holds one entry per signal file path
subjects_features = test(
    signal_file_paths,
    SAVE_DIR,
    storage_account_name=storage_account_name,
    credential=credential,
)