In [1]:
import os
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.compute import kurtosis
import numpy as np
import librosa
import duckdb

from concurrent.futures import ThreadPoolExecutor

In [2]:
# Root data folder, given relative to the notebook's working directory.
# The trailing slash is part of the value; the silver path below is joined onto it.
DATA_DIR = "../include/data/"

In [3]:
# Cloud variant, kept for reference only. It relies on a `URL` template that is
# not defined in this notebook, so it cannot be enabled as-is.
#   SILVER_FOLDER_NAME = "sgppipelinesa-silver"
#   SUB_FOLDER_NAME = "stage-01"
#   SILVER_DATA_DIR = os.path.join(URL.format(FOLDER_NAME=SILVER_FOLDER_NAME), SUB_FOLDER_NAME)

# Local variant: <DATA_DIR>/<silver folder>/<stage sub-folder>.
SILVER_FOLDER_NAME = "silver/"
SUB_FOLDER_NAME = "stage-01/"
SILVER_DATA_DIR = os.path.join(DATA_DIR, SILVER_FOLDER_NAME, SUB_FOLDER_NAME)

# Bare expression so the cell displays the resolved directory.
SILVER_DATA_DIR

'../include/data/silver/stage-01/'

In [4]:
# Load one subject's signal file from the silver stage into a pyarrow Table.
# The SQL cells further down refer to `subject_table` by name, so renaming this
# variable would also require updating those queries.
subject_table = pq.read_table(os.path.join(SILVER_DATA_DIR, "_caustic_-20170306-smy_signals.parquet"))
# Bare expression so the cell displays the table schema and a data preview.
subject_table

pyarrow.Table
signals: float
subjectId: string
rowId: int32
----
signals: [[0.0009460449,0.0008239746,0.00076293945,0.00064086914,0.00048828125,...,-0.077941895,-0.08255005,-0.0909729,-0.098846436,-0.10534668],[-0.10913086,-0.11035156,-0.10748291,-0.10131836,-0.08596802,...,-0.0005187988,0.001739502,-0.00030517578,0.00061035156,0.003692627],...,[-0.021118164,-0.0065612793,0.0034179688,0.004486084,-0.011230469,...,0.026397705,0.004425049,-0.0184021,-0.04437256,-0.06384277],[-0.067718506,-0.056793213,-0.04071045,-0.023620605,0.0061950684,...,0.008422852,0.009674072,0.010498047,0.011352539,0.012268066]]
subjectId: [["_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy",...,"_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy"],["_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy

In [5]:
# Open a DuckDB connection backed by a transient in-memory database
# (nothing is written to disk; tables vanish when the connection closes).
conn = duckdb.connect(database=":memory:")

In [6]:
# Row count of the loaded signal table.
# `subject_table` is presumably resolved by DuckDB from the Python variable of
# the same name (replacement scan), since no such table is created in SQL here.
conn.sql("""
    SELECT COUNT(*) FROM subject_table
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       553472 │
└──────────────┘

In [7]:
# Windowing parameters: rate in hertz, window length and hop length
# (the durations are multiplied by the rate below to obtain sample counts).
hertz, window_time, hop_time = 16_000, 3, 1

# Convert the window and hop durations into whole numbers of samples:
# number of samples = duration * rate.
samples_per_win_size, samples_per_hop_size = (
    int(duration * hertz) for duration in (window_time, hop_time)
)

In [22]:
# Build the `subject_features` temp table: one row per hop (every
# `samples_per_hop_size`-th rowId), carrying summary statistics of the
# `samples_per_win_size` rows that start at that row within the same subject.
#
# Why the CTE: SQL applies WHERE *before* window functions are evaluated. With
# the hop filter in the same SELECT as the OVER clauses, the frames were built
# over the already-filtered hop rows only, so each "window" spanned the next
# hop rows instead of the next `samples_per_win_size` consecutive samples.
# The statistics are therefore computed over the unfiltered table first, and
# the hop filter is applied afterwards in the outer query.
#
# Notes:
# - All twelve statistics share one frame, declared once as a named WINDOW.
# - Frames that start near the end of a subject hold fewer than
#   `samples_per_win_size` rows; they are not filtered out here.
# - The windowed aggregates are now evaluated for every input row before the
#   hop filter discards most of them, so this cell is expected to be heavier
#   than the previous (incorrect) version.
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE subject_features AS (
        WITH windowed AS (
            SELECT
                signals,
                subjectId,
                rowId,
                KURTOSIS(signals) OVER sample_window AS freq_kurt,
                SKEWNESS(signals) OVER sample_window AS freq_skew,
                ENTROPY(signals) OVER sample_window AS freq_entropy,
                AVG(signals) OVER sample_window AS freq_mean,
                MEDIAN(signals) OVER sample_window AS freq_median,
                MODE(signals) OVER sample_window AS freq_mode,
                MIN(signals) OVER sample_window AS freq_min,
                MAX(signals) OVER sample_window AS freq_max,
                VAR_SAMP(signals) OVER sample_window AS freq_var,
                STDDEV_SAMP(signals) OVER sample_window AS freq_stddev,
                QUANTILE_CONT(signals, 0.25) OVER sample_window AS freq_first_quart,
                QUANTILE_CONT(signals, 0.75) OVER sample_window AS freq_third_quart
            FROM subject_table
            WINDOW sample_window AS (
                PARTITION BY subjectId
                ORDER BY rowId
                ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING
            )
        )
        SELECT *
        FROM windowed
        WHERE (rowId % {samples_per_hop_size}) = 0
        ORDER BY rowId
    )
""")

In [23]:
# Display the feature table created by the preceding CREATE ... subject_features cell.
conn.sql("""
    SELECT * FROM subject_features
""")

┌───────────────┬────────────────────────┬────────┬─────────────────────┬──────────────────────┬────────────────────┬────────────────────────┬───────────────┬───────────────┬──────────────┬─────────────┬────────────────────────┬──────────────────────┬──────────────────┬──────────────────┐
│    signals    │       subjectId        │ rowId  │      freq_kurt      │      freq_skew       │    freq_entropy    │       freq_mean        │  freq_median  │   freq_mode   │   freq_min   │  freq_max   │        freq_var        │     freq_stddev      │ freq_first_quart │ freq_third_quart │
│     float     │        varchar         │ int32  │       double        │        double        │       double       │         double         │     float     │     float     │    float     │    float    │         double         │        double        │      float       │      float       │
├───────────────┼────────────────────────┼────────┼─────────────────────┼──────────────────────┼────────────────────┼─────────────

In [34]:
# Row count of the source signal table (same query as the earlier count cell).
# NOTE(review): this repeats the count on `subject_table`; if the intent was to
# check how many feature rows were produced, query `subject_features` instead — confirm.
conn.sql("""
    SELECT COUNT(*) FROM subject_table
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       553472 │
└──────────────┘