In [1]:
import os
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow.compute import kurtosis
import numpy as np
import librosa
import duckdb

from concurrent.futures import ThreadPoolExecutor

In [2]:
# base directory of the data files, given relative to the working directory;
# the silver/stage folders used below are joined onto it
DATA_DIR = "../include/data/"

In [3]:
# # cloud
# SILVER_FOLDER_NAME = "sgppipelinesa-silver"
# SUB_FOLDER_NAME = "stage-01"
# SILVER_DATA_DIR = os.path.join(URL.format(FOLDER_NAME=SILVER_FOLDER_NAME), SUB_FOLDER_NAME)
# SILVER_DATA_DIR

# local: build "<DATA_DIR>/silver/stage-01/" in a single join call
SILVER_FOLDER_NAME = "silver/"
SUB_FOLDER_NAME = "stage-01/"
SILVER_DATA_DIR = os.path.join(DATA_DIR, SILVER_FOLDER_NAME, SUB_FOLDER_NAME)
SILVER_DATA_DIR

'../include/data/silver/stage-01/'

In [4]:
# read one subject's parquet file from the silver stage-01 folder
# into a pyarrow Table (columns: signals, subjectId, rowId)
subject_table = pq.read_table(
    os.path.join(SILVER_DATA_DIR, "_caustic_-20170306-smy_signals.parquet")
)
subject_table

pyarrow.Table
signals: float
subjectId: string
rowId: int32
----
signals: [[0.0009460449,0.0008239746,0.00076293945,0.00064086914,0.00048828125,...,-0.077941895,-0.08255005,-0.0909729,-0.098846436,-0.10534668],[-0.10913086,-0.11035156,-0.10748291,-0.10131836,-0.08596802,...,-0.0005187988,0.001739502,-0.00030517578,0.00061035156,0.003692627],...,[-0.021118164,-0.0065612793,0.0034179688,0.004486084,-0.011230469,...,0.026397705,0.004425049,-0.0184021,-0.04437256,-0.06384277],[-0.067718506,-0.056793213,-0.04071045,-0.023620605,0.0061950684,...,0.008422852,0.009674072,0.010498047,0.011352539,0.012268066]]
subjectId: [["_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy",...,"_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy"],["_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy","_caustic_-20170306-smy

In [5]:
# open a DuckDB connection to an in-memory database (no database file is
# passed); every SQL cell below runs through this connection
conn = duckdb.connect()

In [6]:
# sanity check: number of rows (samples) in the loaded subject table.
# The query refers to the Python variable `subject_table` by name.
conn.sql("""
    SELECT COUNT(*) FROM subject_table
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       553472 │
└──────────────┘

In [7]:
# sampling rate of the signal (passed as `sr` to librosa further down)
hertz = 16000
# window length and hop length, both in seconds
window_time = 3
hop_time = 1

# translate both durations into a number of samples:
# samples = seconds * samples-per-second
samples_per_win_size, samples_per_hop_size = (
    int(seconds * hertz) for seconds in (window_time, hop_time)
)

In [22]:
# Build one row of summary statistics per analysis window.
#
# Every window starts at a row whose rowId is a multiple of the hop size and
# spans the next `samples_per_win_size` samples of the same subject (the
# frame is cut short at the end of the partition, so the last windows hold
# fewer samples).
#
# The hop filter MUST run after the window functions are evaluated. A WHERE
# clause is applied before them, which would leave only the hop rows in each
# partition and make every "window" aggregate over those few rows instead of
# over the raw samples. QUALIFY filters the result of the window functions,
# so the frames are built over the complete signal.
#
# The frame definition is shared through a named WINDOW so all statistics
# are guaranteed to use exactly the same frame.
#
# NOTE(review): assumes rowId is a contiguous 0-based sample index per
# subjectId, so that N FOLLOWING rows equal N following samples — confirm.
conn.sql(f"""
    CREATE OR REPLACE TEMPORARY TABLE subject_features AS (
        SELECT
            signals,
            subjectId,
            rowId,
            KURTOSIS(signals) OVER win AS freq_kurt,
            SKEWNESS(signals) OVER win AS freq_skew,
            ENTROPY(signals) OVER win AS freq_entropy,
            AVG(signals) OVER win AS freq_mean,
            MEDIAN(signals) OVER win AS freq_median,
            MODE(signals) OVER win AS freq_mode,
            MIN(signals) OVER win AS freq_min,
            MAX(signals) OVER win AS freq_max,
            VAR_SAMP(signals) OVER win AS freq_var,
            STDDEV_SAMP(signals) OVER win AS freq_stddev,
            QUANTILE_CONT(signals, 0.25) OVER win AS freq_first_quart,
            QUANTILE_CONT(signals, 0.75) OVER win AS freq_third_quart
        FROM subject_table
        WINDOW win AS (
            PARTITION BY subjectId
            ORDER BY rowId
            ROWS BETWEEN CURRENT ROW AND {samples_per_win_size - 1} FOLLOWING
        )
        QUALIFY (rowId % {samples_per_hop_size}) = 0
        ORDER BY rowId
    )
""")

In [24]:
# Add the two derived spread measures to the feature table:
#   freq_range             = max - min
#   freq_inter_quart_range = third quartile - first quartile (Q3 - Q1),
#                            so the value is non-negative
#
# NOTE(review): the table is rebuilt from itself (`SELECT *` plus two new
# columns), so this cell is meant to run once after the cell that creates
# subject_features; re-running it alone selects the derived columns twice.
conn.sql("""
    CREATE OR REPLACE TEMPORARY TABLE subject_features AS (
        SELECT 
            *,
            (freq_max - freq_min) AS freq_range,
            (freq_third_quart - freq_first_quart) AS freq_inter_quart_range
        FROM subject_features
    )
""")

In [25]:
# preview the feature table: the original columns followed by the
# freq_* statistics and the two derived range columns
conn.sql("""
    SELECT * FROM subject_features
""")

┌───────────────┬────────────────────────┬────────┬─────────────────────┬──────────────────────┬────────────────────┬────────────────────────┬───────────────┬───────────────┬──────────────┬─────────────┬────────────────────────┬──────────────────────┬──────────────────┬──────────────────┬─────────────┬────────────────────────┐
│    signals    │       subjectId        │ rowId  │      freq_kurt      │      freq_skew       │    freq_entropy    │       freq_mean        │  freq_median  │   freq_mode   │   freq_min   │  freq_max   │        freq_var        │     freq_stddev      │ freq_first_quart │ freq_third_quart │ freq_range  │ freq_inter_quart_range │
│     float     │        varchar         │ int32  │       double        │        double        │       double       │         double         │     float     │     float     │    float     │    float    │         double         │        double        │      float       │      float       │    float    │         float          │
├────────────

#### 544000 + a window of 48000 is 592000, which is greater than 553472, so the last window only covers the indices 544000 to 553471, which is just 9472 rows of data

In [34]:
# total number of samples in the subject table, shown again to compare
# against the start index of the last window
conn.sql("""
    SELECT COUNT(*) FROM subject_table
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       553472 │
└──────────────┘

In [31]:
# number of rows in the feature table, i.e. how many windows were produced;
# the query returns a single row with a single column, so take that scalar
frames = conn.sql("""
    SELECT COUNT(rowId) FROM subject_features
""").fetchone()[0]

In [30]:
# `frames` is already the scalar count extracted in the previous cell, so it
# is displayed directly (indexing an int would raise a TypeError)
frames

35

In [33]:
# convert the number of frames into seconds, using the sampling rate and
# the hop size (in samples) as the step between consecutive frames
time = librosa.frames_to_time(
    frames,
    sr=hertz,
    hop_length=samples_per_hop_size,
)
time

np.float64(35.0)

In [41]:
# view the `signals` column as a 1-D NumPy array (float32 in the saved output)
subject_table["signals"].to_numpy()

array([0.00094604, 0.00082397, 0.00076294, ..., 0.01049805, 0.01135254,
       0.01226807], shape=(553472,), dtype=float32)

In [54]:
# raw signal of the subject as a 1-D NumPy array; this is the `y` input of
# the librosa feature extractors below
x_signals = subject_table["signals"].to_numpy()
x_signals

array([0.00094604, 0.00082397, 0.00076294, ..., 0.01049805, 0.01135254,
       0.01226807], shape=(553472,), dtype=float32)

In [55]:
# calculate other features
zcr = librosa.feature.zero_crossing_rate(y=x_signals, frame_length=samples_per_win_size, hop_length=samples_per_hop_size)
zcr

array([[0.08729167, 0.15414583, 0.18664583, 0.17652083, 0.1423125 ,
        0.17235417, 0.14370833, 0.13979167, 0.08329167, 0.11022917,
        0.15575   , 0.156625  , 0.15625   , 0.14270833, 0.14977083,
        0.15375   , 0.15527083, 0.1721875 , 0.1454375 , 0.13004167,
        0.12695833, 0.12214583, 0.10583333, 0.1366875 , 0.1511875 ,
        0.17816667, 0.1685    , 0.17489583, 0.16841667, 0.13808333,
        0.13670833, 0.18285417, 0.185     , 0.17108333, 0.07641667]])

In [61]:
# number of frames in the zero-crossing-rate result (length of its second axis)
zcr.shape[1]

35

In [64]:
# flatten the (1, n_frames) result into a 1-D vector with one value per frame
new_zcr = zcr.reshape(-1)
new_zcr.shape

(35,)

In [None]:
# (disabled) absolute difference between the number of zcr frames and `time`;
# the saved output shows it is 0 for this subject, so nothing needs trimming
# zcr_n_values_to_rem = np.abs(zcr.shape[1] - time)
# zcr_n_values_to_rem.astype(int)

np.int64(0)

In [None]:
# (disabled) trimming by the difference computed above: when that difference
# is 0 the slice `[:-0]` is `[:0]`, which yields an empty array (see the
# saved output) instead of the full zcr vector
# # get slice of those in range with time only
# new_zcr = zcr.reshape(-1)[:-zcr_n_values_to_rem.astype(int)]
# new_zcr

array([], dtype=float64)

In [65]:
# mel spectrogram with 90 mel bands, framed with the same window (n_fft)
# and hop sizes as the other features; result is (n_mels, n_frames)
mel_spect = librosa.feature.melspectrogram(
    y=x_signals,
    sr=hertz,
    n_fft=samples_per_win_size,
    hop_length=samples_per_hop_size,
    n_mels=90,
)
mel_spect

array([[9.3418999e+00, 1.5313477e+01, 1.3804149e+01, ..., 1.0596875e+01,
        9.3281775e+00, 1.1975436e+01],
       [2.7116418e+00, 6.3942085e+01, 7.0263359e+01, ..., 5.0933604e+00,
        7.9568477e+00, 3.7885796e+01],
       [1.0996913e+03, 1.9044412e+03, 1.7637432e+03, ..., 3.0651685e+02,
        9.2778308e+02, 1.0961296e+03],
       ...,
       [1.1292878e+00, 2.6480398e+00, 2.3072226e+00, ..., 1.5473083e+00,
        6.4120537e-01, 3.7659831e-02],
       [9.0728039e-01, 1.9571239e+00, 1.7471429e+00, ..., 1.3136760e+00,
        8.2490325e-01, 6.0928982e-02],
       [3.8142192e-01, 6.8643939e-01, 3.5092783e-01, ..., 5.8115226e-01,
        3.3651710e-01, 2.4943130e-02]], shape=(90, 35), dtype=float32)

In [66]:
# convert the mel spectrogram to decibels, using the maximum of the
# spectrogram (ref=np.max) as the reference value
mel_spect_db = librosa.power_to_db(mel_spect, ref=np.max)
mel_spect_db

array([[-29.448627 , -27.302242 , -27.752884 , ..., -28.901203 ,
        -29.455013 , -28.370068 ],
       [-34.820656 , -21.095112 , -20.685692 , ..., -32.082935 ,
        -30.14557  , -23.368217 ],
       [ -8.7402725,  -6.3553047,  -6.6886253, ..., -14.288439 ,
         -9.478516 ,  -8.754362 ],
       ...,
       [-38.624935 , -34.923737 , -35.522087 , ..., -37.257214 ,
        -41.083008 , -53.394196 ],
       [-39.575565 , -36.236797 , -36.7297   , ..., -37.968098 ,
        -39.98895  , -51.30474  ],
       [-43.338924 , -40.78696  , -43.7008   , ..., -41.510082 ,
        -43.882908 , -55.18347  ]], shape=(90, 35), dtype=float32)

In [68]:
# average over the mel bands (axis 0) -> one mean dB value per frame
mean_mel = mel_spect_db.mean(axis=0)
mean_mel.shape

(35,)

In [69]:
# variance over the mel bands (axis 0) -> one dB variance value per frame
variance_mel = mel_spect_db.var(axis=0)
variance_mel.shape

(35,)

In [71]:
# spectral centroid per frame, framed with the same window (n_fft) and hop
# sizes as the other features; result is (1, n_frames)
spect_cent = librosa.feature.spectral_centroid(
    y=x_signals,
    sr=hertz,
    n_fft=samples_per_win_size,
    hop_length=samples_per_hop_size,
)
spect_cent.shape

(1, 35)

In [74]:
# flatten the (1, n_frames) centroid array into a 1-D vector, one value per frame
new_spect_cent = spect_cent.reshape(-1)
new_spect_cent.shape

(35,)