In [1]:
import re
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import io
import librosa

import pyspark.sql.functions as F
import pyspark

from pyspark.sql import SparkSession, Window
from pyspark.conf import SparkConf
# from pyspark.context import SparkContext
from pyspark.sql.types import StringType, ArrayType, StructField, StructType, FloatType, DoubleType, IntegerType

from concurrent.futures import ThreadPoolExecutor

from utilities.loaders import save_data_splits

%load_ext autoreload
%autoreload 2

In [3]:
# `sparksession is none: typeerror: 'javapackage' object is not 
# callable` can be raised if the pyspark version being used is 4.0.0
# which is not compatible to a python 3.11.8 version
spark = SparkSession.builder.appName("app")\
    .config("spark.driver.memory", "16g")\
    .config("spark.executor.memory", "4g")\
    .config("spark.executor.cores", "2")\
    .config("spark.executor.instances", "3")\
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", "100")\
    .getOrCreate()

In [4]:
# # cloud
# URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net/"
# SILVER_FOLDER_NAME = "sgppipelinesa-silver"
# SUB_FOLDER_NAME = "stage-01"
# SILVER_DATA_PATH = os.path.join(URL.format(FOLDER_NAME=SILVER_FOLDER_NAME), SUB_FOLDER_NAME)
# SILVER_DATA_PATH

# local
DATA_DIR = "../include/data/"
SILVER_FOLDER_NAME = "silver/"
SUB_FOLDER_NAME = "stage-01"
SILVER_DATA_DIR = os.path.join(DATA_DIR, os.path.join(SILVER_FOLDER_NAME, SUB_FOLDER_NAME)).replace("\\", "/")
SILVER_DATA_DIR

'../include/data/silver/stage-01'

In [5]:
# folder_infos = dbutils.fs.ls(BRONZE_DATA_PATH)

In [19]:
def read_signal_files(SPLIT_FOLDER):

    # only include the parquet files without labels
    signal_files = [
        os.path.join(SPLIT_FOLDER, SIGNAL_DF_FILE).replace('\\', '/') 
        for SIGNAL_DF_FILE in os.listdir(SPLIT_FOLDER)
        # if file path has labels or .csv in it do not include in list
        if (not "labels" in SIGNAL_DF_FILE) and (not ".csv" in SIGNAL_DF_FILE)
    ]

    # print(signal_files)

    signals_df = spark.read.format("parquet").load(signal_files)

    return signals_df

In [None]:
signals_df = read_signal_files(SILVER_DATA_DIR)

In [21]:
signals_df.show()

+-------------+--------------------+-----+
|      signals|           subjectId|rowId|
+-------------+--------------------+-----+
| -0.011077881|23yipikaye-201008...|    0|
| -0.010467529|23yipikaye-201008...|    1|
|  -0.00982666|23yipikaye-201008...|    2|
| -0.010620117|23yipikaye-201008...|    3|
| -0.009674072|23yipikaye-201008...|    4|
| -0.006713867|23yipikaye-201008...|    5|
|-0.0036621094|23yipikaye-201008...|    6|
|-0.0025634766|23yipikaye-201008...|    7|
|-0.0027770996|23yipikaye-201008...|    8|
|-0.0034484863|23yipikaye-201008...|    9|
|-0.0015869141|23yipikaye-201008...|   10|
| 6.4086914E-4|23yipikaye-201008...|   11|
| 0.0025024414|23yipikaye-201008...|   12|
| 0.0030822754|23yipikaye-201008...|   13|
| 0.0043945312|23yipikaye-201008...|   14|
| 0.0038146973|23yipikaye-201008...|   15|
| 0.0038452148|23yipikaye-201008...|   16|
|  0.005554199|23yipikaye-201008...|   17|
| 0.0049438477|23yipikaye-201008...|   18|
| 0.0037231445|23yipikaye-201008...|   19|
+----------

In [22]:
train_signals_df = spark.sql("""
    SELECT 
        s.signals AS signals, 
        s.subjectId AS subjectId, 
        s.rowId AS rowId
    FROM {train_labels_df} l
    LEFT JOIN {signals_df} s
    ON l.subjectId = s.subjectId
""", signals_df=signals_df, train_labels_df=train_labels_df)

In [23]:
train_signals_df.select("subjectId").distinct().show()

+--------------------+
|           subjectId|
+--------------------+
|  Aaron-20080318-kdl|
| 1337ad-20170321-ajg|
|Anniepoo-20140308...|
|23yipikaye-201008...|
|  Coren-20141121-pxp|
| 1337ad-20170321-tkg|
|Anniepoo-20140308...|
|Anniepoo-20140308...|
|   1028-20100710-hne|
+--------------------+



In [24]:
subject_signals_flat_list = train_signals_df.where(F.col("subjectId") == "1028-20100710-hne").select("signals").rdd.map(lambda r: r[0]).collect()

In [None]:
fig = plt.figure(figsize=(17, 5))
librosa.display.waveshow(np.array(subject_signals_flat_list), alpha=0.5, color="#8442f5")
plt.show()

In [25]:
val_signals_df = spark.sql("""
    SELECT 
        s.signals AS signals, 
        s.subjectId AS subjectId, 
        s.rowId AS rowId
    FROM {val_labels_df} l
    LEFT JOIN {signals_df} s
    ON l.subjectId = s.subjectId
""", signals_df=signals_df, val_labels_df=val_labels_df)

In [26]:
val_signals_df.select("subjectId").distinct().show()

+--------------------+
|           subjectId|
+--------------------+
| 1snoke-20120412-hge|
|Anniepoo-20140308...|
+--------------------+



In [27]:
test_signals_df = spark.sql("""
    SELECT 
        s.signals AS signals, 
        s.subjectId AS subjectId, 
        s.rowId AS rowId
    FROM {test_labels_df} l
    LEFT JOIN {signals_df} s
    ON l.subjectId = s.subjectId
""", signals_df=signals_df, test_labels_df=test_labels_df)

In [28]:
test_signals_df.select("subjectId").distinct().show()

+--------------------+
|           subjectId|
+--------------------+
|Anniepoo-20140308...|
+--------------------+



In [None]:
# @F.pandas_udf(returnType=FloatType(), functionType=F.PandasUDFType.GROUPED_AGG)
# def get_peak_freq(segment: pd.Series):
#     # calculate frequency domain features
#     # get the spectrogram by calculating short time fourier transform
#     spectrogram = np.abs(librosa.stft(segment))
#     # print(f"spectrogram shape: {spectrogram.shape}")

#     # Get the frequencies corresponding to the spectrogram bins
#     frequencies = librosa.fft_frequencies(sr=16000)
#     # print(f"frequencies shape: {frequencies.shape}")

#     # Find the frequency bin with the highest average energy
#     peak_frequency_bin = np.argmax(np.mean(spectrogram, axis=1))

#     # Get the peak frequency in Hz
#     # calculate also peak frequency
#     # I think dito na gagamit ng fast fourier transform
#     # to obtain the frequency, or use some sort of function
#     # to convert the raw audio signals into a spectogram
#     peak_frequency = frequencies[peak_frequency_bin]

#     return peak_frequency

def extract_features(
    signals_df: pyspark.sql.DataFrame,
    # dataset: list[tuple[str, pyspark.sql.DataFrame]],
    hertz: int=16000,
    window_time: int=3,
    hop_time: int=1):
    """
    extracts the features from each segment of an audio signal

    args:
        dataset - 
        hertz - number of samples per second
        window_time - number of seconds of the given window to consider
        e.g. if number of seconds is 3 and hertz is 16000 or 16000
        samples/rows per second then the window size we will consider
        is 16000 * 3 or 48000
        hop_time - seconds
    """
    # we calculate the window size of each segment or the
    # amount of samples it has to have based on the frequency
    samples_per_win_size = int(window_time * hertz)
    samples_per_hop_size = int(hop_time * hertz)
    # print(f"samples per window size: {samples_per_win_size}")
    # print(f"samples per hop size: {samples_per_hop_size}\n")

    
    feat_window = Window.partitionBy("subjectId").orderBy("rowId").rowsBetween(Window.currentRow, samples_per_win_size - 1)
    
    signals_df = signals_df.withColumn("freq_skew", F.skewness("signals").over(feat_window))
    signals_df = signals_df.withColumn("freq_kurt", F.kurtosis("signals").over(feat_window))
    
    signals_df = signals_df.withColumn("freq_mean", F.mean("signals").over(feat_window))
    # signals_df = signals_df.withColumn("freq_median", F.median("signals").over(feat_window))
    # median over window function is not supported so we can use 
    signals_df = signals_df.withColumn("freq_median", F.percentile("signals", 0.5).over(feat_window))
    signals_df = signals_df.withColumn("freq_mode", F.mode("signals").over(feat_window))
    
    signals_df = signals_df.withColumn("freq_min", F.min("signals").over(feat_window))
    signals_df = signals_df.withColumn("freq_max", F.max("signals").over(feat_window))
    signals_df = signals_df.withColumn("freq_range", F.col("freq_max") - F.col("freq_min"))
    signals_df = signals_df.withColumn("freq_var", F.variance("signals").over(feat_window))
    signals_df = signals_df.withColumn("freq_std", F.stddev("signals").over(feat_window))
    
    signals_df = signals_df.withColumn("freq_first_quart", F.percentile("signals", 0.25).over(feat_window))
    signals_df = signals_df.withColumn("freq_third_quart", F.percentile("signals", 0.75).over(feat_window))
    signals_df = signals_df.withColumn("freq_inter_quart_range", F.col("freq_first_quart") - F.col("freq_third_quart"))

    # signals_df = signals_df.withColumn("freq_peak", get_peak_freq(F.col("signals")).over(feat_window))
    
    # an implementation of the only including windows after a certain
    # hop size, since we cannot do it directly using spark we can 
    # filter out the rows of windows that have not yet made the 
    # appropriate hop size using filtering 
    signals_df = signals_df.where((F.col("rowId") % samples_per_hop_size) == 0)

    return signals_df

In [30]:
train_signals_df.cache()
val_signals_df.cache()
test_signals_df.cache()

DataFrame[signals: float, subjectId: string, rowId: int]

In [31]:
train_signals_df.select("signals").show()

+------------+
|     signals|
+------------+
|1.5258789E-4|
|1.5258789E-4|
|9.1552734E-5|
|9.1552734E-5|
|6.1035156E-5|
|9.1552734E-5|
|9.1552734E-5|
|9.1552734E-5|
|9.1552734E-5|
|6.1035156E-5|
|3.0517578E-5|
|3.0517578E-5|
|6.1035156E-5|
|3.0517578E-5|
|3.0517578E-5|
|         0.0|
|         0.0|
|3.0517578E-5|
|3.0517578E-5|
|6.1035156E-5|
+------------+
only showing top 20 rows



In [32]:
train_signals_df = extract_features(signals_df=train_signals_df)

In [33]:
train_signals_df

DataFrame[signals: float, subjectId: string, rowId: int, freq_skew: double, freq_kurt: double, freq_mean: double, freq_median: double, freq_mode: float, freq_min: float, freq_max: float, freq_range: float, freq_var: double, freq_std: double, freq_first_quart: double, freq_third_quart: double, freq_inter_quart_range: double]

In [35]:
train_signals_df.select("rowId").limit(10).show()

+------+
| rowId|
+------+
|     0|
| 16000|
| 32000|
| 48000|
| 64000|
| 80000|
| 96000|
|112000|
|128000|
|144000|
+------+



In [25]:
# # cloud
# URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net/"
# SILVER_FOLDER_NAME = "sgppipelinesa-silver"
# SUB_FOLDER_NAME = "stage-02"
# SILVER_DATA_PATH = os.path.join(URL.format(FOLDER_NAME=SILVER_FOLDER_NAME), SUB_FOLDER_NAME)
# SILVER_DATA_PATH

# local
DATA_DIR = "../include/data/"
SILVER_FOLDER_NAME = "silver/"
SUB_FOLDER_NAME = "stage-02"
SILVER_DATA_DIR = os.path.join(DATA_DIR, os.path.join(SILVER_FOLDER_NAME, SUB_FOLDER_NAME)).replace("\\", "/")
SILVER_DATA_DIR

'../include/data/silver/stage-02'

In [None]:
train_labels_df.write\
.mode("overwrite")\
.partitionBy("subjectId")\
.option("compression", "snappy")\
.parquet(os.path.join(SILVER_DATA_DIR, "train"))

In [38]:
sample_data = [(i, (i + 1) * 10, "subject_1") for i in range(20)]
df_1 = spark.createDataFrame(sample_data, ["row_id", "value", "subject_id"])

In [39]:
df_1.show()

+------+-----+----------+
|row_id|value|subject_id|
+------+-----+----------+
|     0|   10| subject_1|
|     1|   20| subject_1|
|     2|   30| subject_1|
|     3|   40| subject_1|
|     4|   50| subject_1|
|     5|   60| subject_1|
|     6|   70| subject_1|
|     7|   80| subject_1|
|     8|   90| subject_1|
|     9|  100| subject_1|
|    10|  110| subject_1|
|    11|  120| subject_1|
|    12|  130| subject_1|
|    13|  140| subject_1|
|    14|  150| subject_1|
|    15|  160| subject_1|
|    16|  170| subject_1|
|    17|  180| subject_1|
|    18|  190| subject_1|
|    19|  200| subject_1|
+------+-----+----------+



In [40]:
sample_data = [(i, (i + 1) * 10, "subject_2") for i in range(12)]
df_2 = spark.createDataFrame(sample_data, ["row_id", "value", "subject_id"])

In [41]:
df_2.show()

+------+-----+----------+
|row_id|value|subject_id|
+------+-----+----------+
|     0|   10| subject_2|
|     1|   20| subject_2|
|     2|   30| subject_2|
|     3|   40| subject_2|
|     4|   50| subject_2|
|     5|   60| subject_2|
|     6|   70| subject_2|
|     7|   80| subject_2|
|     8|   90| subject_2|
|     9|  100| subject_2|
|    10|  110| subject_2|
|    11|  120| subject_2|
+------+-----+----------+



In [42]:
df_1.cache()
df_2.cache()

DataFrame[row_id: bigint, value: bigint, subject_id: string]

In [43]:
df_3 = df_2.unionByName(df_1)

In [44]:
df_3.show()

+------+-----+----------+
|row_id|value|subject_id|
+------+-----+----------+
|     0|   10| subject_2|
|     1|   20| subject_2|
|     2|   30| subject_2|
|     3|   40| subject_2|
|     4|   50| subject_2|
|     5|   60| subject_2|
|     6|   70| subject_2|
|     7|   80| subject_2|
|     8|   90| subject_2|
|     9|  100| subject_2|
|    10|  110| subject_2|
|    11|  120| subject_2|
|     0|   10| subject_1|
|     1|   20| subject_1|
|     2|   30| subject_1|
|     3|   40| subject_1|
|     4|   50| subject_1|
|     5|   60| subject_1|
|     6|   70| subject_1|
|     7|   80| subject_1|
+------+-----+----------+
only showing top 20 rows



In [45]:
samples_per_win_size = 6
samples_per_hop_size = 4

In [46]:
feat_window = Window.partitionBy("subject_id").orderBy("row_id").rowsBetween(Window.currentRow, samples_per_win_size - 1)

In [47]:
df_3 = df_3.withColumn("freq_std", F.sum("value").over(feat_window))

In [48]:
df_3.show()

+------+-----+----------+--------+
|row_id|value|subject_id|freq_std|
+------+-----+----------+--------+
|     0|   10| subject_1|     210|
|     1|   20| subject_1|     270|
|     2|   30| subject_1|     330|
|     3|   40| subject_1|     390|
|     4|   50| subject_1|     450|
|     5|   60| subject_1|     510|
|     6|   70| subject_1|     570|
|     7|   80| subject_1|     630|
|     8|   90| subject_1|     690|
|     9|  100| subject_1|     750|
|    10|  110| subject_1|     810|
|    11|  120| subject_1|     870|
|    12|  130| subject_1|     930|
|    13|  140| subject_1|     990|
|    14|  150| subject_1|    1050|
|    15|  160| subject_1|     900|
|    16|  170| subject_1|     740|
|    17|  180| subject_1|     570|
|    18|  190| subject_1|     390|
|    19|  200| subject_1|     200|
+------+-----+----------+--------+
only showing top 20 rows



# an implementation of the only including windows after a certain hop size, since we cannot do it directly using spark we can filter out the rows of windows that have not yet made the appropriate hop size using filtering 

In [49]:
cond = ((F.col("row_id") % samples_per_hop_size) == 0)
df_3 = df_3.where(cond)

In [50]:
df_3.show()

+------+-----+----------+--------+
|row_id|value|subject_id|freq_std|
+------+-----+----------+--------+
|     0|   10| subject_1|     210|
|     4|   50| subject_1|     450|
|     8|   90| subject_1|     690|
|    12|  130| subject_1|     930|
|    16|  170| subject_1|     740|
|     0|   10| subject_2|     210|
|     4|   50| subject_2|     450|
|     8|   90| subject_2|     420|
+------+-----+----------+--------+



In [51]:
df_3.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (isnotnull(row_id#2725L) AND ((row_id#2725L % 4) = 0))
   +- Window [sum(value#2726L) windowspecdefinition(subject_id#2727, row_id#2725L ASC NULLS FIRST, specifiedwindowframe(RowFrame, currentrow$(), 5)) AS freq_std#2914L], [subject_id#2727], [row_id#2725L ASC NULLS FIRST]
      +- Sort [subject_id#2727 ASC NULLS FIRST, row_id#2725L ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(subject_id#2727, 200), ENSURE_REQUIREMENTS, [plan_id=2062]
            +- Union
               :- InMemoryTableScan [row_id#2725L, value#2726L, subject_id#2727]
               :     +- InMemoryRelation [row_id#2725L, value#2726L, subject_id#2727], StorageLevel(disk, memory, deserialized, 1 replicas)
               :           +- *(1) Scan ExistingRDD[row_id#2725L,value#2726L,subject_id#2727]
               +- InMemoryTableScan [row_id#2706L, value#2707L, subject_id#2708]
                     +- InMemoryRelation [row_id#2706L, va

In [52]:
df_3.unpersist()

DataFrame[row_id: bigint, value: bigint, subject_id: string, freq_std: bigint]

In [53]:
df_3.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (isnotnull(row_id#2725L) AND ((row_id#2725L % 4) = 0))
   +- Window [sum(value#2726L) windowspecdefinition(subject_id#2727, row_id#2725L ASC NULLS FIRST, specifiedwindowframe(RowFrame, currentrow$(), 5)) AS freq_std#2914L], [subject_id#2727], [row_id#2725L ASC NULLS FIRST]
      +- Sort [subject_id#2727 ASC NULLS FIRST, row_id#2725L ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(subject_id#2727, 200), ENSURE_REQUIREMENTS, [plan_id=2062]
            +- Union
               :- InMemoryTableScan [row_id#2725L, value#2726L, subject_id#2727]
               :     +- InMemoryRelation [row_id#2725L, value#2726L, subject_id#2727], StorageLevel(disk, memory, deserialized, 1 replicas)
               :           +- *(1) Scan ExistingRDD[row_id#2725L,value#2726L,subject_id#2727]
               +- InMemoryTableScan [row_id#2706L, value#2707L, subject_id#2708]
                     +- InMemoryRelation [row_id#2706L, va

In [54]:
df_3.show()

+------+-----+----------+--------+
|row_id|value|subject_id|freq_std|
+------+-----+----------+--------+
|     0|   10| subject_1|     210|
|     4|   50| subject_1|     450|
|     8|   90| subject_1|     690|
|    12|  130| subject_1|     930|
|    16|  170| subject_1|     740|
|     0|   10| subject_2|     210|
|     4|   50| subject_2|     450|
|     8|   90| subject_2|     420|
+------+-----+----------+--------+

