In [2]:
import re
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import io

import pyspark.sql.functions as F
import pyspark

from pyspark.sql import SparkSession, Window
from pyspark.conf import SparkConf
# from pyspark.context import SparkContext
from pyspark.sql.types import StringType, ArrayType, StructField, StructType, FloatType, DoubleType, IntegerType

from concurrent.futures import ThreadPoolExecutor

In [3]:
# `sparksession is none: typeerror: 'javapackage' object is not 
# callable` can be raised if the pyspark version being used is 4.0.0
# which is not compatible to a python 3.11.8 version
spark = SparkSession.builder.appName("app").getOrCreate()
    # .config("spark.driver.memory", "14g")\
    # .config("spark.sql.execution.arrow.maxRecordsPerBatch", "100")\
    # .getOrCreate()

In [4]:
# URL = "abfss://{FOLDER_NAME}@sgppipelinesa.dfs.core.windows.net/"
# SILVER_FOLDER_NAME = "sgppipelinesa-silver"
# SUB_FOLDER_NAME = "stage-01"
# SILVER_DATA_PATH = os.path.join(URL.format(FOLDER_NAME=SILVER_FOLDER_NAME), SUB_FOLDER_NAME)
# SILVER_DATA_PATH
# folder_infos = dbutils.fs.ls(BRONZE_DATA_PATH)

DATA_PATH = "../include/data/"
SILVER_FOLDER_NAME = "silver"
SUB_FOLDER_NAME = "stage-01"
SILVER_DATA_PATH = os.path.join(DATA_PATH, os.path.join(SILVER_FOLDER_NAME, SUB_FOLDER_NAME))
SILVER_DATA_PATH

'../include/data/silver\\stage-01'

In [5]:
train_labels_df = spark.read.format("parquet").load(os.path.join(SILVER_DATA_PATH, "train", "labels.parquet"))
val_labels_df = spark.read.format("parquet").load(os.path.join(SILVER_DATA_PATH, "validate", "labels.parquet"))
test_labels_df = spark.read.format("parquet").load(os.path.join(SILVER_DATA_PATH, "test", "labels.parquet"))

In [6]:
train_labels_df.show()

+------+--------------------+--------------------+
| value|            filePath|           subjectId|
+------+--------------------+--------------------+
|  male|file:///c:/Users/...|23yipikaye-201008...|
|female|file:///c:/Users/...|Anniepoo-20140308...|
|female|file:///c:/Users/...|Anniepoo-20140308...|
|female|file:///c:/Users/...|Anniepoo-20140308...|
|female|file:///c:/Users/...|Anniepoo-20140308...|
|female|file:///c:/Users/...| 1337ad-20170321-tkg|
|  male|file:///c:/Users/...| 1snoke-20120412-hge|
|  male|file:///c:/Users/...|  Aaron-20080318-kdl|
|  male|file:///c:/Users/...|   1028-20100710-hne|
+------+--------------------+--------------------+



In [7]:
val_labels_df.show()

+------+--------------------+-------------------+
| value|            filePath|          subjectId|
+------+--------------------+-------------------+
|female|file:///c:/Users/...|1337ad-20170321-ajg|
|  male|file:///c:/Users/...| Coren-20141121-pxp|
+------+--------------------+-------------------+



In [8]:
test_labels_df.show()

+------+--------------------+--------------------+
| value|            filePath|           subjectId|
+------+--------------------+--------------------+
|female|file:///c:/Users/...|Anniepoo-20140308...|
+------+--------------------+--------------------+



#### we will load the data in this format for concurrent processing and to prevent bottle neck issues of having to read files into one dataframe and then just to only convert it back to a array fo tuples with subject name and a corresponding spark dataframe. So why not instead read the parquet files like this?
```
[
  (<subject 1>, <subject 1 spark df>),
  (<subject 2>, <subject 2 spark df>),
  (<subject 3>, <subject 3 spark df>),
  ...
  (<subject n>, <subject n spark df>),
]
```

In [9]:
def read_signal_files(SPLIT_FOLDER):

    # only include the parquet files without labels
    signal_files = [
        os.path.join(SPLIT_FOLDER, SIGNAL_DF_FILE).replace('\\', '/') for SIGNAL_DF_FILE in os.listdir(SPLIT_FOLDER) if not "labels" in SIGNAL_DF_FILE]

    def helper(signal_file):
        subject_id = signal_file.split('/')[-1].replace("_signals.parquet", "")
        signal_df = spark.read.format("parquet").load(signal_file)
        return subject_id, signal_df

    with ThreadPoolExecutor() as exe:
        signals_df = list(exe.map(helper, signal_files))

    return signals_df

In [10]:
SPLIT_URL = os.path.join(SILVER_DATA_PATH, "{SPLIT}")
SPLIT_FOLDER = SPLIT_URL.format(SPLIT="train")
train_signals_df = read_signal_files(SPLIT_FOLDER)

In [11]:
train_signals_df

[('1028-20100710-hne', DataFrame[subjectId: string, signals: float, ID: int]),
 ('1337ad-20170321-tkg',
  DataFrame[subjectId: string, signals: float, ID: int]),
 ('1snoke-20120412-hge',
  DataFrame[subjectId: string, signals: float, ID: int]),
 ('23yipikaye-20100807-ujm',
  DataFrame[subjectId: string, signals: float, ID: int]),
 ('Aaron-20080318-kdl', DataFrame[subjectId: string, signals: float, ID: int]),
 ('Anniepoo-20140308-bft',
  DataFrame[subjectId: string, signals: float, ID: int]),
 ('Anniepoo-20140308-cqj',
  DataFrame[subjectId: string, signals: float, ID: int]),
 ('Anniepoo-20140308-hns',
  DataFrame[subjectId: string, signals: float, ID: int]),
 ('Anniepoo-20140308-nky',
  DataFrame[subjectId: string, signals: float, ID: int])]

In [12]:
SPLIT_FOLDER = SPLIT_URL.format(SPLIT="validate")
val_signals_df = read_signal_files(SPLIT_FOLDER)

In [13]:
val_signals_df

[('1337ad-20170321-ajg',
  DataFrame[subjectId: string, signals: float, ID: int]),
 ('Coren-20141121-pxp', DataFrame[subjectId: string, signals: float, ID: int])]

In [14]:
SPLIT_FOLDER = SPLIT_URL.format(SPLIT="test")
test_signals_df = read_signal_files(SPLIT_FOLDER)

In [22]:
train_signals_df[0][-1].show()

+-----------------+-------------+---+
|        subjectId|      signals| ID|
+-----------------+-------------+---+
|1028-20100710-hne|-0.0054626465|  1|
|1028-20100710-hne|-0.0053710938|  2|
|1028-20100710-hne|-0.0055236816|  3|
|1028-20100710-hne| -0.005493164|  4|
|1028-20100710-hne| -0.005340576|  5|
|1028-20100710-hne|-0.0047302246|  6|
|1028-20100710-hne| -0.004486084|  7|
|1028-20100710-hne|-0.0047302246|  8|
|1028-20100710-hne|-0.0049438477|  9|
|1028-20100710-hne| -0.004760742| 10|
|1028-20100710-hne|-0.0042419434| 11|
|1028-20100710-hne| -0.004211426| 12|
|1028-20100710-hne| -0.004119873| 13|
|1028-20100710-hne|-0.0032348633| 14|
|1028-20100710-hne|-0.0029907227| 15|
|1028-20100710-hne|-0.0032958984| 16|
|1028-20100710-hne|-0.0035095215| 17|
|1028-20100710-hne|-0.0036621094| 18|
|1028-20100710-hne|-0.0036621094| 19|
|1028-20100710-hne| -0.003967285| 20|
+-----------------+-------------+---+
only showing top 20 rows



In [None]:
def extract_features(dataset: list[tuple[str, pyspark.sql.DataFrame]], hertz: int=16000, window_time: int=3, hop_time: int=1):
    """
    extracts the features from each segment of an audio signal
    """
    
    def helper(datum):
        # we access the SCR values via raw data column
        subject_id = datum[0]
        signal_df = datum[1]

        # # get number of rows of 16000hz signals 
        # n_rows = x_signals.shape[0]
        # # print(n_rows)

        # we calculate the window size of each segment or the
        # amount of samples it has to have based on the frequency
        samples_per_win_size = int(window_time * hertz)
        samples_per_hop_size = int(hop_time * hertz)
        # print(f"samples per window size: {samples_per_win_size}")
        # print(f"samples per hop size: {samples_per_hop_size}\n")

        
        feat_window = Window.orderBy("ID").rowsBetween(-(samples_per_win_size - 1), Window.currentRow)
        signal_df = signal_df.withColumn("freq_std", F.stddev("signals").over(feat_window))
        signal_df = signal_df.withColumn("freq_skew", F.skewness("signals").over(feat_window))
        signal_df = signal_df.withColumn("freq_kurt", F.kurtosis("signals").over(feat_window))
        
        signal_df = signal_df.withColumn("freq_mean", F.mean("signals").over(feat_window))
        # signal_df = signal_df.withColumn("freq_median", F.median("signals").over(feat_window))
        # median over window function is not supported so we can use 
        signal_df = signal_df.withColumn("freq_median", F.percentile("signals", 0.5).over(feat_window))
        signal_df = signal_df.withColumn("freq_mode", F.mode("signals").over(feat_window))
        
        signal_df = signal_df.withColumn("freq_min", F.min("signals").over(feat_window))
        signal_df = signal_df.withColumn("freq_max", F.max("signals").over(feat_window))
        signal_df = signal_df.withColumn("freq_var", F.variance("signals").over(feat_window))
        
        signal_df = signal_df.withColumn("freq_first_quart", F.percentile("signals", 0.25).over(feat_window))
        signal_df = signal_df.withColumn("freq_third_quart", F.percentile("signals", 0.75).over(feat_window))
        signal_df = signal_df.withColumn("freq_inter_quart_range", F.col("freq_first_quart") - F.col("freq_third_quart"))
        
        

        # range
        # interquartile range
        # variance  

        # # initialize segments to empty list as this will store our
        # # segmented signals 
        # segments = []
        # labels = []

        # # fig = plt.figure(figsize=(17, 5))
        # n_frames = 0

        # # this segments our signals into overlapping segments
        # for i in range(0, (n_rows - samples_per_win_size) + samples_per_hop_size, samples_per_hop_size):
        #     # # last segment would have start x: 464000 - end x: 512000
        #     # # and because 512000 plus our hop size of 16000 = 528000 
        #     # # already exceeding 521216 this then terminates the loop
        #     # i += samples_per_hop_size
        #     # start = i
        #     # end = i + samples_per_win_size
        #     start = i
        #     end = min((i + samples_per_win_size), n_rows)
        #     # print(f'start x: {start} - end x: {end}')

        #     # extract segment from calculated start and end
        #     # indeces
        #     segment = x_signals[start:end]

        #     # # calculate frequency domain features
        #     # # get the spectrogram by calculating short time fourier transform
        #     # spectrogram = np.abs(librosa.stft(segment))
        #     # # print(f"spectrogram shape: {spectrogram.shape}")

        #     # # Get the frequencies corresponding to the spectrogram bins
        #     # frequencies = librosa.fft_frequencies(sr=hertz)
        #     # # print(f"frequencies shape: {frequencies.shape}")

        #     # # Find the frequency bin with the highest average energy
        #     # peak_frequency_bin = np.argmax(np.mean(spectrogram, axis=1))

        #     # # Get the peak frequency in Hz
        #     # # calculate also peak frequency
        #     # # I think dito na gagamit ng fast fourier transform
        #     # # to obtain the frequency, or use some sort of function
        #     # # to convert the raw audio signals into a spectogram
        #     # peak_frequency = frequencies[peak_frequency_bin]

        #     # # calculate the segments fast fourier transform
        #     # ft = np.fft.fft(segment)

        #     # # the fft vector can have negative or positive values
        #     # # so to avoid negative values and just truly see the frequencies
        #     # # of each segment we use its absolute values instead 
        #     # magnitude = np.abs(ft)
        #     # mag_len = magnitude.shape[0]
        #     # frequency = np.linspace(0, hertz, mag_len)

        #     # calculate statistical features
        #     # because the frequency for each segment is 16000hz we can divide
        #     # it by 1000 to instead to get its kilo hertz alternative
        #     mean_freq_kHz = np.mean(segment, axis=0)
        #     median_freq_kHz = np.median(segment, axis=0)
        #     std_freq = np.std(segment, axis=0)
        #     mode_freq = mode(segment, axis=0)
            
        #     # min = np.min(segment, axis=0)

        #     # calculate first quantile, third quantile, interquartile range
        #     first_quartile_kHz = np.percentile(segment, 25) / 1000,
        #     third_quartile_kHz = np.percentile(segment, 75) / 1000,
        #     inter_quartile_range_kHz = (np.percentile(segment, 75) - np.percentile(segment, 25)) / 1000,

        #     # compute morphological features
        #     skewness = skew(segment)
        #     kurtosis = kurt(segment)

        #     # compute time domain features
        #     amp_env = np.max(segment, axis=0)
        #     rms = np.sqrt(np.sum(segment ** 2, axis=0) / samples_per_win_size)

        #     features = {
        #         # statistical features
        #         "mean_freq_kHz": mean_freq_kHz,
        #         "median_freq_kHz": median_freq_kHz,
        #         "std_freq": std_freq,
        #         "mode_freq": mode_freq[0],
        #         'first_quartile_kHz': first_quartile_kHz[0],
        #         'third_quartile_kHz': third_quartile_kHz[0],
        #         'inter_quartile_range_kHz': inter_quartile_range_kHz[0],

        #         # morphological features
        #         "skewness": skewness,
        #         "kurtosis": kurtosis,

        #         # time domain features
        #         "amp_env":amp_env,
        #         "rms": rms,
                
        #         # frequency features
        #         # "peak_frequency": peak_frequency,
        #     }
            
        #     segments.append(features)
        #     labels.append(label)
            
        #     n_frames += 1

        # frames = range(n_frames)
        # # print(f"number of frames resulting from window size of {samples_per_win_size} and a hop size of {samples_per_hop_size} from audio signal frequency of {hertz}: {frames}")

        # time = librosa.frames_to_time(frames, hop_length=samples_per_hop_size)
        # # print(f"shape of time calculated from number of frames: {time.shape[0]}\n")
        
        # # calculate other features
        # zcr = librosa.feature.zero_crossing_rate(y=x_signals, frame_length=samples_per_win_size, hop_length=samples_per_hop_size)
        # mel_spect = librosa.feature.melspectrogram(y=x_signals, sr=hertz, n_fft=samples_per_win_size, hop_length=samples_per_hop_size, n_mels=90)
        # mel_spect_db = librosa.power_to_db(mel_spect, ref=np.max)
        # mean_mel = np.mean(mel_spect_db, axis=0)
        # variance_mel = np.var(mel_spect_db, axis=0)

        # spect_cent = librosa.feature.spectral_centroid(y=x_signals, sr=hertz, n_fft=samples_per_win_size, hop_length=samples_per_hop_size)
        # # chroma_stft = librosa.feature.chroma_stft(y=x_signals, frame_length=samples_per_win_size, hop_length=samples_per_hop_size)
        # # print(mel_spect.shape, spect_cent.shape, zcr.shape)
        # # print(f"mel spectrogram shape: {mel_spect.shape}")

        # # calculate the number of values we need to remove in the
        # # feature vector librosa calculated for us compared to the
        # # feature vectors we calculated on our own
        # zcr_n_values_to_rem = np.abs(zcr.shape[1] - time.shape[0])
        # mean_mel_n_values_to_rem = np.abs(mean_mel.shape[0] - time.shape[0])
        # spect_cent_n_values_to_rem = np.abs(spect_cent.shape[1] - time.shape[0])

        # # get slice of those in range with time only
        # zcr = zcr.reshape(-1)[:-zcr_n_values_to_rem]
        # mean_mel = mean_mel.reshape(-1)[:-mean_mel_n_values_to_rem]
        # variance_mel = variance_mel.reshape(-1)[:-mean_mel_n_values_to_rem]
        # spect_cent = spect_cent.reshape(-1)[:-spect_cent_n_values_to_rem]

        # # create features dataframe
        # subject_features = pd.DataFrame.from_records(segments)
        # subject_features["zcr"] = zcr
        # subject_features["mean_mel"] = mean_mel
        # subject_features["variance_mel"] = variance_mel
        # subject_features["spect_cent"] = spect_cent
        
        # # create labels dataframe
        # subject_labels = pd.Series(labels)

        # os.makedirs(f"./data/_EXTRACTED_FEATURES/{split}", exist_ok=True)
        # subject_features.to_csv(f'./data/_EXTRACTED_FEATURES/{split}/{name}_features.csv')
        # subject_labels.to_csv(f'./data/_EXTRACTED_FEATURES/{split}/{name}_labels.csv')

        # return (subject_features, subject_labels, name, time)
        return subject_id, signal_df

    with ThreadPoolExecutor(max_workers=2) as exe: 
        signals_df = list(exe.map(helper, dataset))

        # # unzip subjects data and unpack
        # subjects_features, subjects_labels, subjects_names, time = zip(*subjects_data)
    
    # return subjects_features, subjects_labels, subjects_names, time
    return signals_df

In [66]:
train_signals_df[:1]

[('1028-20100710-hne', DataFrame[subjectId: string, signals: float, ID: int])]

In [67]:
signals_df = extract_features(dataset=train_signals_df[:1])

In [68]:
signals_df[0][-1]

DataFrame[subjectId: string, signals: float, ID: int, freq_std: double, freq_skew: double, freq_kurt: double, freq_mean: double, freq_median: double, freq_mode: float, freq_min: float, freq_max: float, freq_var: double, freq_first_quart: double, freq_third_quart: double, freq_inter_quart_range: double]

In [70]:
signals_df[0][-1].select("freq_inter_quart_range", "ID").show()

+----------------------+---+
|freq_inter_quart_range| ID|
+----------------------+---+
|                   0.0|  1|
|     -4.57763671875E-5|  2|
|     -7.62939453125E-5|  3|
|       -6.103515625E-5|  4|
|       -1.220703125E-4|  5|
|    -1.373291015625E-4|  6|
|    -4.425048828125E-4|  7|
|   -7.4005126953125E-4|  8|
|        -7.32421875E-4|  9|
|     -7.01904296875E-4| 10|
|    -6.866455078125E-4| 11|
|   -7.2479248046875E-4| 12|
|     -8.85009765625E-4| 13|
|  -0.00106048583984375| 14|
|    -0.001129150390625| 15|
|     -0.00115966796875| 16|
|       -0.001220703125| 17|
|        -0.00146484375| 18|
|   -0.0014801025390625| 19|
|  -0.00138092041015625| 20|
+----------------------+---+
only showing top 20 rows



In [None]:
signals_df[0][-1]
F.wind