In [0]:
import re
import os
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt
import pyspark.sql.functions as F

from pyspark.sql import SparkSession, Window
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.types import StringType, ArrayType, StructField, StructType, FloatType, DoubleType, IntegerType

In [0]:
spark = SparkSession.builder.appName("test")\
    .config("spark.driver.memory", "14g")\
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", "100")\
    .getOrCreate()

In [0]:
url = "abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/"
folder_infos = dbutils.fs.ls(url)

In [0]:
sample_folder = folder_infos[-1].path
sample_folder

'abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1337ad-20170321-ajg/'

In [0]:
sample_folder.strip('/').split('/')[-1]

'1337ad-20170321-ajg'

In [0]:
type(folder_infos[-1])

dbruntime.dbutils.FileInfo

In [0]:
dbutils.fs.ls(folder_infos[-1].path)

[FileInfo(path='abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1337ad-20170321-ajg/LICENSE', name='LICENSE', size=659, modificationTime=1753875338000),
 FileInfo(path='abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1337ad-20170321-ajg/etc/', name='etc/', size=0, modificationTime=1753875338000),
 FileInfo(path='abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1337ad-20170321-ajg/wav/', name='wav/', size=0, modificationTime=1753875339000)]

In [0]:
f = dbutils.fs.head("abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1028-20100710-hne/etc/README")

In [0]:
f

'User Name:1028\n\nSpeaker Characteristics:\n\nGender: Male\nAge Range: Adult\nLanguage: EN\nPronunciation dialect: American English\n\nRecording Information:\n\nMicrophone make: n/a\nMicrophone type: Headset mic\nAudio card make: unknown\nAudio card type: unknown\nAudio Recording Software: VoxForge Speech Submission Application\nO/S:\n\nFile Info:\n\nFile type: wav\nSampling Rate: 48000\nSample rate format: 16\nNumber of channels: 1\n'

In [0]:
df = spark.read.format('text')\
    .option("lineSep", "\n")\
    .load("abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1028-20100710-hne/etc/README")

In [0]:
df.show()

+--------------------+
|               value|
+--------------------+
|      User Name:1028|
|                    |
|Speaker Character...|
|                    |
|        Gender: Male|
|    Age Range: Adult|
|        Language: EN|
|Pronunciation dia...|
|                    |
|Recording Informa...|
|                    |
|Microphone make: n/a|
|Microphone type: ...|
|Audio card make: ...|
|Audio card type: ...|
|Audio Recording S...|
|                O/S:|
|                    |
|          File Info:|
|                    |
+--------------------+
only showing top 20 rows


In [0]:
dbutils.fs.readFile("abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1028-20100710-hne/etc/README")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-7372681937682615>, line 1[0m
[0;32m----> 1[0m dbutils[38;5;241m.[39mfs[38;5;241m.[39mreadFile([38;5;124m"[39m[38;5;124mabfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1028-20100710-hne/etc/README[39m[38;5;124m"[39m)

[0;31mAttributeError[0m: 'RemoteFsHandler' object has no attribute 'readFile'

In [0]:
# what I want to do is list the files get all the file names in the bronze 
# container, which will give me a list, and then use that list to concurrently
# list the files inside these list of directories
def load_labels(DIR, folder_infos):
    def helper(folder_info):
        try:
            # remove trailing backslash
            folder = folder_info.path.strip('/').split('/')[-1]
            file_path = os.path.join(DIR, folder, "etc", "README")
            
            # print(file_path)
            # with open(file_path, "r") as file:
            #     lines = [line for line in file.readlines() if "gender" in line.lower()]
            #     file.close()

            file_content = dbutils.fs.head(file_path)

            print(lines)

            # extract only the gender of the subject in meta data
            # print(lines[0].lower())
            string = re.sub(r"(gender)", "", lines[0].lower())
            string = re.sub(r"[:;\[\]\t\n\s]", "", string)

            if string:
                gender = string
                if gender.startswith("ma") or gender.startswith("mä"):
                    return folder, string, "male"
                elif gender.startswith("fem") or gender.startswith("wei"):
                    return folder, string, "female"
                else:
                    return folder, string, "unknown"
            
        except IndexError:
            return folder, "unknown", "unknown"
        
        except FileNotFoundError:
            return folder, "unknown", "unknown"

    with ThreadPoolExecutor(max_workers=5) as exe:
        subjects_labels = list(exe.map(helper, folder_infos))
        
        
    return subjects_labels

In [0]:
labels = load_labels(url, folder_infos)

abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1028-20100710-hne/etc/README
abfss://sgppipelinesa-bronze@sgppipelinesa.dfs.core.windows.net/1337ad-20170321-ajg/etc/README


In [0]:
labels

[('1028-20100710-hne', 'unknown', 'unknown'),
 ('1337ad-20170321-ajg', 'unknown', 'unknown')]

In [0]:
def load_audio(DIR: str, folders: list, hertz=16000):
    """
    loads audio signals from each .wav file of each subject
    """

    def helper(folder):
    # for folder in folders:
        try:
            wavs_dir = os.path.join(DIR, folder, "wav")
            path_to_wavs = os.listdir(wavs_dir)

        # this is if a .wav file is not used as a directory so 
        # try flac 
        except FileNotFoundError:
            wavs_dir = os.path.join(DIR, folder, "flac")
            path_to_wavs = os.listdir(wavs_dir)

        finally:
            # create storage for list of signals to all be 
            # concatenated later
            ys = []

            # create figure, and axis
            # fig, axes = plt.subplots(nrows=len(path_to_wavs), ncols=1, figsize=(12, 30))
            
            for index, wav in enumerate(path_to_wavs):

                wav_path = os.path.join(wavs_dir, wav)
                # print(wav_path)

                # each .wav file has a sampling frequency is 16000 hertz 
                y, sr = librosa.load(wav_path, sr=hertz)

                # audio recordings can have different length
                print(f"shape of audio signals {y.shape}")
                print(f"sampling rate of audio signals after interpolation: {sr}")

                # top_db is set to 20 representing any signal below
                # 20 decibels will be considered silence
                y_trimmed, _ = librosa.effects.trim(y, top_db=20)

                # append y to ys 
                ys.append(y_trimmed)

            # concatenate all audio signals into one final signal as 
            # this is all anyway recorded in the voice of the same gender
            final = np.concatenate(ys, axis=0)
            print(f"shape of final signal: {final.shape}")
            # print(f"shape of signal: {y.shape}")
            # print(f"shape of trimmed signal: {y_trimmed.shape}")
            # print(f"sampling rate: {sr}")
            # librosa.display.waveshow(final, alpha=0.5)

            # plt.tight_layout()
            # plt.show()

            return folder, final
        
    # concurrently load .wav files and trim  each .wav files
    # audio signal and combine into one signal for each subject 
    with ThreadPoolExecutor(max_workers=5) as exe:
        signals = list(exe.map(helper, folders))
        
    return signals