In [6]:
import os
import io
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrowfs_adlgen2 as pa_adl
import numpy as np
import librosa

from dotenv import load_dotenv
from pathlib import Path

from azure.identity import DefaultAzureCredential, ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient
from azure.keyvault.secrets import SecretClient
from azure.core.exceptions import ResourceNotFoundError

from concurrent.futures import ThreadPoolExecutor

In [None]:
# # Retrieve credentials from environment variables
# # this is strictly used only in development
# # load env variables
# env_dir = Path('../../').resolve()
# load_dotenv(os.path.join(env_dir, '.env'))

True

In [4]:
# Storage account settings come from the process environment; any
# variable that is not set resolves to None.
storage_account_name, credential, conn_str = (
    os.getenv(variable_name)
    for variable_name in (
        "STORAGE_ACCOUNT_NAME",
        "STORAGE_ACCOUNT_KEY",
        "STORAGE_ACCOUNT_CONN_STR",
    )
)

In [5]:
# cloud
# create the data lake service client for the storage account,
# authenticating with the `credential` value loaded above
datalake_service_client = DataLakeServiceClient(
    account_url=f"https://{storage_account_name}.dfs.core.windows.net",
    credential=credential
)

# file system (container) client for the bronze layer; every
# path listing and file download below goes through it
bronze_container_client = datalake_service_client.get_file_system_client(f"{storage_account_name}-bronze")

# Only the immediate children of the container are wanted (the
# subject folders). Listing non-recursively avoids enumerating every
# file in the container just to discard the nested ones, and keeping
# directories only prevents a stray top-level file from being treated
# as a subject folder.
subject_folders = [
    path.name
    for path in bronze_container_client.get_paths(recursive=False)
    if path.is_directory and "/" not in path.name
]
subject_folders

['1028-20100710-hne',
 '1337ad-20170321-ajg',
 '1337ad-20170321-tkg',
 '1snoke-20120412-hge',
 '23yipikaye-20100807-ujm',
 'Aaron-20080318-kdl',
 'Anniepoo-20140308-bft',
 'Anniepoo-20140308-cqj',
 'Anniepoo-20140308-fcp',
 'Anniepoo-20140308-hns',
 'Anniepoo-20140308-nky',
 'Coren-20141121-pxp']

In [None]:
def ingest_signals(subject_folders: list[str], DATA_DIR: str=None):

    def helper(subject_folder):
            
        # for folder in folders:
        try:
            # # local
            # # lists out the files containing the .wav files in
            # # a subjects folder. Note also that using double backslash
            # # is not accepted by azure and will raise `path does not exist`
            # # or `files can not exist on the root account level` that's
            # # why we replace it always with forward slashes
            # wavs_dir = os.path.join(DATA_DIR, "bronze", subject_folder, "wav").replace("\\", "/")
            # path_to_wavs = os.listdir(wavs_dir)

            wavs_dir = os.path.join(subject_folder, "wav").replace("\\", "/")
            path_to_wavs = [
                path.name 
                for path in bronze_container_client.get_paths(path=wavs_dir)
            ]
        
        except (ResourceNotFoundError, FileNotFoundError):
            # # local
            # # this is if a .wav file is not used as a 
            # # directory so try flac 
            # wavs_dir = os.path.join(DATA_DIR, "bronze", subject_folder, "flac").replace("\\", "/")
            # path_to_wavs = os.listdir(wavs_dir)

            # cloud
            # this is if a .wav file is not used as a directory so 
            # try flac 
            wavs_dir = os.path.join(subject_folder, "flac").replace("\\", "/")
            path_to_wavs = [
                path.name 
                for path in bronze_container_client.get_paths(path=wavs_dir)
            ]

        finally:
            # create storage for list of signals to all be 
            # concatenated later
            ys = []
            for index, wav in enumerate(path_to_wavs):
                # cloud
                wav_file_client = bronze_container_client.get_file_client(wav)

                # Download the file content
                download_result = wav_file_client.download_file()
                downloaded_bytes = download_result.readall()
                audio_buff = io.BytesIO(downloaded_bytes)

                # # local
                # audio_buff = os.path.join(wavs_dir, wav)

                # let librosa read the audio buffer containing t+he content
                # of the binary audio file
                y, sr = librosa.load(audio_buff, sr=16000)

                # audio recordings can have different length
                print(f"shape of audio signals {y.shape}")
                print(f"sampling rate of audio signals after interpolation: {sr}")

                # top_db is set to 20 representing any signal below
                # 20 decibels will be considered silence
                y_trimmed, _ = librosa.effects.trim(y, top_db=20)

            #     # append y to ys 
                ys.append(y_trimmed)

            # concatenate all audio signals into one final signal as 
            # this is all anyway recorded in the voice of the same gender
            final = np.concatenate(ys, axis=0)

            # create pyarrow table so we can write this table as
            # parquet file format later
            table = pa.table({
                "signals": pa.array(final), 
                "subjectId": pa.array([subject_folder] * final.shape[0], type=pa.string()),
                "rowId": pa.array(np.arange(final.shape[0]), type=pa.int32())
            })

            # cloud
            # write the pyarrow table to azure data lake using
            # pyarrow azure file system using the credential we
            # retrieved which uses the function apps system assigned 
            # managed identity
            SILVER_FOLDER_NAME = f"{storage_account_name}-silver"
            SUB_FOLDER_NAME = "stage-01"
            FILE_NAME = f"{subject_folder}_signals.parquet"

            SILVER_DATA_PATH = os.path.join(SILVER_FOLDER_NAME, SUB_FOLDER_NAME, FILE_NAME).replace("\\", "/")
            handler = pa_adl.AccountHandler.from_account_name(storage_account_name, credential=credential)
            fs = pa.fs.PyFileSystem(handler)
            pq.write_table(table, SILVER_DATA_PATH, filesystem=fs)

            # # local
            # SILVER_FOLDER_NAME = f"silver"
            # SUB_FOLDER_NAME = "stage-01"
            # FILE_NAME = f"{subject_folder}_signals.parquet"

            # SILVER_DATA_PATH = os.path.join(DATA_DIR, SILVER_FOLDER_NAME, SUB_FOLDER_NAME, FILE_NAME).replace("\\", "/")
            # pq.write_table(table, SILVER_DATA_PATH)

            return subject_folder, final
            
    # concurrently load .wav files and trim  each .wav files
    # audio signal and combine into one signal for each subject 
    with ThreadPoolExecutor(max_workers=5) as exe:
        print("running")
        exe.map(helper, subject_folders)

In [8]:
# # local
# BRONZE_FOLDER_NAME = "bronze"
# DATA_DIR = "../include/data"
# BRONZE_DATA_DIR = os.path.join("{DATA_DIR}", "{FOLDER_NAME}").replace("\\", "/")
# BRONZE_DATA_DIR

In [9]:
# subject_folders = os.listdir(BRONZE_DATA_DIR.format(DATA_DIR=DATA_DIR, FOLDER_NAME=BRONZE_FOLDER_NAME))
# len(subject_folders)

In [13]:
# Run the ingestion for every subject folder listed from the bronze
# container. DATA_DIR is only referenced by the commented-out local
# code paths inside ingest_signals, so None is passed here.
ingest_signals(subject_folders, DATA_DIR=None)

running
['1028-20100710-hne', '1337ad-20170321-ajg', '1337ad-20170321-tkg', '1snoke-20120412-hge', '23yipikaye-20100807-ujm', 'Aaron-20080318-kdl', 'Anniepoo-20140308-bft', 'Anniepoo-20140308-cqj', 'Anniepoo-20140308-fcp', 'Anniepoo-20140308-hns', 'Anniepoo-20140308-nky', 'Coren-20141121-pxp']
shape of audio signals (76000,)
sampling rate of audio signals after interpolation: 16000
shape of audio signals (106000,)
sampling rate of audio signals after interpolation: 16000
shape of audio signals (122000,)
sampling rate of audio signals after interpolation: 16000
shape of audio signals (74000,)
sampling rate of audio signals after interpolation: 16000
shape of audio signals (100000,)
sampling rate of audio signals after interpolation: 16000
shape of audio signals (90000,)
sampling rate of audio signals after interpolation: 16000
shape of audio signals (122000,)
sampling rate of audio signals after interpolation: 16000
shape of audio signals (136000,)
sampling rate of audio signals after i