# Preprocess raw audio
This notebook handles the preprocessing of speech corpora. 

The first corpus handled is the [LibriSpeech corpus](https://www.openslr.org/12), and more specifically the `dev-clean.tar.gz` version of it. The goal is to compose all individual fragments of one "story" into one big audio file that can be used later on for further processing.

A second corpus that is handled is the [VCTK corpus](https://datashare.ed.ac.uk/handle/10283/3443), which can be considered to be more "clean" than `LibriSpeech`.

Each corpus has its own dedicated cells: one that defines a processing function and one that calls it.

It should be noted that `silencePeriod` should ideally be chosen different from 0, because in a true online-mode operation the noise-only segments are used to estimate $\mathbf{R}_{\mathbf{nn}}$. If the sentences are concatenated directly, the estimate of this autocorrelation matrix will be very poor, if it is estimated at all.

In [None]:
# Setup
import numpy as np
import os
from scipy import signal
import soundfile as sf

# Audio format settings
expectedFs = 16000  # [Hz]: target sampling rate; other rates are resampled
silencePeriod = 0.5  # [s]: silence added after each sentence to estimate Rnn

# Locations on disk (placeholders: adapt to the local setup)
rawAudioPath = os.path.join("path", "to", "raw", "audio")
processedPath = os.path.join("path", "to", "folder", "to", "store", "results")

In [None]:
def _utteranceSortKey(path: str):
    """
    Sort key that puts the recordings of a session in utterance order.

    The utterance index is the part of the file name (extension removed) that
    follows the last "-". Decimal indices are compared as numbers, so the
    order does not depend on zero-padding; any other index is compared as
    text and placed after the numeric ones.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    index = stem.split("-")[-1]
    if index.isdecimal():
        return (0, int(index), index)
    return (1, 0, index)


def handleSessionLibriSpeech(
    sessionPath: str,
    processedPath: str,
    subject: str,
    session: str,
    expectedFs: int,
    silencePeriod: float,
):
    """
    Stitch all recordings of one session together into one large recording
    and save it.

    Every ".flac" file found directly in `sessionPath` is read in utterance
    order, reduced to its first channel, resampled to `expectedFs` when its
    sampling frequency differs, and followed by `silencePeriod` seconds of
    zeros. The concatenated signal is scaled to a peak amplitude of 1 and
    written to `<processedPath>/<subject>/<session>.wav`.

    Parameters
    ----------
    sessionPath
        Directory holding the recordings of the session.
    processedPath
        Root folder for the results; the `subject` sub-folder is created when
        it does not exist yet.
    subject
        Name of the sub-folder of `processedPath` the result is stored in.
    session
        Name of the output file, without the ".wav" extension.
    expectedFs
        Sampling frequency of the output.
    silencePeriod
        Duration in seconds of the silence appended after every recording.

    Raises
    ------
    ValueError
        If `sessionPath` contains no ".flac" file.
    """
    recordings = [
        entry.path
        for entry in os.scandir(sessionPath)
        # keep the audio only; this drops the transcript (in .txt form)
        if entry.name.endswith(".flac")
    ]
    if not recordings:
        raise ValueError(f"No .flac recordings found in {sessionPath!r}")

    # sort the list to have the right sequence of recordings
    recordings.sort(key=_utteranceSortKey)

    # the same silence block is reused after every recording
    silence = np.zeros(int(expectedFs * silencePeriod))

    # read the recordings one at a time and stitch them together
    sessionData = []
    for recording in recordings:
        data, fs = sf.read(recording)
        if data.ndim > 1:
            # soundfile returns multi-channel audio as (frames, channels):
            # keep the first channel, whatever the channel count, so that
            # every piece is 1-D and can be concatenated with the silence
            data = data[:, 0]
        if fs != expectedFs:  # resample
            data = signal.resample_poly(data, up=expectedFs, down=fs)

        sessionData.append(data)
        sessionData.append(silence)

    sessionData = np.concatenate(sessionData, axis=0)
    peak = np.max(np.abs(sessionData))
    if peak > 0:  # all-zero audio would otherwise be turned into NaN
        sessionData = sessionData / peak

    outputFolder = os.path.join(processedPath, subject)
    if outputFolder:  # an empty path means the current working directory
        os.makedirs(outputFolder, exist_ok=True)
    sf.write(os.path.join(outputFolder, session + ".wav"), sessionData, expectedFs)

In [None]:
def handleSessionVCTK(
    subjectPath: str,
    saveFolder: str,
    subjectID: str,
    expectedFs: int,
    silencePeriod: float,
):
    """
    Stitch all recordings of one subject together into one large recording
    and save it.

    VCTK stores every utterance twice, once per microphone. Only the files
    whose name contains "mic1" are used, to avoid having nearly the same
    pronunciation twice. The retained files are processed in sorted path
    order, reduced to their first channel, resampled to `expectedFs` when
    their sampling frequency differs, and each followed by `silencePeriod`
    seconds of zeros. The concatenated signal is scaled to a peak amplitude
    of 1 and written to `<saveFolder>/<subjectID>.wav`.

    Parameters
    ----------
    subjectPath
        Directory holding the recordings of the subject.
    saveFolder
        Folder the result is stored in; created when it does not exist yet.
    subjectID
        Name of the output file, without the ".wav" extension.
    expectedFs
        Sampling frequency of the output.
    silencePeriod
        Duration in seconds of the silence appended after every recording.

    Raises
    ------
    ValueError
        If `subjectPath` contains no file with "mic1" in its name.
    """
    # no transcripts as in Librispeech; os.scandir yields entries in arbitrary
    # order, so sort to make the stitching order reproducible
    recordings = sorted(
        entry.path
        for entry in os.scandir(subjectPath)
        if "mic1" in entry.name  # only retain mic 1
    )
    if not recordings:
        raise ValueError(f"No mic1 recordings found in {subjectPath!r}")

    # the same silence block is reused after every recording
    silence = np.zeros(int(silencePeriod * expectedFs))

    sessionData = []
    for recording in recordings:
        data, fs = sf.read(recording)
        if data.ndim != 1:
            # soundfile returns multi-channel audio as (frames, channels):
            # keep the first channel only
            data = data[:, 0]

        if fs != expectedFs:  # resample if needed
            data = signal.resample_poly(data, up=expectedFs, down=fs)

        sessionData.append(data)
        sessionData.append(silence)

    sessionData = np.concatenate(sessionData, axis=0)
    peak = np.max(np.abs(sessionData))
    if peak > 0:  # all-zero audio would otherwise be turned into NaN
        sessionData = sessionData / peak

    if saveFolder:  # an empty path means the current working directory
        os.makedirs(saveFolder, exist_ok=True)
    sf.write(os.path.join(saveFolder, subjectID + ".wav"), sessionData, expectedFs)

In [None]:
# collect all sessions for LibriSpeech
for entry in os.scandir(rawAudioPath):
    if not entry.is_dir():  # skip stray files next to the subject folders
        continue
    subjectPath = entry.path
    subjectID = entry.name  # portable, unlike splitting the path on "/"
    print(f"Subject {subjectID} started")

    # handleSessionLibriSpeech writes to <processedPath>/<subject>/<session>.wav,
    # so it gets the root results folder; the subject folder is created here
    saveFolder = os.path.join(processedPath, subjectID)
    os.makedirs(saveFolder, exist_ok=True)

    for session in os.scandir(subjectPath):
        if not session.is_dir():  # a session is a folder of recordings
            continue
        sessionPath = session.path
        sessionID = session.name
        handleSessionLibriSpeech(
            sessionPath,
            processedPath,
            subjectID,
            sessionID,
            expectedFs=expectedFs,
            silencePeriod=silencePeriod,
        )

In [None]:
# collect all sessions for VCTK
for entry in os.scandir(rawAudioPath):
    if not entry.is_dir():  # only folders are treated as subjects
        continue
    subjectPath = entry.path
    subjectID = entry.name  # portable, unlike splitting the path on "/"
    print(f"Subject {subjectID} started")

    # also creates missing parent folders; no error when it already exists
    os.makedirs(processedPath, exist_ok=True)

    handleSessionVCTK(
        subjectPath,
        processedPath,
        subjectID,
        expectedFs=expectedFs,
        silencePeriod=silencePeriod,
    )  # no sessions for VCTK