In [None]:
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display

## 1) Convert dataset from stereo to mono 
#### We do this because we need the audio files to be in one channel (mono)

import os
from pydub import AudioSegment

# Source tree holding the original clips and destination tree for the mono copies.
# NOTE(review): these are absolute, machine-specific paths — adjust before running elsewhere.
input_dir = '/Users/cabral/archive/augmented-audio'
output_dir = '/Users/cabral/archive/converted-audio'

# Create the destination root up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

# Function to convert stereo to mono
def convert_to_mono(input_file, output_file):
    """Load a WAV file, downmix it to a single channel and write it out as WAV.

    Args:
        input_file: Path of the WAV file to read.
        output_file: Path the single-channel WAV file is written to.
    """
    mono_audio = AudioSegment.from_wav(input_file).set_channels(1)
    mono_audio.export(output_file, format="wav")

# Walk through the input directory and its subfolders, writing a mono copy of
# every WAV file to the same relative location under output_dir.
for root, _, files in os.walk(input_dir):
    for file in files:
        # Compare the extension case-insensitively so files named e.g.
        # "clip.WAV" are converted too instead of being silently skipped.
        if not file.lower().endswith(".wav"):
            continue

        # Construct the full paths for input and output files, mirroring the
        # input folder structure inside the output directory.
        input_file = os.path.join(root, file)
        relative_path = os.path.relpath(input_file, input_dir)
        output_file = os.path.join(output_dir, relative_path)

        # Create the directory for the output file if it doesn't exist
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        # Convert the audio file to mono and save it in the output directory
        convert_to_mono(input_file, output_file)

print("All files have been converted succesfully")


In [None]:
# Fix the random seed of both NumPy and TensorFlow so repeated runs are comparable.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Root of the converted dataset, wrapped in a Path object for the cells below.
# NOTE(review): this literal is the same string as `output_dir` in the conversion cell — keep the two in sync.
DATASET_PATH = '/Users/cabral/archive/converted-audio'
data_dir = pathlib.Path(DATASET_PATH)

In [None]:
# List the entries found directly under the dataset root and drop the
# housekeeping files that are not speaker folders.
entries = np.array(tf.io.gfile.listdir(str(data_dir)))
is_ignored = (entries == 'README.md') | (entries == '.DS_Store')
Speakers = entries[~is_ignored]
print('Speakers:', Speakers)

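#### The audio clips are 5 seconds long at 44.1 kHz. Note that output_sequence_length=44100 corresponds to exactly 1 second at that sample rate, so shorter clips are padded and longer clips are trimmed to 1 second (not 5) so that they can be easily batched. Keeping the full 5 seconds would require output_sequence_length = 5 × 44100 = 220500.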

#### Divided into directories this way, you can easily load the data using: keras.utils.audio_dataset_from_directory. 

#### Validation_split is set to 0.2, which means that 20% of the data will be used for validation, and the remaining 80% for training.

#### The audio clips are 5 seconds long and sampled at 44.1 kHz, which means that 44,100 samples were taken every second.


In [None]:
# Build the training and validation datasets in a single call (subset='both').
# NOTE(review): at the 44.1 kHz rate stated in the notebook text, 44100 samples
# is 1 second of audio — confirm this is the intended clip length.
train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=data_dir,
    subset='both',
    validation_split=0.2,
    seed=0,
    batch_size=64,
    output_sequence_length=44100,
)

label_names = np.array(train_ds.class_names)
print()
print("label names:", label_names)

# Peek at one batch only: the size of the last axis is the channel count.
for audio, labels in train_ds.take(1):
    print("\nVerify they are in mono\nNumber of channels:", audio.shape[-1])


#### The dataset now contains batches of audio clips and integer labels. The audio clips have a shape of (batch, samples, channels).

In [None]:
# Display the element spec of the training dataset (the structure of each yielded element).
train_ds.element_spec

In [None]:
#This dataset only contains single channel audio, 
#so use the tf.squeeze function to drop the extra axis:

def squeeze(audio, labels):
    """Remove the trailing (size-1) channel axis from `audio`; pass `labels` through.

    Args:
        audio: Audio tensor whose last axis is the channel axis.
        labels: Labels belonging to `audio`, returned unchanged.

    Returns:
        A `(audio, labels)` tuple with the last axis of `audio` squeezed out.
    """
    return tf.squeeze(audio, axis=-1), labels

# Apply `squeeze` to both splits, letting tf.data choose the parallelism.
train_ds, val_ds = (
    train_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE),
    val_ds.map(squeeze, num_parallel_calls=tf.data.AUTOTUNE),
)

#### The utils.audio_dataset_from_directory function only returns up to two splits. It's a good idea to keep a test set separate from your validation set. Ideally you'd keep it in a separate directory, but in this case you can use Dataset.shard to split the validation set into two halves. Note that iterating over any shard will load all the data, and only keep its fraction.

In [None]:
# Split the held-out data into two shards: shard 0 becomes the test set and
# shard 1 remains the validation set. Both right-hand sides are evaluated
# against the original val_ds before either name is rebound.
test_ds, val_ds = (
    val_ds.shard(num_shards=2, index=0),
    val_ds.shard(num_shards=2, index=1),
)

# Keep one training batch around as example data and show its shapes.
for example_audio, example_labels in train_ds.take(1):
    print(example_audio.shape, example_labels.shape, sep="\n")

    

#### Let's plot a few audio waveforms:

In [None]:
# Show the label names reordered via integer-array indexing (positions 1, 2, 3, then 0).
# NOTE(review): this assumes at least four classes exist — confirm against the dataset.
label_names[[1,2,3,0]]

In [None]:
# Draw the first rows*cols waveforms of the example batch in a grid,
# all sharing the same y-axis ticks and limits.
rows, cols = 3, 3
n = rows * cols
y_ticks = np.arange(-1.2, 1.2, 0.2)

plt.figure(figsize=(16, 10))
for i in range(n):
    plt.subplot(rows, cols, i + 1)
    audio_signal = example_audio[i]
    plt.plot(audio_signal)
    plt.title(label_names[example_labels[i]])
    plt.yticks(y_ticks)
    plt.ylim([-1.1, 1.1])


## 2) Convert waveforms to spectrograms
#### Keep in mind that a CNN works with spectrograms, which show frequency changes over time and can be represented as 2D images. Feed the spectrogram images into your neural network to train the model.

Apply a get_spectrogram function:
- This function is used to convert the waveform (time-domain audio) into a spectrogram (frequency-domain representation).
- It applies a Short-Time Fourier Transform (STFT) to the waveform, which transforms the audio into a spectrogram.
- The shape of the spectrogram is expanded with an extra dimension (tf.newaxis) to be used with convolutional layers

In [None]:
def get_spectrogram(waveform, frame_length=255, frame_step=1):
    """Convert a time-domain waveform into a magnitude spectrogram.

    Args:
        waveform: Audio samples to transform. The STFT is computed along the
            last axis, so a single clip or a batch of clips can be passed.
        frame_length: STFT window length in samples. Defaults to 255, the
            value that was previously hard-coded.
        frame_step: Hop between successive windows in samples. Defaults to 1,
            the value that was previously hard-coded. A hop of 1 yields about
            one frame per input sample, which is very memory-hungry; pass a
            larger hop (e.g. 128) for a much smaller spectrogram.

    Returns:
        The magnitude of the STFT with a trailing `channels` axis of size 1.
    """
    # Convert the waveform to a spectrogram via a STFT.
    spectrogram = tf.signal.stft(
        waveform, frame_length=frame_length, frame_step=frame_step)
    # Obtain the magnitude of the STFT (the STFT itself is complex-valued).
    spectrogram = tf.abs(spectrogram)
    # Add a `channels` dimension, so that the spectrogram can be used
    # as image-like input data with convolution layers (which expect
    # shape (`batch_size`, `height`, `width`, `channels`).
    spectrogram = spectrogram[..., tf.newaxis]
    return spectrogram


In [None]:
# Inspect the first five clips of the example batch: print the label and
# shapes, then render an inline audio player for each clip.
for i in range(5):
    waveform = example_audio[i]
    spectrogram = get_spectrogram(waveform)
    label = label_names[example_labels[i]]

    print('Label:', label)
    print('Waveform shape:', waveform.shape)
    print('Spectrogram shape:', spectrogram.shape)
    print('Audio playback')
    player = display.Audio(waveform, rate=44100)
    display.display(player)

#### Define a function to display a spectrogram

In [None]:
def plot_spectrogram(spectrogram, ax):
    """Draw the log of a spectrogram on `ax`, with the first axis of the input along x.

    Args:
        spectrogram: 2-D array, or 3-D array whose last axis has size 1
            (that axis is squeezed away before plotting).
        ax: Object providing `pcolormesh(X, Y, C)`, e.g. a matplotlib Axes.
    """
    rank = len(spectrogram.shape)
    if rank > 2:
        assert rank == 3
        spectrogram = np.squeeze(spectrogram, axis=-1)

    # Transpose so that the input's first axis runs along the columns (x-axis),
    # and add an epsilon before the log to avoid taking a log of zero.
    eps = np.finfo(float).eps
    log_spec = np.log(spectrogram.T + eps)

    height, width = log_spec.shape[0], log_spec.shape[1]
    x_coords = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
    y_coords = range(height)
    ax.pcolormesh(x_coords, y_coords, log_spec)

#### Plot the example's waveform over time and the corresponding spectrogram (frequencies over time):

In [None]:
# Two stacked panels: the raw waveform on top, its spectrogram underneath.
fig, axes = plt.subplots(2, figsize=(12, 8))
wave_ax, spec_ax = axes

timescale = np.arange(waveform.shape[0])
wave_ax.plot(timescale, waveform.numpy())
wave_ax.set_title('Waveform')
wave_ax.set_xlim([0, 44100])

plot_spectrogram(spectrogram.numpy(), spec_ax)
spec_ax.set_title('Spectrogram')

plt.suptitle(label.title())
plt.show()

#### Now, create spectrogram datasets from the audio datasets:

In [None]:
def make_spec_ds(ds):
    """Map a dataset of `(audio, label)` pairs to `(spectrogram, label)` pairs.

    Args:
        ds: Dataset yielding `(audio, label)` elements.

    Returns:
        The dataset produced by `ds.map`, with each audio element replaced by
        `get_spectrogram(audio)` and the label left untouched.
    """
    def to_spectrogram(audio, label):
        return get_spectrogram(audio), label

    return ds.map(
        map_func=to_spectrogram,
        num_parallel_calls=tf.data.AUTOTUNE,
    )

# Derive the spectrogram version of each split.
train_spectrogram_ds = make_spec_ds(train_ds)
val_spectrogram_ds = make_spec_ds(val_ds)
test_spectrogram_ds = make_spec_ds(test_ds)

# Keep one batch of spectrograms and labels around as example data.
# take(1) yields at most one batch, so no explicit `break` is needed; the
# placeholder debug output is replaced by the shapes of the fetched batch.
for example_spectrograms, example_spect_labels in train_spectrogram_ds.take(1):
    print('Spectrogram batch shape:', example_spectrograms.shape)
    print('Label batch shape:', example_spect_labels.shape)

#### Examine the spectrograms for different examples of the dataset:

In [None]:
# Fetch one batch of spectrograms and labels to plot in the next cell.
# take(1) yields at most one batch, so no explicit `break` is needed; the
# placeholder debug output is replaced by the shapes of the fetched batch.
for example_spectrograms, example_spect_labels in train_spectrogram_ds.take(1):
    print('Spectrogram batch shape:', example_spectrograms.shape)
    print('Label batch shape:', example_spect_labels.shape)

In [None]:
# Plot the first rows*cols spectrograms of the example batch in a grid,
# titling each panel with its label name.
rows, cols = 3, 3
n = rows * cols
fig, axes = plt.subplots(rows, cols, figsize=(16, 9))

for i in range(n):
    # Row-major position of the i-th panel in the grid.
    r, c = divmod(i, cols)
    ax = axes[r][c]
    plot_spectrogram(example_spectrograms[i].numpy(), ax)
    ax.set_title(label_names[example_spect_labels[i].numpy()])

plt.show()