In [1]:
import numpy as np
import scipy as scipy
import os
import sys
from scipy.io import wavfile
import soundfile as sf
import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt

sys.path.insert(0, './src/')
from src import util

In [None]:
# One-off preprocessing helpers from the project's src/util module.
# NOTE(review): presumably converts the downloaded FLAC files to WAV; it is
# left disabled here — confirm in src/util before re-enabling.
#util.flacs_to_wavs()
# NOTE(review): presumably cuts the audio into separate samples — confirm the
# exact behaviour and output location in src/util.
util.split_audio_in_samples()

## Reading FLAC, remove silence, write as WAV

For this, we downloaded the [LibriSpeech Clean Speech Development Set](https://www.openslr.org/resources/12/dev-clean.tar.gz), and unpacked it in the "/data" folder.

In [None]:
# Load one utterance and report how many samples it holds.
flac_path = "./data/LibriSpeech/dev-clean/84/121123/84-121123-0000.flac"
data, samplerate = sf.read(flac_path)
print(len(data))

# Keep only the non-zero samples. This drops every sample that is exactly 0.0
# wherever it occurs in the signal, not only longer silent stretches.
nonzero_mask = data != 0.
data = data[nonzero_mask]
print(len(data))

In [None]:
# Value range of the loaded signal: smallest sample first, then the largest.
for extreme in (data.min(), data.max()):
    print(extreme)

In [None]:
# Write the signal to a WAV file and embed an audio player for it.
# The output folder is created first so that this cell (and every later cell
# that writes into ./samples) also works when the folder does not exist yet.
os.makedirs("./samples", exist_ok=True)
wav_path = "./samples/84-121123-0000.wav"
wavfile.write(wav_path, samplerate, data)
ipd.Audio(wav_path)

## Make Audio Files of Certain Length

First load in an audio file, check how long it is and its samplerate

In [None]:
# Read the utterance, store a WAV copy, then report its length and sample rate.
flac_path = "./data/LibriSpeech/dev-clean/84/121123/84-121123-0002.flac"
sample1, samplerate = sf.read(flac_path)
wavfile.write("./samples/84-121123-0002.wav", samplerate, sample1)
for value in (len(sample1), samplerate):
    print(value)

An array length of `219040` and a sample rate of `16000` means that the length of this audio file is $\frac{219040}{16000} \approx 13$ seconds. Let's load in the `.wav` version of this file, to see if it is indeed roughly 13 seconds long.

In [None]:
# Embed a player for the WAV copy so its duration can be checked by ear.
ipd.Audio("./samples/84-121123-0002.wav") 

And we were correct. So, in order to make this sample 5 seconds long instead of 13, we have to save the first $5 \cdot 16000 = 80000$ elements in our sample array.

In [None]:
# Keep only the first few seconds of the utterance.
clip_seconds = 5
sample1, samplerate = sf.read("./data/LibriSpeech/dev-clean/84/121123/84-121123-0002.flac")
# Number of samples to keep = seconds * samples per second.
samples_to_keep = clip_seconds * samplerate
shorter_sample1 = sample1[:samples_to_keep]
clip_path = "./samples/84-121123-0002-5secs.wav"
wavfile.write(clip_path, samplerate, shorter_sample1)
ipd.Audio(clip_path)

And we have indeed successfully cut the audio sample down to 5 seconds!

## Min-Max Normalization for Volume

In [None]:
# Reload the full utterance and refresh its WAV copy as the reference for this section.
source_flac = "./data/LibriSpeech/dev-clean/84/121123/84-121123-0002.flac"
reference_wav = "./samples/84-121123-0002.wav"
sample1, samplerate = sf.read(source_flac)
wavfile.write(reference_wav, samplerate, sample1)

In [None]:
# Amplitude extremes before normalization: largest sample first, then the smallest.
for extreme in (sample1.max(), sample1.min()):
    print(extreme)

In [None]:
# Play the unmodified recording as a loudness reference.
ipd.Audio("./samples/84-121123-0002.wav") 

In [None]:
def min_max_normalize(signal, low, high):
    """Linearly rescale ``signal`` so its minimum maps to ``low`` and its maximum to ``high``.

    Args:
        signal: NumPy array of audio samples.
        low: Value the smallest sample is mapped to.
        high: Value the largest sample is mapped to.

    Returns:
        A new array with the same shape as ``signal`` whose values span
        ``[low, high]``.

    Raises:
        ValueError: If every sample has the same value; the rescaling would
            otherwise divide by zero and silently produce NaN/inf samples.
    """
    smallest = signal.min()
    largest = signal.max()
    if largest == smallest:
        raise ValueError("cannot min-max normalize a constant signal")
    return low + ((signal - smallest) * (high - low)) / (largest - smallest)


# Target range for the normalized signal.
a = -1
b = 1

# Extremes of the raw signal.
max1 = sample1.max()
min1 = sample1.min()

normalized_sample1 = min_max_normalize(sample1, a, b)

In [None]:
# Check the result of the rescaling: largest sample first, then the smallest.
for extreme in (normalized_sample1.max(), normalized_sample1.min()):
    print(extreme)

In [None]:
# Store the normalized signal and embed a player for it.
minmax_path = "./samples/84-121123-0002_minmax.wav"
wavfile.write(minmax_path, samplerate, normalized_sample1)
ipd.Audio(minmax_path)

In [None]:
# Same min-max rescaling, but into a much narrower target range.
a, b = -0.1, 0.1

max1, min1 = sample1.max(), sample1.min()

# Width of the target range and of the raw signal range.
target_span = b - a
signal_span = max1 - min1
normalized_sample1 = a + ((sample1 - min1) * target_span)/signal_span

quiet_path = "./samples/84-121123-0002_very_quiet.wav"
wavfile.write(quiet_path, samplerate, normalized_sample1)
ipd.Audio(quiet_path)

## Merging Two Audio Samples

We start by loading in a `.flac` file, we rewrite it as a `.wav`, and we can then listen to this file via the notebook.

In [None]:
# First signal for the merge: read the FLAC, store it as WAV, and play it.
first_flac = "./data/LibriSpeech/dev-clean/84/121123/84-121123-0002.flac"
first_wav = "./samples/84-121123-0002.wav"
sample1, samplerate = sf.read(first_flac)
wavfile.write(first_wav, samplerate, sample1)
ipd.Audio(first_wav)

There seem to be some silent moments in the audio file, which we can filter out by running the next cell. Note how the length of the sample changes. Although I am not sure if this is really needed in the end product, it is nice to show how it is done.

In [None]:
# Length before and after the zero-sample filter. The filter line is commented
# out, so both prints currently show the same number.
length_before = len(sample1)
print(length_before)
#sample1 = sample1[sample1 != 0.]
length_after = len(sample1)
print(length_after)

In [None]:
# Duration in seconds for two hard-coded sample counts at a 16000 Hz sample rate.
for sample_count in (219040, 218703):
    print(sample_count/16000)

Let's save this shortened sample, and listen to it.

In [None]:
# Write sample1 under the "-shortened" name and play it back.
shortened_path = "./samples/84-121123-0002-shortened.wav"
wavfile.write(shortened_path, samplerate, sample1)
ipd.Audio(shortened_path)

We can do this for a second audio fragment:

In [None]:
# Second signal for the merge: read the FLAC, store it as WAV, and play it.
second_flac = "./data/LibriSpeech/dev-clean/174/50561/174-50561-0006.flac"
second_wav = "./samples/174-50561-0006.wav"
sample2, samplerate = sf.read(second_flac)
wavfile.write(second_wav, samplerate, sample2)
ipd.Audio(second_wav)


In [None]:
# Drop every sample that is exactly 0.0 and show the length before and after.
print(len(sample2))
nonzero_mask2 = sample2 != 0.
sample2 = sample2[nonzero_mask2]
print(len(sample2))

# Store the filtered signal and play it back.
shortened_path2 = "./samples/174-50561-0006-shortened.wav"
wavfile.write(shortened_path2, samplerate, sample2)
ipd.Audio(shortened_path2)

Make sure that the audio files have the same length.

In [None]:
# Truncate the longer of the two signals so both have the same number of samples.
if len(sample1) <= len(sample2):
    sample2 = sample2[:len(sample1)]
else:
    sample1 = sample1[:len(sample2)]

[Audio processing magic](https://stackoverflow.com/questions/4039158/mixing-two-audio-files-together-with-python): add the two signals and divide the sum by two, and let's listen to the result!

In [None]:
# Mix the two equally long signals by averaging them sample by sample.
summed_signal = sample1 + sample2
new_sample = summed_signal/2

In [None]:
# Store the mixed signal and embed a player for it.
merged_path = "./samples/test_merging.wav"
wavfile.write(merged_path, samplerate, new_sample)
ipd.Audio(merged_path)



## Lange termijn:
 - Netwerk

### Compute STFT

In the CountNet paper, STFTs are used.

In [None]:
# Magnitude STFT of both source signals and of their mix, each transposed.
# The same window/hop settings are used for all three.
stft_settings = {"n_fft": 400, "hop_length": 160}
X1, X2, XM = [
    np.abs(librosa.stft(signal, **stft_settings)).T
    for signal in (sample1, sample2, new_sample)
]

In [None]:
# Show the three magnitude spectrograms side by side on a white figure background.
f, axarr = plt.subplots(1,3, figsize=(10,10))
f.patch.set_facecolor('white')
panels = [(X1, 'Sample 1'), (X2, 'Sample 2'), (XM, 'Merged Sample')]
for axis, (spectrogram, title) in zip(axarr, panels):
    axis.imshow(spectrogram)
    axis.set_title(title)
plt.show()

### Loading all filepaths:

In [None]:
# Root folder of the split WAV files; each entry directly below it is treated
# as one speaker by the cells that follow.
splits_dir = './data/wav_splits/'
speakers = list(os.listdir(splits_dir))

In [None]:
# Collect, per speaker, the full path of every file found below that speaker's
# folder, and count the files across all speakers.
files_per_speaker = []
nr_of_files = 0
for speaker in speakers:
    speaker_files = []
    for subdir, dirs, files in os.walk(os.path.join(splits_dir, speaker)):
        for f in files:
            nr_of_files += 1
            # Join with the directory os.walk is currently visiting; prefixing
            # only splits_dir would drop the speaker folder (and any deeper
            # folders) and yield paths that do not exist.
            speaker_files.append(os.path.join(subdir, f))
    files_per_speaker.append(speaker_files)


In [None]:
# Report both counts explicitly: a bare expression is only displayed when it is
# the last statement of a cell, so the speaker count has to be printed.
print("There are {} speakers".format(len(files_per_speaker)))
print("In total, there are {} audio samples".format(nr_of_files))

### Create Dataset?

In [97]:
# Build, for every speaker in the training set, the list of full paths to that
# speaker's .wav files, and count the files across all speakers.
splits_dir = './data/train100/train-clean-100'
speakers = os.listdir(splits_dir)

files_per_speaker = []
nr_of_files = 0
for speaker in speakers:
    speaker_files = []
    for subdir, dirs, files in os.walk(os.path.join(splits_dir, speaker)):
        for f in files:
            if f.endswith('.wav'):
                nr_of_files += 1
                # Join with the directory os.walk is currently visiting;
                # concatenating splits_dir and the file name would drop the
                # speaker folder (and any deeper folders) and yield paths
                # that do not exist.
                speaker_files.append(os.path.join(subdir, f))
    # Speakers without any .wav file are left out so that later sampling never
    # has to pick from an empty list.
    if len(speaker_files) > 0:
        files_per_speaker.append(speaker_files)
    



In [98]:
# Total number of .wav files counted across all speakers.
print(nr_of_files)

59959


In [99]:
# Draw groups of files from distinct speakers until every file has been used.
#
# The pool is a plain list of per-speaker lists. The speakers have different
# numbers of files, and building a NumPy array from such ragged rows is
# rejected by recent NumPy versions. The pool is also a copy, so
# files_per_speaker itself stays intact and this cell can be re-run.
# Speakers without files are skipped because nothing can be drawn from them.
#
# NOTE(review): no random seed is set in this cell, so the drawn combinations
# differ between runs unless a seed is set elsewhere.
remaining_files = [list(speaker_files) for speaker_files in files_per_speaker if len(speaker_files) > 0]
amount_of_datapoints = 0
# One entry per datapoint: the list of file paths that belong together.
combinations = []

i = 1
while len(remaining_files) > 0:
    # Number of speakers that still have unused files.
    number_of_rows = len(remaining_files)

    # Cycle through 0..10 speakers per datapoint, capped at the number of
    # speakers that are still available.
    amount_of_speakers = min(i % 11, number_of_rows)

    random_speaker_ids = np.random.choice(number_of_rows, size=amount_of_speakers, replace=False)

    files_to_merge = []
    ids_to_remove = []
    for speaker_id in random_speaker_ids:
        speaker_files = remaining_files[speaker_id]
        # Pick one random file for this speaker and take it out of the pool,
        # so that no file ends up in more than one datapoint.
        file_index = np.random.randint(len(speaker_files))
        files_to_merge.append(speaker_files.pop(file_index))

        # A speaker whose files are all used must leave the pool, otherwise a
        # later draw would hit an empty list.
        if len(speaker_files) == 0:
            ids_to_remove.append(speaker_id)

    combinations.append(files_to_merge)

    # data = np.zeros(18000)
    ## For all files in files_to_merge:
        # data += files
    # data = data / len(files_to_merge)

    # TODO: save the data to a folder in a way that also records how many speakers it contains
    # E.g. save as: "{}_{}.wav".format(amount_of_speaker, amount_of_datapoints)
    #      or: use a separate folder for each class
    # Maybe write out a .txt with the same name as the audio file, containing random_speaker_ids

    # Delete exhausted speakers from the highest index down, so the remaining
    # indices stay valid while deleting.
    for speaker_id in sorted(ids_to_remove, reverse=True):
        del remaining_files[speaker_id]

    # Increment counter for calculating amount of speakers
    i += 1
    amount_of_datapoints += 1

print("{} unique datapoints created".format(amount_of_datapoints))

12051 unique datapoints created
