In [None]:
### Dependencies (uncomment the lines below to install them)
#! pip install SoundFile
#! pip install pandas
#! pip install torchaudio
#! pip install matplotlib
#! pip install pyaudio 
#! pip install pydub

In [None]:
# Imports
import wave
import tarfile

# As
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# From
from pathlib import Path
from pydub import AudioSegment

# Pytorch
import torchaudio

# Internals|helpers
from helpers import play_audio, extract_metadata 

In [None]:
# CONFIG
# Keep False: when True, the playback cell also combines FILE and FILE2 into
# a single recording (testing purposes only).
COMBINED = False

# Files
FILE = Path('LJ025-0076.wav')
FILE2 = Path('LJ025-0073.wav')
METADATA = Path('metadata.csv')

# Folders
FOLDER_LJ = Path('data/LJSpeech-1.1/')
FOLDER_WAV = Path('data/LJSpeech-1.1/wavs')

# Select the "soundfile" backend where torchaudio still offers a global
# backend switch. set_audio_backend is deprecated in torchaudio 2.x and may be
# missing from newer releases, so the call is guarded instead of letting the
# whole config cell fail with AttributeError.
if hasattr(torchaudio, "set_audio_backend"):
    torchaudio.set_audio_backend("soundfile")

In [None]:
# Extract data from LJSpeech (https://keithito.com/LJ-Speech-Dataset/)
# The archive is only unpacked when the wavs folder does not exist yet.
if not FOLDER_WAV.is_dir():
    with tarfile.open("data/LJSpeech-1.1.tar.bz2", "r:bz2") as tar:
        # Use the "data" extraction filter where this Python provides it: it
        # refuses members that would end up outside the destination folder.
        # Older interpreters do not accept the keyword, so they keep the
        # plain call.
        if hasattr(tarfile, "data_filter"):
            tar.extractall("data/", filter="data")
        else:
            tar.extractall("data/")

# Extracting metadata
df_meta = extract_metadata(FOLDER_LJ / METADATA)
df_meta.head()

In [None]:
# Find all characters in the LJSpeech
# A set comprehension collects every distinct character in a single pass over
# the transcripts, instead of de-duplicating a list after each append.
unique_characters = {character for text in df_meta['text'] for character in text}

# Sort the distinct characters and join them into one string
charset = ''.join(sorted(unique_characters))
charset

In [None]:
# Look into the data
# Length (in characters) of every transcript; the largest one is reported
text_lengths = list(map(len, df_meta['text']))
characters_max = np.max(text_lengths)
print(f'Max characters: {characters_max}')

# The value printed here is a literal, it is not derived from df_meta
print(f'Max length for LJSpeech-1.1: {1114}') # Hardcoded

In [None]:
# Take a deeper look at an example file
# Load FILE from the wavs folder and keep a NumPy copy of the tensor
example_path = FOLDER_WAV / FILE
waveform, sample_rate = torchaudio.load(example_path)
waveform_np = waveform.numpy()

# Plot row 0 of the array against its positions 0..N-1 along the second axis
sample_positions = np.arange(waveform_np.shape[1])
plt.plot(sample_positions, waveform_np[0]);

In [None]:
# Create spectrogram
# Mel spectrogram transform with 70 mel bins, applied to the loaded waveform
transform = torchaudio.transforms.MelSpectrogram(sample_rate, n_mels=70)
mel_spectrogram = transform(waveform)

# Find the [FILE] label matching the spectrogram:
# collect [row position, filename] pairs whose filename contains FILE's stem,
# then take the metadata row of the first match.
idx = [
    [row, filename]
    for row, filename in enumerate(df_meta['filenames'])
    if FILE.stem in filename
]
first_match_row = idx[0][0]
df_sample = df_meta.iloc[first_match_row]

# Natural log of the entry at index 0 of the spectrogram's leading dimension
mel_spectrogram_log = np.log(mel_spectrogram[0])

# Plot spectrogram, titled with the transcript of the matched row
fig = plt.figure(figsize=(15, 10))
plt.title(df_sample['text'], loc='left')
plt.imshow(mel_spectrogram_log, cmap='viridis');

In [None]:
# Play the audio file
# NOTE(review): the wave handles opened in this cell are not closed here;
# confirm whether play_audio closes them once playback is done.
f = wave.open(str(FOLDER_WAV / FILE), "rb")
play_audio(f)

# Testing aid only: combine FILE and FILE2, write the result to disk, play it
if COMBINED:
    wav_1 = AudioSegment.from_wav(str(FOLDER_WAV / FILE))
    wav_2 = AudioSegment.from_wav(str(FOLDER_WAV / FILE2))

    combined_sounds = wav_1 + wav_2

    # One path shared by the export and the re-open, so the file that is
    # written is always the file that is played back.
    combined_path = "data/hax.wav"
    combined_sounds.export(combined_path, format="wav")
    f_combined = wave.open(combined_path, "rb")

    play_audio(f_combined)