# Notice!!!  
When the wav files were converted into melspectrograms in the same way as [WaveGlow](https://github.com/NVIDIA/WaveGlow), the trained model did not generate proper speech.  
Therefore, the wav files are converted into melspectrograms using `librosa.core.power_to_db()` instead of `torch.log()`.  
So, when you generate a wav using `WaveGlow`, you should convert the generated melspectrogram from the dB scale back to the natural-log scale as follows:
```python
melspec = model.inference(sequence)
melspec = torch.log(10**(melspec / 10))
audio = waveglow.inference(melspec)
```

### Import libraries, metadata

In [None]:
import hparams as hp
import os
import librosa
from librosa.filters import mel as librosa_mel_fn
from audio_processing import *
import numpy as np
import pickle as pkl
from text import *
import IPython.display as ipd
from tqdm.notebook import tqdm
from stft import STFT
import matplotlib.pyplot as plt
import torch
import random
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


# Dataset locations, all derived from hp.data_path:
#   metadata.csv  - '|'-separated table without a header row
#   wavs/         - input audio files
#   preprocessed/ - output directory for the pickled features
csv_file = '{}/metadata.csv'.format(hp.data_path)
root_dir = '{}/wavs'.format(hp.data_path)
save_dir = '{}/preprocessed'.format(hp.data_path)

# NOTE(review): pandas applies its default quote handling here; if the
# transcripts contain unbalanced double quotes, rows may be merged.
# Consider quoting=csv.QUOTE_NONE after confirming against the dataset.
landmarks_frame = pd.read_csv(csv_file, header=None, sep='|')

### STFT

In [None]:
class TacotronSTFT(torch.nn.Module):
    """Convert waveforms into dB-scaled mel spectrograms.

    Wraps the project's ``STFT`` module and a librosa mel filterbank.
    """

    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        """Build the STFT transform and the mel filterbank.

        Args:
            filter_length: FFT size, passed to ``STFT`` and used as ``n_fft``
                for the mel filterbank.
            hop_length: hop size passed to ``STFT``.
            win_length: window size passed to ``STFT``.
            n_mel_channels: number of mel bands in the filterbank.
            sampling_rate: sampling rate used to build the filterbank.
            mel_fmin: lowest frequency of the filterbank.
            mel_fmax: highest frequency of the filterbank.
        """
        super(TacotronSTFT, self).__init__()
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Keyword arguments are required here: librosa >= 0.10 made the
        # parameters of librosa.filters.mel keyword-only, and the keyword
        # form is also accepted by older releases.
        mel_basis = librosa_mel_fn(sr=sampling_rate,
                                   n_fft=filter_length,
                                   n_mels=n_mel_channels,
                                   fmin=mel_fmin,
                                   fmax=mel_fmax)
        self.mel_basis = torch.from_numpy(mel_basis).float()

    def wav_to_specs(self, y):
        """Return the dB-scaled mel spectrogram of ``y`` as a NumPy array.

        Args:
            y: waveform tensor with every sample in [-1, 1].
                # assumes shape (batch, samples), as passed by get_mel - TODO confirm

        Returns:
            The result of ``librosa.core.power_to_db`` applied to the
            mel-projected STFT output.

        Raises:
            AssertionError: if any sample lies outside [-1, 1].
        """
        assert torch.min(y.data) >= -1, 'waveform sample below -1'
        assert torch.max(y.data) <= 1, 'waveform sample above 1'

        magnitudes, _ = self.stft_fn.transform(y)
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        # librosa operates on NumPy arrays; convert explicitly instead of
        # relying on implicit tensor -> array coercion.
        melspec = librosa.core.power_to_db(mel_output.detach().cpu().numpy())

        return melspec

# Module-level transform built with the default TacotronSTFT arguments;
# get_mel() uses it for every file.
stft = TacotronSTFT()

### Others

In [None]:
def load_wav_to_torch(full_path, sr=22050, trim_threshold=0.01):
    """Load an audio file, peak-normalize it and trim quiet edges.

    Args:
        full_path: path of the audio file to load.
        sr: sampling rate requested from ``librosa.load``.
        trim_threshold: samples whose normalized absolute amplitude is not
            above this value are cut from the start and the end.

    Returns:
        1-D ``torch.FloatTensor`` spanning the first through the last sample
        whose normalized absolute amplitude exceeds ``trim_threshold``.

    Raises:
        ValueError: if the audio has no positive peak (empty, all-zero or
            NaN samples), or if no sample exceeds ``trim_threshold``.
    """
    wav, _ = librosa.load(full_path, sr=sr)

    # Guard the normalization: a silent file would otherwise divide by zero
    # and fail later with an opaque IndexError.
    peak = np.max(np.abs(wav)) if wav.size else 0.0
    if not peak > 0:
        raise ValueError(
            f'{full_path}: audio is empty or silent, cannot normalize')
    wav = wav / peak

    loud = np.where(np.abs(wav) > trim_threshold)[0]
    if loud.size == 0:
        raise ValueError(
            f'{full_path}: no sample exceeds trim_threshold={trim_threshold}')
    wav = wav[loud[0]:(loud[-1] + 1)]

    return torch.FloatTensor(wav.astype(np.float32))


def get_mel(filename):
    """Compute the mel spectrogram tensor for the audio file ``filename``.

    The waveform from ``load_wav_to_torch`` gets a leading axis, is passed
    through the module-level ``stft`` transform, and the leading axis is
    removed again before the result is wrapped as a torch tensor.
    """
    waveform = load_wav_to_torch(filename)
    batched = waveform.unsqueeze(0)
    mel_db = stft.wav_to_specs(batched)
    unbatched = mel_db.squeeze(0)
    return torch.from_numpy(unbatched)


def save_file(idx):
    """Preprocess metadata row ``idx`` and pickle its features.

    Column 0 of ``landmarks_frame`` is used as the file name (without the
    '.wav' extension) and column 1 as the transcript. The encoded text and
    the mel spectrogram are written to
    ``{save_dir}/sequence/{fname}_sequence.pkl`` and
    ``{save_dir}/melspectrogram/{fname}_melspectrogram.pkl``.

    Args:
        idx: positional row index into ``landmarks_frame``.

    Returns:
        Tuple ``(text, seq, melspec)``: the transcript, its encoded
        ``torch.LongTensor`` and the mel spectrogram tensor.
    """
    fname = landmarks_frame.iloc[idx, 0]
    wav_name = os.path.join(root_dir, fname) + '.wav'
    text = landmarks_frame.iloc[idx, 1]

    seq = torch.LongTensor(text_to_sequence(text, ['english_cleaners']))
    melspec = get_mel(wav_name)

    # The output folders are not guaranteed to exist on a fresh checkout;
    # create them so open() below does not fail with FileNotFoundError.
    os.makedirs(f'{save_dir}/sequence', exist_ok=True)
    os.makedirs(f'{save_dir}/melspectrogram', exist_ok=True)

    with open(f'{save_dir}/sequence/{fname}_sequence.pkl', 'wb') as f:
        pkl.dump(seq, f)
    with open(f'{save_dir}/melspectrogram/{fname}_melspectrogram.pkl', 'wb') as f:
        pkl.dump(melspec, f)

    return text, seq, melspec

### Save and Inspect Data

In [None]:
# Preprocess every row of the metadata table; the row at position `idx`
# is additionally printed and plotted as a sanity check.
idx = 777
for i in tqdm(range(len(landmarks_frame))):
    text, seq, melspec = save_file(i)
    if i != idx:
        continue

    # Each section is a heading line, the value, then two blank lines.
    print("Text:", text, sep="\n", end="\n\n\n")
    print("Phoneme:", sequence_to_text(seq.tolist()), sep="\n", end="\n\n\n")
    print("Sequence:", seq, sep="\n", end="\n\n\n")

    print("Melspectrogram:")
    plt.figure(figsize=(16, 4))
    plt.imshow(melspec, aspect='auto', origin='lower')
    plt.show()