In [6]:
!conda install numpy=1.20

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [10]:
# Импорты
import sys
sys.path.append('tacotron2/')
from tacotron2.hparams import create_hparams
from tacotron2.layers import TacotronSTFT
import librosa
import json
import pandas as pd
import os
from io import BytesIO
from scipy.io.wavfile import write

In [11]:
# Расположение датасета
hifitts_path = 'F:\Разное\Программирование\Школа Data Scientist/hi_fi_tts_v0'

In [14]:
# Оконное преобразование Фурье
# Акустическая модель (tacotron2) и вокодер (waveglow) должны быть обучены на спектрограммах сформированных 
# с одинаковыми параметрами модуля STFT

import warnings
warnings.filterwarnings('ignore')

stft = TacotronSTFT(
    filter_length=1024, 
    hop_length=256, 
    win_length=1024,
    n_mel_channels=80, 
    sampling_rate=22050, 
    mel_fmin=0.0,
    mel_fmax=8000.0
)

In [15]:
def read_json(json_path):
    '''
    Функция отвечает за считывание файлов manifest.json
    '''
    dataset_type = json_path.split('_')[-1].replace('.json', '')
    with open(json_path, encoding='utf-8') as f:
        cond = "[" + f.read().replace("}\n{", "},\n{") + "]"
        json_data = json.loads(cond)
        for item in json_data:
            item['dataset_type'] = dataset_type
    return json_data

In [16]:
def flac_to_mel(load_flac_path, save_mel_path, dataset_type, txt_line):
    '''
    Функция формирует мел-спектрограмму из аудио-файла и сохраняет её
    '''
    
    # Считываем аудио-данные и частоту дискретизации файла (.flac, 44100Hz, pcm-f)
    flac_data, sample_rate = librosa.load(load_flac_path)
    
    # Формируем мел-спектрограмму
    melspec_1 = librosa.feature.melspectrogram(y=flac_data,sr=sample_rate)
    
    # Отсекаем слишком большие спектрограммы
    if melspec_1.shape[1] >= 1000:
        return False
    
    # Записываем информационную строку о текущем элементе в тексовый файл для обучения/валидации модели
    with open('./hifitts/' + dataset_type + '.txt', 'a') as f:
        f.write(txt_line)
        
    # Формируем новое аудио для записи в память
    audio = librosa.feature.inverse.mel_to_audio(melspec_1, sr=sample_rate)
    
    # Буфер памяти (что-бы не сохранять локально)
    buf = BytesIO()
    
    # Запись файла с другими параметрами, нежели были изначально
    # Необходимо, т.к. используется вокодер обученный на аудио с частотой дискретизации = 22050Hz
    # Так-же метод write модуля scipy считывает только wav формат
    # (считанные данные из flac файлов библиотеками librosa и soundfile, почему-то некорректно преобразовывались в 
    # mel-спектрограммы модулем stft)
    write(buf, sample_rate, audio)
    buffered_audio = buf.getvalue()
    buf.close()
    
    # Считываем аудио-данные и частоту дискретизации файла (.wav, 22050Hz, pcm-s)
    buf_data, sr = sf.read(buffered_audio)
    
    # Преобразовываем в тензор
    floated_data = torch.FloatTensor(buf_data.astype(np.float32))
    
    # Формирование мел-спектрограммы
    norm_data = floated_data / hp.max_wav_value
    norm_data = norm_data.unsqueeze(0)
    norm_data = torch.autograd.Variable(norm_data, requires_grad=False)
    melspec_2 = stft.mel_spectrogram(norm_data)
    melspec_2 = torch.squeeze(melspec_2, 0)
    
    # Сохранение файла
    np.save(save_mel_path, melspec_2)

In [21]:
# Формирование единого датафрейма по всем manifest-файлам .json 
manifests = [manifest for manifest in os.listdir(hifitts_path) if 'manifest' in manifest]
manifest_paths = [f'{hifitts_path}/{manifest}' for manifest in manifests]
manifest_jsons = [read_json(manifest_path) for manifest_path in manifest_paths]
manifest_dfs = [pd.DataFrame(manifest_json) for manifest_json in manifest_jsons]
manifests_df = pd.concat(manifest_dfs, axis=0)
manifests_df.head()

Unnamed: 0,audio_filepath,text,duration,text_no_preprocessing,text_normalized,dataset_type
0,audio/11614_other/12352/prideofjennico_01_cast...,some decision,1.03,"some decision,","some decision,",dev
1,audio/11614_other/12352/prideofjennico_01_cast...,i fear me that those around him then did not f...,7.96,I fear me that those around him then did not f...,I fear me that those around him then did not f...,dev
2,audio/11614_other/12352/prideofjennico_02_cast...,to keep myself something in countenance despit...,10.86,To keep myself something in countenance despit...,To keep myself something in countenance despit...,dev
3,audio/11614_other/12352/prideofjennico_03_cast...,under my gaze,1.06,"under my gaze,","under my gaze,",dev
4,audio/11614_other/12352/prideofjennico_04_cast...,in the vineyards,0.93,"In the vineyards,","In the vineyards,",dev


In [22]:
manifests_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 323978 entries, 0 to 35145
Data columns (total 6 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   audio_filepath         323978 non-null  object 
 1   text                   323978 non-null  object 
 2   duration               323978 non-null  float64
 3   text_no_preprocessing  323978 non-null  object 
 4   text_normalized        323978 non-null  object 
 5   dataset_type           323978 non-null  object 
dtypes: float64(1), object(5)
memory usage: 17.3+ MB


In [23]:
df = manifests_df.reset_index(drop=True).copy()

# Формирование колонки с нормализованным id диктора (от 0 до 9)
df['reader_id'] = df['audio_filepath'].apply(lambda x: x.split('/')[1].split('_')[0])
readers_list = [reader_id for reader_id in df.reader_id.unique()]
readers_dict = {reader_id: str(readers_list.index(reader_id)) for reader_id in readers_list}
df['reader_id_norm'] = df['reader_id'].apply(lambda x: readers_dict[x])
df.head()

Unnamed: 0,audio_filepath,text,duration,text_no_preprocessing,text_normalized,dataset_type,reader_id,reader_id_norm
0,audio/11614_other/12352/prideofjennico_01_cast...,some decision,1.03,"some decision,","some decision,",dev,11614,0
1,audio/11614_other/12352/prideofjennico_01_cast...,i fear me that those around him then did not f...,7.96,I fear me that those around him then did not f...,I fear me that those around him then did not f...,dev,11614,0
2,audio/11614_other/12352/prideofjennico_02_cast...,to keep myself something in countenance despit...,10.86,To keep myself something in countenance despit...,To keep myself something in countenance despit...,dev,11614,0
3,audio/11614_other/12352/prideofjennico_03_cast...,under my gaze,1.06,"under my gaze,","under my gaze,",dev,11614,0
4,audio/11614_other/12352/prideofjennico_04_cast...,in the vineyards,0.93,"In the vineyards,","In the vineyards,",dev,11614,0


In [24]:
# Формирование строки текстового файла по которому модель будет обучаться/валидироваться
df['mel_path'] = 'mels/' + df.index.astype('string') + '_' + df['dataset_type'] + '_' + df['reader_id']
df['txt_line'] = df['mel_path'] + '|' + df['text'] + '|' + df['reader_id_norm'] + '\n'
df.head()

Unnamed: 0,audio_filepath,text,duration,text_no_preprocessing,text_normalized,dataset_type,reader_id,reader_id_norm,mel_path,txt_line
0,audio/11614_other/12352/prideofjennico_01_cast...,some decision,1.03,"some decision,","some decision,",dev,11614,0,mels/0_dev_11614,mels/0_dev_11614|some decision|0\n
1,audio/11614_other/12352/prideofjennico_01_cast...,i fear me that those around him then did not f...,7.96,I fear me that those around him then did not f...,I fear me that those around him then did not f...,dev,11614,0,mels/1_dev_11614,mels/1_dev_11614|i fear me that those around h...
2,audio/11614_other/12352/prideofjennico_02_cast...,to keep myself something in countenance despit...,10.86,To keep myself something in countenance despit...,To keep myself something in countenance despit...,dev,11614,0,mels/2_dev_11614,mels/2_dev_11614|to keep myself something in c...
3,audio/11614_other/12352/prideofjennico_03_cast...,under my gaze,1.06,"under my gaze,","under my gaze,",dev,11614,0,mels/3_dev_11614,mels/3_dev_11614|under my gaze|0\n
4,audio/11614_other/12352/prideofjennico_04_cast...,in the vineyards,0.93,"In the vineyards,","In the vineyards,",dev,11614,0,mels/4_dev_11614,mels/4_dev_11614|in the vineyards|0\n


In [32]:
# Оставляем только необходимые колонки
df = df[['dataset_type', 'reader_id', 'reader_id_norm', 'text', 'audio_filepath', 'mel_path', 'txt_line']]

# Оставляем только тестовую и тренеровочную выборки
df = df[df['dataset_type'] != 'dev']
df.head()

Unnamed: 0,dataset_type,reader_id,reader_id_norm,text,audio_filepath,mel_path,txt_line
50,test,11614,0,our good mother who would not be the true woma...,audio/11614_other/12352/prideofjennico_01_cast...,mels/50_test_11614,mels/50_test_11614|our good mother who would n...
51,test,11614,0,i by no means deemed it incumbent upon me to s...,audio/11614_other/12352/prideofjennico_02_cast...,mels/51_test_11614,mels/51_test_11614|i by no means deemed it inc...
52,test,11614,0,and yet would i now undo the past if i could,audio/11614_other/12352/prideofjennico_02_cast...,mels/52_test_11614,mels/52_test_11614|and yet would i now undo th...
53,test,11614,0,will you not continue the ceremony,audio/11614_other/12352/prideofjennico_02_cast...,mels/53_test_11614,mels/53_test_11614|will you not continue the c...
54,test,11614,0,is masquerading a condition of tenure,audio/11614_other/12352/prideofjennico_02_cast...,mels/54_test_11614,mels/54_test_11614|is masquerading a condition...


In [37]:
# Создание директории для записи файлов
os.mkdir('./hifitts')
os.mkdir('./hifitts/mels')

tmp_df = df.copy()

In [38]:
# Формирование колонки со "строкой-параметрами" для передачи в виде аргумента в функцию
tmp_df['line_for_create_mel'] = \
    tmp_df['audio_filepath'] + '&' + \
    tmp_df['mel_path'] + '&' + \
    tmp_df['dataset_type'] + '&' + \
    tmp_df['txt_line']

In [39]:
# Создание мелспектрограмм
tmp_df['line_for_create_mel'].apply(lambda x: flac_to_mel(
    x.split('&')[0], 
    x.split('&')[1], 
    x.split('&')[2],
    x.split('&')[3],
))

FileNotFoundError: [Errno 2] No such file or directory: 'audio/11614_other/12352/prideofjennico_01_castle_0077.flac'