In [1]:
#data_root = '/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data'
#create_meta_file('wav_data', data_root, "train.txt", 16000, 1102, spectrograms=True, phonemes=False)

In [None]:
def create_meta_file(dataset_name, dataset_root_dir, output_metafile_name, audio_sample_rate, num_fft_freqs, spectrograms=True, phonemes=False):
    """Create the meta-file and, optionally, spectrograms (mel and linear) and phonemized utterances.

    Format details:
        Every line of the metadata file contains info about one dataset item.
        The line has the following format
            'id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text'
        And the following must hold
            'audio_file_path' can be empty if loading just spectrograms
            'text' should be carefully normalized and should contain punctuation
            'phonemized_text' can be empty if loading just raw text

    Arguments:
        dataset_name (string): Name of the dataset, loaders.py should contain a function for loading with a corresponding name.
        dataset_root_dir (string): Root directory from which the dataset is built and to which the spectrograms
            and the meta-file are saved.
        output_metafile_name (string): Name of the output meta-file (created inside `dataset_root_dir`).
        audio_sample_rate (int): Sample rate of audios, used if spectrograms is set True.
        num_fft_freqs (int): Number of frequency bands used during spectrogram computation, used if spectrograms is set True.
    Keyword arguments:
        spectrograms (boolean, default True): If true, spectrograms (both mel and linear) are computed and saved
            into the 'spectrograms' and 'linear_spectrograms' sub-directories of `dataset_root_dir`.
        phonemes (boolean, default False): If true, phonemized variants of utterances are computed and saved.

    Notes:
        Items whose audio cannot be loaded are reported on stdout and left out of the meta-file;
        their ids are not reused, so the ids in the meta-file may contain gaps.
    """
    # Remember the current sample rate and fft freqs hyperparameters, as we may
    # process a dataset with a different sample rate.
    if spectrograms:
        old_sample_rate = hp.sample_rate
        old_fft_freqs = hp.num_fft

    try:
        if spectrograms:
            hp.sample_rate = audio_sample_rate
            hp.num_fft = num_fft_freqs

        # Load metafiles, an item is a list like: [text, audiopath, speaker_id, language_code].
        items = dataset.loaders.get_loader_by_name(dataset_name)(dataset_root_dir)

        # Build dictionaries for translation to IPA from source languages, see utils.text for details.
        if phonemes:
            text_lang_pairs = [(i[0], hp.languages[0] if i[3] == "" else i[3]) for i in items]
            phoneme_dicts = text.build_phoneme_dicts(text_lang_pairs)

        # Prepare directories which will store spectrograms (index 0: mel, index 1: linear).
        if spectrograms:
            spectrogram_dirs = [os.path.join(dataset_root_dir, 'spectrograms'),
                                os.path.join(dataset_root_dir, 'linear_spectrograms')]
            for directory in spectrogram_dirs:
                os.makedirs(directory, exist_ok=True)

        # Iterate through items and build the meta-file.
        metafile_path = os.path.join(dataset_root_dir, output_metafile_name)
        num_items = len(items)
        with open(metafile_path, 'w', encoding='utf-8') as f:
            Logger.progress(0, prefix='Building metafile:')
            for i, (raw_text, audio_path, speaker, language) in enumerate(items):
                item_id = str(i).zfill(6)
                if language == "":
                    language = hp.languages[0]

                # Without spectrograms the two spectrogram columns stay empty ('|' is their separator).
                spectrogram_paths = "|"
                if spectrograms:
                    spec_name = f'{item_id}.npy'
                    try:
                        audio_data = audio.load(os.path.join(dataset_root_dir, audio_path))
                    except Exception as error:
                        # Best effort: an unreadable audio file must not abort the whole run.
                        print(f'Skipping {audio_path}: {error}')
                        Logger.progress((i + 1) / num_items, prefix='Building metafile:')
                        continue
                    np.save(os.path.join(spectrogram_dirs[0], spec_name), audio.spectrogram(audio_data, True))
                    np.save(os.path.join(spectrogram_dirs[1], spec_name), audio.spectrogram(audio_data, False))
                    # Paths in the meta-file are relative to the dataset root directory.
                    spectrogram_paths = os.path.join('spectrograms', spec_name) + '|' + os.path.join('linear_spectrograms', spec_name)

                phonemized_text = text.to_phoneme(raw_text, False, language, phoneme_dicts[language]) if phonemes else ""
                print(f'{item_id}|{speaker}|{language}|{audio_path}|{spectrogram_paths}|{raw_text}|{phonemized_text}', file=f)
                Logger.progress((i + 1) / num_items, prefix='Building metafile:')
    finally:
        # Restore the original sample rate and fft freq values, even if processing failed.
        if spectrograms:
            hp.sample_rate = old_sample_rate
            hp.num_fft = old_fft_freqs

In [5]:
import sys
import os
import IPython
from IPython.display import Audio

In [6]:
# Work from the user's home directory; the repository directory names below are relative to it.
os.chdir(os.path.expanduser("~"))
    
# Directory name of the Multilingual_Text_to_Speech checkout. The clone step is
# commented out, so the directory is not created here; a later cell chdirs into it.
tacotron_dir = "Multilingual_Text_to_Speech"
#if not os.path.exists(tacotron_dir):
#  ! git clone https://github.com/dina-adel/Multilingual_Text_to_Speech

# Clone the WaveRNN repository only when no entry with that name exists yet,
# which makes re-running this cell a no-op.
wavernn_dir = "WaveRNN"
if not os.path.exists(wavernn_dir):
  ! git clone https://github.com/Tomiinek/$wavernn_dir

In [7]:
# Create the checkpoints directory (relative to the current working directory)
# and switch into ~/checkpoints so that `curl -O` saves the files there.
! mkdir -p checkpoints
os.chdir(os.path.join(os.path.expanduser("~"), "checkpoints"))

# Download the Tacotron checkpoint from the v1.0 release unless it is already present.
tacotron_chpt = "generated_switching.pyt"
if not os.path.exists(os.path.join(os.path.expanduser("~"), "checkpoints", tacotron_chpt)):
  ! curl -O -L "https://github.com/Tomiinek/Multilingual_Text_to_Speech/releases/download/v1.0/$tacotron_chpt" 

# Download the WaveRNN checkpoint from the same release unless it is already present.
wavernn_chpt = "wavernn_weight.pyt"
if not os.path.exists(os.path.join(os.path.expanduser("~"), "checkpoints", wavernn_chpt)):
  ! curl -O -L "https://github.com/Tomiinek/Multilingual_Text_to_Speech/releases/download/v1.0/$wavernn_chpt"     

# Return to the home directory.
os.chdir(os.path.expanduser("~"))

In [9]:
# Python dependencies. `%pip` installs into the environment of the running kernel,
# whereas `!pip` runs whichever pip happens to be first on the shell PATH.
%pip install soundfile
%pip install phonemizer
%pip install epitran
# System packages. `-y` confirms the installation, since a notebook cell cannot
# answer apt's prompt. apt-get needs root privileges: without them it fails with
# "Could not open lock file /var/lib/dpkg/lock-frontend", so on a machine without
# root access these packages have to be installed outside the notebook.
! apt-get install -y festival espeak-ng mbrola

E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


In [11]:
# Switch into the Multilingual_Text_to_Speech checkout under the home directory;
# presumably so that the project packages imported below (dataset, params, utils,
# modules) resolve from the working directory — TODO confirm.
os.chdir(os.path.join(os.path.expanduser("~"), tacotron_dir))
# Drop an already-imported top-level `utils` module so the `from utils import ...`
# lines below import it afresh (presumably to avoid reusing a different `utils`
# loaded earlier in the same kernel — TODO confirm).
if "utils" in sys.modules: del sys.modules["utils"]
import os
import time
import datetime
import math
import numpy as np
import torch
from torch.utils.data import DataLoader

import dataset.loaders
from dataset.dataset import TextToSpeechDatasetCollection, TextToSpeechCollate, TextToSpeechDataset
from params.params import Params as hp
from utils import audio, text
from modules.tacotron2 import Tacotron, TacotronLoss
from utils.logging import Logger
from utils.samplers import RandomImbalancedSampler, PerfectBatchSampler
from utils import lengths_to_mask, to_gpu

In [14]:
# Launch training through the repository's train.py with the `ar` hyper-parameter set.
# All paths are absolute and specific to this machine (/home/ires).
# NOTE(review): the captured run below ends with a FileNotFoundError for
# data/wav_data/spectrograms/000000.npy, i.e. the spectrograms were not found under the data root.
!python3 /home/ires/Multilingual_Text_to_Speech/train.py --base_directory /home/ires/Multilingual_Text_to_Speech --hyper_parameters ar --data_root /home/ires/Multilingual_Text_to_Speech/data

item is:  {'id': '000000', 'speaker': '1', 'language': 'ar', 'audio': '/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data/1/1-1.wav ', 'spectrogram': 'spectrograms/000000.npy', 'linear_spectrogram': 'linear_spectrograms/000000.npy', 'text': 'ارتفعت الأسهم الكويتية ارتفاعا طفيفا اليوم السبت', 'phonemes': ''}
speakers:  {'1'}
item is:  {'id': '000003', 'speaker': '2', 'language': 'ar', 'audio': '/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data/2/2-01.wav ', 'spectrogram': 'spectrograms/000003.npy', 'linear_spectrogram': 'linear_spectrograms/000003.npy', 'text': 'مجال الإصلاحات السياسية', 'phonemes': ''}
speakers:  {'2', '1'}
item is:  {'id': '000038', 'speaker': '3', 'language': 'ar', 'audio': '/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data/3/3-1.wav ', 'spectrogram': 'spectrograms/000038.npy', 'linear_spectrogram': 'linear_spectrograms/000038.npy', 'text': 'توج المنتخب التونسي للمرة السابعة'

speakers:  {'80', '275', '928', '49', '141', '12', '45', '37', '42', '124', '236', '303', '253', '111', '277', '112', '94', '130', '248', '300', '907', '151', '223', '931', '930', '93', '99', '217', '269', '84', '97', '251', '925', '254', '263', '929', '66', '18', '82', '149', '228', '237', '250', '290', '208', '258', '211', '103', '904', '160', '138', '239', '34', '108', '125', '153', '218', '291', '939', '913', '134', '22', '36', '129', '156', '225', '268', '3', '270', '122', '224', '142', '1', '234', '85', '204', '900', '114', '133', '943', '73', '126', '212', '261', '136', '210', '264', '272', '157', '123', '203', '47', '105', '221', '222', '298', '121', '118', '41', '39', '107', '255', '257', '265', '922', '233', '295', '100', '219', '924', '940', '67', '79', '213', '938', '11', '113', '98', '116', '932', '8', '252', '256', '906', '934', '75', '161', '88', '86', '216', '226', '301', '72', '44', '271', '4', '278', '119', '146', '220', '48', '273', '76', '89', '7', '260', '941', '27

Traceback (most recent call last):
  File "/home/ires/Multilingual_Text_to_Speech/train.py", line 248, in <module>
    hp.mel_normalize_mean, hp.mel_normalize_variance = dataset.train.get_normalization_constants(True)
  File "/home/ires/Multilingual_Text_to_Speech/dataset/dataset.py", line 178, in get_normalization_constants
    spectrogram = self.load_spectrogram(item['audio'], path, False, is_mel)
  File "/home/ires/Multilingual_Text_to_Speech/dataset/dataset.py", line 155, in load_spectrogram
    spectrogram = np.load(full_spec_path)
  File "/home/ires/miniconda3/envs/sar/lib/python3.6/site-packages/numpy/lib/npyio.py", line 416, in load
    fid = stack.enter_context(open(os_fspath(file), "rb"))
FileNotFoundError: [Errno 2] No such file or directory: '/home/ires/Multilingual_Text_to_Speech/data/wav_data/spectrograms/000000.npy'


In [None]:
# List the contents of the current working directory.
!ls 

checkpoints    dataset		_img	    notebooks	      synthesize.py
CODE.md        dataset_prepare	LICENSE.md  params	      train.py
data	       evaluation	logs	    README.md	      utils
data_file.txt  gta.py		modules     requirements.txt
