In [1]:
from datasets import load_dataset

# load from parquet file (~4000 samples in a parquet file)
# link to other parquet files: https://huggingface.co/datasets/linhtran92/viet_bud500/tree/main/data

train_url = "https://huggingface.co/datasets/linhtran92/viet_bud500/resolve/main/data/train-00000-of-00105-be5f872f8be772f5.parquet"
test_url = "https://huggingface.co/datasets/linhtran92/viet_bud500/resolve/main/data/test-00000-of-00002-531c1d81edb57297.parquet"

data_files = {"train": train_url, "test" : test_url}
dataset = load_dataset("parquet", data_files=data_files, )

# # load dataset via streaming
# dataset = load_dataset("linhtran92/viet_bud500", split='test', streaming=True)
# dataset.take(2)

# # load all (649158 samples, ~100gb, ~2hrs to complete loading)
# dataset = load_dataset("linhtran92/viet_bud500", split="test")

In [2]:
import soundfile as sf

In [3]:
def convert_waveform_to_audio(waveform_list, sampling_rate=16000, path_to_audio='',prefix='train'):
    count = 0
    for waveform in waveform_list:
        print(waveform)
        sf.write(f'{path_to_audio}{prefix}_{str(count).zfill(10)}.wav', waveform['array'], sampling_rate)
        count += 1
    print('saved all audio files!') 
sampling_rate = 16000
# for audio in dataset['train']['audio']:
#     sf.write(f'../dataset/audio/train_24k/train24k_{str(count).zfill(10)}.wav', audio['array'], sampling_rate)
#     count += 1
# count = 0
# for audio in dataset['test']['audio']:
#     sf.write(f'../dataset/audio/test_24k/test24k_{str(count).zfill(10)}.wav', audio['array'], sampling_rate)
#     count += 1

In [4]:
convert_waveform_to_audio(dataset['train']['audio'], sampling_rate,path_to_audio='../dataset/audio/train_16k/',prefix='train16k')
convert_waveform_to_audio(dataset['test']['audio'], sampling_rate,path_to_audio='../dataset/audio/test_16k/',prefix='test16k')

{'path': None, 'array': array([ 0.04827881,  0.08624268,  0.08853149, ..., -0.08074951,
       -0.08364868, -0.08636475]), 'sampling_rate': 16000}
{'path': None, 'array': array([-0.01934814,  0.00234985,  0.00598145, ...,  0.39654541,
        0.39630127,  0.38327026]), 'sampling_rate': 16000}
{'path': None, 'array': array([-0.23605347, -0.40600586, -0.36453247, ..., -0.00982666,
        0.01174927,  0.0322876 ]), 'sampling_rate': 16000}
{'path': None, 'array': array([ 0.11477661,  0.24417114,  0.1333313 , ..., -0.01937866,
       -0.0329895 , -0.04257202]), 'sampling_rate': 16000}
{'path': None, 'array': array([0.00125122, 0.00228882, 0.00213623, ..., 0.00354004, 0.00442505,
       0.00650024]), 'sampling_rate': 16000}
{'path': None, 'array': array([ 0.09945679,  0.17468262,  0.1647644 , ..., -0.00500488,
       -0.00610352, -0.00543213]), 'sampling_rate': 16000}
{'path': None, 'array': array([ 0.01116943,  0.037323  ,  0.02651978, ..., -0.04766846,
       -0.05358887, -0.06271362]), '

In [5]:
import phonemizer
global_phonemizer = phonemizer.backend.EspeakBackend(language='vi', preserve_punctuation=True,  with_stress=True)

In [6]:
from phonemizer import phonemize
text = "Xin chào, tôi là một lập trình viên."

# Sử dụng phonemizer để chuyển đổi văn bản thành IPA
ipa_text = phonemize(text, language='vi', backend='espeak', strip=True, preserve_punctuation=True)

print(ipa_text)

sin tʃaː2w, t̪oj laː2 mo6t̪ lə6p tʃi2ɲ viɛn.


In [7]:
def create_data_list(transcription_list, path_to_audio, path_to_list, prefix, language):
    count = 0
    global_phonemizer = phonemizer.backend.EspeakBackend(language=language, preserve_punctuation=True,  with_stress=True)
    with open(path_to_list, 'w') as f:
        for text in transcription_list:
            text = text.strip()
            ps = global_phonemizer.phonemize([text])
            f.write(f'{path_to_audio}/{prefix}_{str(count).zfill(10)}.wav|{ps[0]}|0\n')
            count += 1

# with open('../dataset/train_list_24k.txt', 'w') as f:
#     count = 0
#     for text in dataset['train']['transcription']:
#         text = text.strip()
#         ps = global_phonemizer.phonemize([text])
#         f.write(f'train_24k/train24k_{str(count).zfill(10)}.wav|{ps[0]}|0\n')
#         count += 1
# with open('../dataset/test_list_24k.txt', 'w') as f:
#     count = 0
#     for text in dataset['test']['transcription']:
#         text = text.strip()
#         ps = global_phonemizer.phonemize([text])
#         f.write(f'test_24k/test24k_{str(count).zfill(10)}.wav|{ps[0]}|0\n')
#         count += 1

In [8]:
import os
def prepare_data(sampling_rate, waveform_list, transcription_list, output_path, path_to_list, prefix, language):
    #create folder for audio files
    os.makedirs(output_path, exist_ok=True)
    convert_waveform_to_audio(waveform_list, sampling_rate, output_path, prefix)
    create_data_list(transcription_list, output_path, path_to_list, prefix, language=language)


In [10]:
# test function
prepare_data(16000, dataset['train']['audio'], dataset['train']['transcription'],
             '../dataset/train_16k/', '../dataset/train_list_16k.txt', 'train16k', 'vi')
prepare_data(16000, dataset['test']['audio'], dataset['test']['transcription'],
             '../dataset/test_16k/', '../dataset/test_list_16k.txt', 'test16k', 'vi')

{'path': None, 'array': array([ 0.04827881,  0.08624268,  0.08853149, ..., -0.08074951,
       -0.08364868, -0.08636475]), 'sampling_rate': 16000}
{'path': None, 'array': array([-0.01934814,  0.00234985,  0.00598145, ...,  0.39654541,
        0.39630127,  0.38327026]), 'sampling_rate': 16000}
{'path': None, 'array': array([-0.23605347, -0.40600586, -0.36453247, ..., -0.00982666,
        0.01174927,  0.0322876 ]), 'sampling_rate': 16000}
{'path': None, 'array': array([ 0.11477661,  0.24417114,  0.1333313 , ..., -0.01937866,
       -0.0329895 , -0.04257202]), 'sampling_rate': 16000}
{'path': None, 'array': array([0.00125122, 0.00228882, 0.00213623, ..., 0.00354004, 0.00442505,
       0.00650024]), 'sampling_rate': 16000}
{'path': None, 'array': array([ 0.09945679,  0.17468262,  0.1647644 , ..., -0.00500488,
       -0.00610352, -0.00543213]), 'sampling_rate': 16000}
{'path': None, 'array': array([ 0.01116943,  0.037323  ,  0.02651978, ..., -0.04766846,
       -0.05358887, -0.06271362]), '