In [1]:
import os
import librosa
import soundfile as sf
import matplotlib.pyplot as plt
from datasets import load_dataset, DatasetDict
from tqdm.notebook import tqdm

# Подготовка данных для обучения
Выгружаем из датасета данные по трём языкам, которые будем идентифицировать (русский, английский, немецкий).

In [2]:
# Short language code -> value passed as the second argument of load_dataset
# for "PolyAI/minds14".
lang_dict = dict(ru="ru-RU", en="en-US", de="de-DE")

def get_ds_splits(lang: str = "ru"):
    """Load one language subset of PolyAI/minds14 and cut it into three parts.

    The ``train`` split of the loaded dataset is first divided with
    ``test_size=0.2``; the held-out part is then divided again with
    ``test_size=0.5``. Both steps shuffle with ``seed=42``.

    Args:
        lang: Key of ``lang_dict`` selecting which subset to load.

    Returns:
        A ``DatasetDict`` with the keys ``train``, ``valid`` and ``test``.

    Raises:
        KeyError: If ``lang`` is not a key of ``lang_dict``.
    """
    source = load_dataset("PolyAI/minds14", lang_dict[lang])["train"]

    # First cut: training part vs. hold-out.
    first_cut = source.train_test_split(test_size=0.2, seed=42, shuffle=True)
    # Second cut: the hold-out is halved into validation and test parts.
    second_cut = first_cut["test"].train_test_split(test_size=0.5, seed=42, shuffle=True)

    return DatasetDict(
        train=first_cut["train"],
        valid=second_cut["train"],
        test=second_cut["test"],
    )


In [3]:
# Build the train/valid/test splits for each language, in the order ru, en, de.
ds_splits_ru, ds_splits_en, ds_splits_de = (
    get_ds_splits(lang=code) for code in ("ru", "en", "de")
)

Функции для предобработки данных:
- Сохранение аудиозаписи 
- Сохранение транскрипции по аудиозаписи
- Построение и сохранение спектрограммы

In [4]:
# Working directory captured when this cell runs. check_exists_path no longer
# needs it; the name is kept in case other cells still refer to it.
current_dir = os.getcwd()

def check_exists_path(path):
    """Make sure the directory ``path`` exists, creating parents as needed.

    ``path`` is used exactly as given (relative paths resolve against the
    current working directory), which is the same location the ``save_*``
    helpers later write to. Calling this for an existing directory is a no-op.

    Args:
        path: Directory to create; may be relative or absolute.

    Raises:
        OSError: If the directory cannot be created (e.g. ``path`` exists
            but is a regular file).
    """
    # exist_ok=True replaces the separate exists() check, so the tested and
    # the created location can never differ and there is no check/create race.
    os.makedirs(path, exist_ok=True)

In [5]:
def save_audio(y, sr, path, name):
    """Write the samples ``y`` to ``{path}/{name}.wav`` with subtype PCM_16.

    Args:
        y: Audio data handed to ``soundfile.write`` as ``data``.
        sr: Sampling rate handed to ``soundfile.write`` as ``samplerate``.
        path: Target directory; passed to ``check_exists_path`` before writing.
        name: File name without the ``.wav`` extension.
    """
    check_exists_path(path=path)
    wav_file = f"{path}/{name}.wav"
    sf.write(file=wav_file, data=y, samplerate=sr, subtype="PCM_16")

In [6]:
def save_transcription(transcription, path, name):
    """Store ``transcription`` as UTF-8 text in ``{path}/{name}.txt``.

    Args:
        transcription: Text to write; an existing file is overwritten.
        path: Target directory; passed to ``check_exists_path`` before writing.
        name: File name without the ``.txt`` extension.
    """
    check_exists_path(path=path)
    target = f"{path}/{name}.txt"
    with open(target, mode="w", encoding="utf-8") as out:
        out.write(transcription)

In [7]:
def save_spectrogram_image(y, sr, path, name):
    """Render a dB-scaled mel spectrogram of ``y`` and save it as a PNG.

    The image is written to ``{path}/{name}.png`` from a 1x1 inch figure.

    Args:
        y: Audio samples passed to ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``y``.
        path: Target directory; passed to ``check_exists_path`` before writing.
        name: File name without the ``.png`` extension.
    """
    # Function-scope import: older librosa releases do not expose the
    # ``display`` submodule after a bare ``import librosa``.
    import librosa.display

    check_exists_path(path=path)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    # melspectrogram returns a power spectrogram by default (power=2.0), so the
    # matching conversion is power_to_db; amplitude_to_db would treat the
    # values as magnitudes and square them a second time.
    log_S = librosa.power_to_db(S)

    fig, ax = plt.subplots(figsize=(1, 1))
    try:
        librosa.display.specshow(log_S, sr=sr, ax=ax)
        fig.savefig(f"{path}/{name}.png")
    finally:
        # Always release the figure: this runs once per dataset item, and an
        # exception above would otherwise leave the figure open.
        plt.close(fig)

In [8]:
def save_data(ds, split: str = "train", lang: str = "ru"):
    """Export every item of one split as audio, transcription and spectrogram.

    Files are written below ``data/{split}_audio/{lang}``,
    ``data/{split}_transcription/{lang}`` and ``data/{split}_spectrogram/{lang}``
    and are all named ``{lang}_{split}_{idx}`` (plus the extension added by
    the respective ``save_*`` helper), where ``idx`` is the item's position
    in the split.

    Args:
        ds: Mapping from split name to an iterable of items; each item must
            provide ``item["audio"]["array"]``,
            ``item["audio"]["sampling_rate"]`` and ``item["transcription"]``.
        split: Key of ``ds`` to export.
        lang: Language code used in directory and file names.
    """
    audio_path = f"data/{split}_audio/{lang}"
    transcription_path = f"data/{split}_transcription/{lang}"
    spectrogram_path = f"data/{split}_spectrogram/{lang}"

    # tqdm wraps the split itself rather than enumerate(...): enumerate has no
    # length, so the bar could only show an open-ended counter without a total.
    for idx, item in enumerate(tqdm(ds[split], desc=f"{lang}/{split}")):
        y = item["audio"]["array"]
        sr = item["audio"]["sampling_rate"]
        transcription = item["transcription"]
        name = f"{lang}_{split}_{idx}"

        # Save the audio
        save_audio(y=y, sr=sr, path=audio_path, name=name)

        # Save the transcription
        save_transcription(transcription=transcription, path=transcription_path, name=name)

        # Save the spectrogram
        save_spectrogram_image(y=y, sr=sr, path=spectrogram_path, name=name)

In [9]:
# Export the Russian train, valid and test splits to disk, in that order.
for split_name in ("train", "valid", "test"):
    save_data(ds=ds_splits_ru, split=split_name, lang="ru")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [10]:
# Export the English train, valid and test splits to disk, in that order.
for split_name in ("train", "valid", "test"):
    save_data(ds=ds_splits_en, split=split_name, lang="en")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [11]:
# Export the German train, valid and test splits to disk, in that order.
for split_name in ("train", "valid", "test"):
    save_data(ds=ds_splits_de, split=split_name, lang="de")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]