<a href="https://colab.research.google.com/github/Avvonna/DL_sound_project/blob/main/demo_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title 1\. Setup Repository and Libraries (~ 40 sec)

import os
import subprocess

# Repository coordinates used by the clone / checkout steps of this cell.
# NOTE(review): the "Open In Colab" badge at the top of the notebook links to a
# repository named "DL_sound_project" — confirm REPO_NAME is still the right one.
REPO_NAME = "DL_tts_project"
GIT_URL = f"https://github.com/Avvonna/{REPO_NAME}.git"
BRANCH = "colab-demo"

def run_command(command, message):
    """
    Run a shell command quietly, printing only progress and failure details.

    The command's output is captured instead of being streamed to the cell, so
    a successful step produces just two progress lines.

    Args:
        command: Command line handed to the shell (``shell=True``).
        message: Human-readable step name used in the progress messages.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured stderr (and stdout, when non-empty) is printed
            before the exception propagates.
    """
    print(f"{message}...")
    try:
        subprocess.run(command, check=True, shell=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"{message} - FAILED!")
        print("ERROR LOG:\n", e.stderr)
        # Some tools report the actual problem on stdout, so surface it as well.
        if e.stdout and e.stdout.strip():
            print("OUTPUT LOG:\n", e.stdout)
        # Bare raise keeps the original traceback intact.
        raise
    print(f"{message} - DONE")

# Repo
current_dir = os.path.basename(os.getcwd())
if current_dir == REPO_NAME:
    # The cell was re-run from inside the clone: just refresh it.
    print(f"Already inside {REPO_NAME}. Skipping clone.")
    run_command("git pull", "Pulling latest changes")
else:
    if not os.path.exists(REPO_NAME):
        run_command(f"git clone {GIT_URL}", f"Cloning {REPO_NAME}")
    os.chdir(REPO_NAME)
    run_command(f"git checkout {BRANCH}", f"Checking out {BRANCH}")

# Dependencies
requirements_file = "requirements_colab.txt"
# Marker file written after a successful install so that re-runs skip pip.
setup_marker = ".setup_complete"

if os.path.exists(setup_marker):
    print("Dependencies already installed.")
elif os.path.exists(requirements_file):
    run_command(f"pip install -q -r {requirements_file}", "Installing project dependencies")

    with open(setup_marker, "w") as f:
        f.write("done")
    print("Dependencies installed.")
else:
    # Report the file that was actually looked for.
    print(f"{requirements_file} not found!")

print(f"\nCurrent Directory: {os.getcwd()}")
print("Setup complete!")

Cloning DL_tts_project...
Cloning DL_tts_project - DONE
Checking out colab-demo...
Checking out colab-demo - DONE
Installing project dependencies...
Installing project dependencies - DONE
Dependencies installed.

Current Directory: /content/DL_tts_project
Setup complete!


In [None]:
# @title 2\. Download Pre-trained Weights

import sys

# Put the current directory first on the import path so that the `scripts`
# package imported below resolves against it.
sys.path.insert(0, os.getcwd())

from scripts.download_generator import download_weights

FILE_ID = "1c8udP8vkwpQ-D6K3OCKKYPvLTDCIDs28"
OUTPUT_DIR = "saved"
FILENAME = "generator.pth"

download_weights(file_id=FILE_ID, output_dir=OUTPUT_DIR, filename=FILENAME)

MODEL_PATH = os.path.join(OUTPUT_DIR, FILENAME)
# Do not claim success unless the checkpoint really is on disk; failing here is
# clearer than a synthesis error in a later cell.
if not os.path.isfile(MODEL_PATH):
    raise FileNotFoundError(f"Weights were not downloaded: {MODEL_PATH}")
print(f"Weights found at {MODEL_PATH}")


Скачивание весов в saved/generator.pth...


Downloading...
From (original): https://drive.google.com/uc?id=1c8udP8vkwpQ-D6K3OCKKYPvLTDCIDs28
From (redirected): https://drive.google.com/uc?id=1c8udP8vkwpQ-D6K3OCKKYPvLTDCIDs28&confirm=t&uuid=4b6667bf-ca9a-49fb-a183-4b5ec8790487
To: /content/DL_tts_project/saved/generator.pth
100%|██████████| 55.8M/55.8M [00:01<00:00, 45.0MB/s]

Веса успешно скачаны.
Weights found at saved/generator.pth





In [None]:
# @title 3\. Helper Functions (Visualization & Inference)

import os
import shutil
import subprocess
from pathlib import Path

import soundfile as sf
import IPython.display as ipd


# Lower-case file suffixes (with the leading dot) that count as audio inputs.
AUDIO_EXTS = (".wav", ".mp3", ".flac")


def play_audio(file_path: str) -> None:
    """
    Load an audio file and show an inline player for it in Colab/Jupyter.

    Any failure (unreadable file, unsupported format, ...) is reported with a
    printed message instead of being raised, so one bad file does not abort
    the surrounding loop.

    Args:
        file_path: Path of the audio file to play.
    """
    try:
        samples, rate = sf.read(file_path)
        # Multi-dimensional data is transposed before it is handed to the
        # player (soundfile is documented to return frames x channels).
        playable = samples.T if samples.ndim > 1 else samples
        print(f"Audio: {os.path.basename(file_path)} | SR: {rate}")
        ipd.display(ipd.Audio(playable, rate=rate))
    except Exception as err:
        print(f"Could not play {file_path}: {err}")


def _list_audio_files(audio_dir: str | Path, limit: int | None = None) -> list[Path]:
    audio_dir_p = Path(audio_dir)
    files = [p for p in sorted(audio_dir_p.iterdir()) if p.is_file() and p.suffix.lower() in AUDIO_EXTS]
    if limit is not None:
        files = files[:limit]
    return files

def _run_synthesize(
    mode: str,
    data_dir: str,
    checkpoint_path: str,
    output_dir: str,
    device: str = "auto",
    acoustic_model_name: str = "facebook/mms-tts-rus",
    save_gt: bool = True,
    save_hf_audio: bool = True,
) -> None:
    """
    Run ``synthesize.py`` as a child process with ``key=value`` overrides.

    Args:
        mode: Value for ``synthesize.mode``; "resynthesis" and "full_tts" add
            their own extra overrides.
        data_dir: Value for ``dataset.data_dir``.
        checkpoint_path: Value for ``synthesize.checkpoint_path``.
        output_dir: Value for ``synthesize.output_dir``.
        device: Value for ``synthesize.device``.
        acoustic_model_name: Passed as ``synthesize.acoustic_model.model_name``
            (full_tts only).
        save_gt: Passed as ``synthesize.save_gt`` (resynthesis only).
        save_hf_audio: Passed as ``synthesize.save_hf_audio`` (full_tts only).

    Raises:
        RuntimeError: If the script exits with a non-zero status; its stderr is
            printed first.
    """
    # Overrides shared by every mode; dict order is the order on the command line.
    settings = {
        "synthesize.mode": mode,
        "synthesize.device": device,
        "synthesize.checkpoint_path": checkpoint_path,
        "synthesize.output_dir": output_dir,
        "dataset.data_dir": data_dir,
    }

    # Mode-specific switches; booleans are passed in lower case ("true"/"false").
    if mode == "resynthesis":
        settings["synthesize.save_gt"] = str(save_gt).lower()
    elif mode == "full_tts":
        settings["synthesize.save_hf_audio"] = str(save_hf_audio).lower()
        settings["synthesize.acoustic_model.model_name"] = acoustic_model_name

    argv = ["python", "synthesize.py"]
    argv.extend(f"{key}={value}" for key, value in settings.items())

    print(f"Running synthesize.py ({mode})...")
    completed = subprocess.run(argv, capture_output=True, text=True)

    if completed.returncode != 0:
        print("Synthesis Error:")
        print(completed.stderr)
        raise RuntimeError("synthesize.py failed")

    # On success only the final stdout line is echoed.
    stdout_lines = (completed.stdout or "").strip().splitlines()
    if stdout_lines:
        print(stdout_lines[-1])


def show_synthesis_results(
    outputs_dir: str,
    data_dir: str,
    mode: str,
    limit: int | None = None,
    show_original: bool = True,
) -> None:
    out_p = Path(outputs_dir)
    if not out_p.exists():
        print(f"Outputs dir not found: {outputs_dir}")
        return

    audio_in = _list_audio_files(Path(data_dir) / "audio", limit=limit)
    if not audio_in:
        print("No input audio found in data_dir/audio")
        return

    print(f"Showing {mode} results from: {outputs_dir}\n")

    for p in audio_in:
        stem = p.stem
        print("-" * 60)
        print(f"ID: {stem}")

        if show_original:
            print("\n[ORIGINAL INPUT]")
            play_audio(str(p))

        if mode == "resynthesis":
            gt = out_p / f"{stem}_gt.wav"
            gen = out_p / f"{stem}_resynthesis.wav"

            if gt.exists():
                print("\n[GT]")
                play_audio(str(gt))

            if gen.exists():
                print("\n[RESYNTHESIS]")
                play_audio(str(gen))
            else:
                print("\n[RESYNTHESIS] not found:", gen.name)

        elif mode == "full_tts":
            hf = out_p / f"{stem}_hf.wav"
            gen = out_p / f"{stem}_full_tts.wav"

            if hf.exists():
                print("\n[HF AUDIO (text->wav, resampled)]")
                play_audio(str(hf))
            else:
                print("\n[HF AUDIO] not found:", hf.name)

            if gen.exists():
                print("\n[HiFi-GAN OUTPUT (mel->wav)]")
                play_audio(str(gen))
            else:
                print("\n[HiFi-GAN OUTPUT] not found:", gen.name)
        else:
            raise ValueError("mode must be 'resynthesis' or 'full_tts'")


def run_resynthesis_and_show(
    data_dir: str,
    checkpoint_path: str,
    outputs_dir: str | None = None,
    device: str = "auto",
    limit: int | None = None,
) -> None:
    """
    Resynthesis: real audio -> mel -> HiFi-GAN, then play GT and the result.

    Args:
        data_dir: Dataset directory handed to the synthesis script.
        checkpoint_path: Generator checkpoint to use.
        outputs_dir: Where results are written; defaults to
            ``<data_dir>/outputs`` when empty or ``None``.
        device: Device setting forwarded to the synthesis script.
        limit: Maximum number of utterances to display (``None`` = all).
    """
    if outputs_dir:
        target_dir = outputs_dir
    else:
        target_dir = str(Path(data_dir) / "outputs")

    _run_synthesize(
        "resynthesis",
        data_dir,
        checkpoint_path,
        target_dir,
        device=device,
        save_gt=True,
    )
    show_synthesis_results(target_dir, data_dir, "resynthesis", limit=limit)


def run_full_tts_and_show(
    data_dir: str,
    checkpoint_path: str,
    outputs_dir: str | None = None,
    device: str = "auto",
    acoustic_model_name: str = "facebook/mms-tts-rus",
    limit: int | None = None,
) -> None:
    """
    Full TTS: text -> HF wav -> mel -> HiFi-GAN, then play the HF wav and the
    final HiFi-GAN output.

    Args:
        data_dir: Dataset directory handed to the synthesis script.
        checkpoint_path: Generator checkpoint to use.
        outputs_dir: Where results are written; defaults to
            ``<data_dir>/outputs`` when empty or ``None``.
        device: Device setting forwarded to the synthesis script.
        acoustic_model_name: Acoustic model name forwarded to the script.
        limit: Maximum number of utterances to display (``None`` = all).
    """
    if outputs_dir:
        target_dir = outputs_dir
    else:
        target_dir = str(Path(data_dir) / "outputs")

    _run_synthesize(
        "full_tts",
        data_dir,
        checkpoint_path,
        target_dir,
        device=device,
        acoustic_model_name=acoustic_model_name,
        save_hf_audio=True,
    )
    show_synthesis_results(target_dir, data_dir, "full_tts", limit=limit)

print("Synthesis helper functions ready.")


Synthesis helper functions ready.


Код ниже позволяет выполнить два сценария: **ресинтез** (аудио→mel→HiFi-GAN) и **full_tts** (текст→HF→mel→HiFi-GAN).

Для работы используется структура `CustomDirDataset`: файлы временно раскладываются в `temp_data/audio` и, при необходимости, `temp_data/transcriptions`, а результаты сохраняются в `temp_data/outputs`.

* **MODE = "resynthesis"**
  Загрузите только аудиофайлы (`.wav`, `.mp3`, `.flac`). Скрипт пересоберёт mel из вашего аудио и сгенерирует новое аудио через HiFi-GAN.

* **MODE = "full_tts"**
  Загрузите аудиофайлы **и** тексты (`.txt`) с теми же именами (например, `a.wav` и `a.txt`). Тогда для каждого примера будет использован свой текст.
  Если `.txt` не загружены, для всех загруженных аудиофайлов будет автоматически создан текст из переменной `DEFAULT_TEXT` (один и тот же для всех — как в **4B**).


In [None]:
# @title 4A. Ресинтез или генерация аудио по тексту (full_tts)

from google.colab import files, output

# Select the mode
MODE = "full_tts"  # @param ["resynthesis", "full_tts"]

# Fallback text (full_tts only): used for every audio file without its own .txt
DEFAULT_TEXT = "Пример текста для синтеза."  # @param {type:"string"}

print("Upload audio files (flac/wav/mp3) + optionally txt for full_tts:")
uploaded = files.upload()
output.clear()

if not uploaded:
    print("No files uploaded.")
else:
    print(f"Uploaded {len(uploaded)} files")

    # Path setup
    base_dir = os.path.abspath(".")
    input_root = os.path.join(base_dir, "temp_data")
    audio_dir = os.path.join(input_root, "audio")
    trans_dir = os.path.join(input_root, "transcriptions")

    # Folder in which the results are looked up
    save_folder_name = "outputs"
    full_save_path = os.path.join(input_root, save_folder_name)

    # Split the uploads by type
    uploaded_filenames = list(uploaded.keys())
    audio_files = [name for name in uploaded_filenames if name.lower().endswith(AUDIO_EXTS)]
    txt_files = [name for name in uploaded_filenames if name.lower().endswith(".txt")]

    try:
        if not audio_files:
            raise ValueError("No audio files (.wav/.mp3/.flac) among the uploaded files.")

        # Start from an empty dataset tree. The outputs folder lives inside
        # input_root, so this also removes results of a previous run.
        if os.path.exists(input_root):
            shutil.rmtree(input_root)
        os.makedirs(audio_dir, exist_ok=True)

        # Move the audio into the dataset layout
        for filename in audio_files:
            src_path = os.path.join(base_dir, filename)
            if os.path.exists(src_path):
                shutil.move(src_path, os.path.join(audio_dir, filename))

        # full_tts needs a transcription next to every audio file
        if MODE == "full_tts":
            os.makedirs(trans_dir, exist_ok=True)

            # Uploaded texts take precedence...
            for filename in txt_files:
                src_path = os.path.join(base_dir, filename)
                if os.path.exists(src_path):
                    shutil.move(src_path, os.path.join(trans_dir, filename))

            # ...and every audio file still lacking a text gets the default one,
            # so a partially uploaded set of .txt files no longer leaves gaps.
            fallback_text = DEFAULT_TEXT.strip()
            for filename in audio_files:
                txt_path = os.path.join(trans_dir, f"{Path(filename).stem}.txt")
                if os.path.exists(txt_path):
                    continue
                if not fallback_text:
                    raise ValueError(
                        f"No transcription for {filename} and DEFAULT_TEXT is empty."
                    )
                with open(txt_path, "w", encoding="utf-8") as f:
                    f.write(fallback_text)

        # Synthesis
        print(f"Running {MODE}...")

        command = [
            "python", "synthesize.py",
            f"synthesize.mode={MODE}",
            "synthesize.device=auto",
            "synthesize.output_dir=auto",
            "synthesize.checkpoint_path=saved/generator.pth",
            f"dataset.data_dir={input_root}",
        ]

        # full_tts: also keep the HF audio
        if MODE == "full_tts":
            command += [
                "synthesize.save_hf_audio=true",
                "synthesize.acoustic_model.model_name=facebook/mms-tts-rus",
            ]

        result = subprocess.run(command, capture_output=True, text=True)

        if result.returncode != 0:
            print("Synthesis Error:")
            print(result.stderr)
        else:
            print("Synthesis finished.\n")
            # Show the originals together with the generated audio
            show_synthesis_results(full_save_path, input_root, mode=MODE, limit=None)

    except Exception as e:
        print(f"Critical Error during execution: {e}")

    finally:
        # Remove whatever is left of the uploads in the project root, even when
        # preparation or synthesis failed half-way.
        for filename in uploaded_filenames:
            leftover = os.path.join(base_dir, filename)
            if os.path.exists(leftover):
                os.remove(leftover)


Uploaded 1 files
Running full_tts...
Synthesis finished.

Showing full_tts results from: /content/DL_tts_project/temp_data/outputs

------------------------------------------------------------
ID: common_voice_en_102448

[ORIGINAL INPUT]
Audio: common_voice_en_102448.mp3 | SR: 48000



[HF AUDIO (text->wav, resampled)]
Audio: common_voice_en_102448_hf.wav | SR: 22050



[HiFi-GAN OUTPUT (mel->wav)]
Audio: common_voice_en_102448_full_tts.wav | SR: 22050


In [None]:
# @title 4B. FULL_TTS

# @markdown Введите текст для синтеза:
TEXT = "Пример текста для синтеза."  # @param {type:"string"}

import os
import shutil
import subprocess
from pathlib import Path

import numpy as np
import soundfile as sf

# Validate the input first, before anything on disk is touched.
text = (TEXT or "").strip()
if not text:
    raise RuntimeError("FULL_TTS requires non-empty TEXT.")

base_dir = os.path.abspath(".")
input_root = os.path.join(base_dir, "temp_data_full_tts")
audio_dir = os.path.join(input_root, "audio")
trans_dir = os.path.join(input_root, "transcriptions")

save_folder_name = "outputs"
full_save_path = os.path.join(input_root, save_folder_name)

# Start from an empty dataset tree. The outputs folder lives inside input_root,
# so this also removes results of a previous run.
if os.path.exists(input_root):
    shutil.rmtree(input_root)
os.makedirs(audio_dir, exist_ok=True)
os.makedirs(trans_dir, exist_ok=True)

# Dataset stub: 0.1 s of silence stored under the same stem as the text.
# NOTE(review): presumably the dataset needs an audio file for every
# utterance even in full_tts mode — confirm against the dataset class.
sr = 22050
silent = np.zeros(int(sr * 0.1), dtype=np.float32)

with open(os.path.join(trans_dir, "tts_audio.txt"), "w", encoding="utf-8") as f:
    f.write(text)
sf.write(os.path.join(audio_dir, "tts_audio.wav"), silent, sr)

print("Running full_tts...")

command = [
    "python", "synthesize.py",
    "synthesize.mode=full_tts",
    "synthesize.device=auto",
    "synthesize.output_dir=auto",
    "synthesize.checkpoint_path=saved/generator.pth",
    "synthesize.save_hf_audio=true",
    "synthesize.acoustic_model.model_name=facebook/mms-tts-rus",
    f"dataset.data_dir={input_root}",
]

result = subprocess.run(command, capture_output=True, text=True)

if result.returncode != 0:
    print("Synthesis Error:")
    print(result.stderr)
else:
    print("Synthesis finished.\n")
    # The "original" is only the silent stub, so it is not played back.
    show_synthesis_results(full_save_path, input_root, mode="full_tts", limit=None, show_original=False)


Running full_tts...
Synthesis finished.

Showing full_tts results from: /content/DL_tts_project/temp_data_full_tts/outputs

------------------------------------------------------------
ID: tts_audio

[HF AUDIO (text->wav, resampled)]
Audio: tts_audio_hf.wav | SR: 22050



[HiFi-GAN OUTPUT (mel->wav)]
Audio: tts_audio_full_tts.wav | SR: 22050


Код ниже позволяет скачивать архив или папку с GDrive и прогонять инференс на ней.

Структура папки (или подпапки) должна иметь следующий формат:

```text
NameOfTheDirectoryWithUtterances
├── audio
│   ├── UtteranceID1.wav    # may be flac or mp3
│   ├── UtteranceID2.wav
│   .
│   .
│   .
│   └── UtteranceIDn.wav
└── transcriptions          # ground truth, may not exist
    ├── UtteranceID1.txt
    ├── UtteranceID2.txt
    .
    .
    .
    └── UtteranceIDn.txt
```

Аудиофайлы для инференса должны быть в форматах `flac`, `mp3` или `wav`.

Вот ссылка на архив с файлами `1.wav`, `2.wav` и `3.wav` (+ тексты) и правильной структурой датасета:
- https://drive.google.com/file/d/19lit5O8HOMg7fwtcox-NuF6gbTDUhaFu/view?usp=sharing

In [None]:
# @title 4C. Run on GDrive Dataset

import zipfile
import gdown

# @markdown Выберите режим
MODE = "full_tts"  # @param ["resynthesis", "full_tts"]

# @markdown Вставьте ссылку на Google Drive (на .zip файл или на папку):
gdrive_url = "https://drive.google.com/file/d/19lit5O8HOMg7fwtcox-NuF6gbTDUhaFu/view?usp=sharing"  # @param {type:"string"}

# Path setup
base_dir = os.path.abspath(".")
dataset_root = os.path.join(base_dir, "temp_custom_dataset")

# Name of the results folder; the final location is refined once the dataset
# directory is known (see below).
save_folder_name = "outputs"
full_save_path = os.path.join(dataset_root, save_folder_name)

# Remove leftovers of previous runs
if os.path.exists(dataset_root):
    shutil.rmtree(dataset_root)

os.makedirs(dataset_root, exist_ok=True)

# Pasted links often carry stray whitespace.
gdrive_url = (gdrive_url or "").strip()

if not gdrive_url:
    print("Please paste a Google Drive link above. (zip or folder)")
else:
    try:
        # Download (Drive folder, Drive zip file, or a direct zip link)
        is_gdrive = "drive.google.com" in gdrive_url
        if is_gdrive and "/folders/" in gdrive_url:
            print("Detected Google Drive FOLDER.")
            gdown.download_folder(url=gdrive_url, output=dataset_root, quiet=True)
        else:
            zip_path = os.path.join(dataset_root, "dataset.zip")
            if is_gdrive:
                print("Detected Google Drive FILE (assuming .zip).")
                gdown.download(url=gdrive_url, output=zip_path, quiet=True, fuzzy=True)
            else:
                # Direct link
                print("Direct link detected.")
                subprocess.run(["wget", gdrive_url, "-q", "-O", zip_path], check=True)

            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(dataset_root)
            os.remove(zip_path)

        # Locate the dataset: the first directory that contains an "audio" folder.
        data_dir = None
        for root, dirs, _filenames in os.walk(dataset_root):
            # Prune archive metadata / hidden folders in place so the walk
            # neither descends into them nor mistakes them for the dataset;
            # sorting keeps the search order deterministic.
            dirs[:] = sorted(
                d for d in dirs if d != "__MACOSX" and not d.startswith(".")
            )
            if "audio" in dirs:
                data_dir = root
                break

        if not data_dir:
            raise FileNotFoundError("Could not find 'audio' folder.")

        print(f"Dataset found at: {data_dir}")

        # The dataset may sit in a sub-folder of the download, so look for the
        # results next to the data that was actually used.
        # NOTE(review): assumes `synthesize.output_dir=auto` resolves to
        # <data_dir>/outputs (matches the helpers' default) — confirm in synthesize.py.
        full_save_path = os.path.join(data_dir, save_folder_name)

        # full_tts requires transcriptions
        if MODE == "full_tts":
            trans_dir = os.path.join(data_dir, "transcriptions")
            if not os.path.isdir(trans_dir):
                raise FileNotFoundError("FULL_TTS requires 'transcriptions' folder with *.txt files.")

        # Synthesis
        print(f"\nRunning {MODE}...")
        cmd_syn = [
            "python", "synthesize.py",
            f"synthesize.mode={MODE}",
            "synthesize.device=auto",
            "synthesize.output_dir=auto",
            "synthesize.checkpoint_path=saved/generator.pth",
            f"dataset.data_dir={data_dir}",
        ]

        if MODE == "full_tts":
            cmd_syn += [
                "synthesize.save_hf_audio=true",
                "synthesize.acoustic_model.model_name=facebook/mms-tts-rus",
            ]

        result = subprocess.run(cmd_syn, capture_output=True, text=True)

        if result.returncode != 0:
            print("Synthesis Error:")
            print(result.stderr)
        else:
            print("Synthesis finished.")
            if os.path.exists(full_save_path):
                show_synthesis_results(full_save_path, data_dir, mode=MODE, limit=5)
            else:
                print("Outputs directory was not created.")

    except Exception as e:
        print(f"\nFAILED: {e}")


Detected Google Drive FILE (assuming .zip).
Dataset found at: /content/DL_tts_project/temp_custom_dataset

Running full_tts...
Synthesis finished.
Showing full_tts results from: /content/DL_tts_project/temp_custom_dataset/outputs

------------------------------------------------------------
ID: 1

[ORIGINAL INPUT]
Audio: 1.wav | SR: 24000



[HF AUDIO (text->wav, resampled)]
Audio: 1_hf.wav | SR: 22050



[HiFi-GAN OUTPUT (mel->wav)]
Audio: 1_full_tts.wav | SR: 22050


------------------------------------------------------------
ID: 2

[ORIGINAL INPUT]
Audio: 2.wav | SR: 24000



[HF AUDIO (text->wav, resampled)]
Audio: 2_hf.wav | SR: 22050



[HiFi-GAN OUTPUT (mel->wav)]
Audio: 2_full_tts.wav | SR: 22050


------------------------------------------------------------
ID: 3

[ORIGINAL INPUT]
Audio: 3.wav | SR: 24000



[HF AUDIO (text->wav, resampled)]
Audio: 3_hf.wav | SR: 22050



[HiFi-GAN OUTPUT (mel->wav)]
Audio: 3_full_tts.wav | SR: 22050
