In [2]:
#@title ### **2. Setup Environment & TTS Model**
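#@markdown This cell installs the required Python packages, including the powerful `TTS` library. **`TTS` also depends on the system package `espeak-ng`, which this cell does not install — install it separately for your platform.** The specified TTS model is downloaded and loaded later, in the "Build Sound samples" cell.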
#@markdown <br>
#@markdown *This may take a long time, especially on the first run.*
#@markdown *After the first run you will likely need to restart the runtime so that the newly installed packages are picked up.*
# 1. Install Python packages
# !pip uninstall -y keras tensorflow
# NOTE(review): the recorded output of this cell shows pip switching numpy back and
# forth between 2.0.2 and 1.22.0. tensorflow-intel 2.18.0 reports that it requires
# numpy>=1.26.0,<2.1.0 while tts 0.22.0 reports that it requires numpy==1.22.0 on
# Python <= 3.10, so these two pins cannot both be satisfied in one Python 3.10
# environment. TODO: confirm and choose a mutually compatible pair of versions.
%pip install tensorflow==2.18.0

# NOTE(review): espeak-ng is a system package, not a pip package; none of the
# commands in this cell install it. Presumably it has to be installed separately
# before the TTS model is loaded — verify for the target platform.
%pip install TTS==0.22.0 tensorflow-model-optimization pydub librosa soundfile tqdm


Collecting numpy<2.1.0,>=1.26.0
  Using cached numpy-2.0.2-cp310-cp310-win_amd64.whl (15.9 MB)
Note: you may need to restart the kernel to use updated packages.
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.0
    Uninstalling numpy-1.22.0:
      Successfully uninstalled numpy-1.22.0
Successfully installed numpy-2.0.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tts 0.22.0 requires numpy==1.22.0; python_version <= "3.10", but you have numpy 2.0.2 which is incompatible.
tensorflow-model-optimization 0.7.3 requires numpy~=1.14, but you have numpy 2.0.2 which is incompatible.
scipy 1.11.4 requires numpy<1.28.0,>=1.21.6, but you have numpy 2.0.2 which is incompatible.
gruut 2.2.3 requires numpy<2.0.0,>=1.19.0, but you have numpy 2.0.2 which is incompatible.

[notice] A new release of pip available: 22.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting numpy==1.22.0
  Using cached numpy-1.22.0-cp310-cp310-win_amd64.whl (14.7 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
Successfully installed numpy-1.22.0
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.22.0 which is incompatible.
statsmodels 0.14.4 requires numpy<3,>=1.22.3, but you have numpy 1.22.0 which is incompatible.

[notice] A new release of pip available: 22.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
#@title <h1>Advanced Speech Dataset Generator (Local TTS)</h1>
#@markdown This notebook uses a powerful, local, open-source Text-to-Speech (TTS) model to generate the dataset. This completely avoids API rate limits and produces much more human-sounding speech.
#@markdown <br>
#@markdown **Key Improvements:**
#@markdown - **No API Errors:** All generation is done locally on the Colab CPU/GPU.
#@markdown - **High-Quality Voice:** Uses a state-of-the-art model (VITS) for natural, human-like speech.
#@markdown - **Fast Generation:** Leverages the GPU if available for a significant speed-up.
#@markdown - **Robust Pipeline:** Retains all previous features like background noise mixing and data augmentation.

#@markdown ---
#@markdown ### **1. Configuration**
#@markdown Set your desired parameters. The TTS model will be downloaded automatically.

# Comma-separated list of words. The generator splits this string on "," and
# creates one sub-directory (one class) per word.
words_to_generate = "talking, fish" #@param {type:"string"}
# Number of clips the generator attempts to synthesize for each word.
samples_per_word = 1000 #@param {type:"integer"}
# Directory the dataset is written to; its name is also used as the base name of
# the zip archive. NOTE: the generator deletes an existing directory with this
# name before it starts.
output_directory_name = "speech_commands_dataset_local" #@param {type:"string"}
# You can find more models here: https://huggingface.co/coqui/vits-v2-ljspeech-en
# Model identifier passed to `TTS(...)` when the synthesizer is loaded.
tts_model_name = "tts_models/en/ljspeech/vits" #@param {type:"string"}


# --- End of Configuration ---

In [4]:
# @title Build Sound samples
import os
import torch
import numpy as np
import librosa
import soundfile as sf
from TTS.api import TTS
from pydub import AudioSegment
import tensorflow as tf
import random
import shutil
from pathlib import Path
import zipfile
import time
from tqdm.notebook import tqdm
import requests



# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
# torch.manual_seed seeds the CPU generator as well as all CUDA devices, so the
# TTS sampling is seeded on CPU-only machines too (a CUDA-only seed would leave
# CPU runs unseeded).
torch.manual_seed(42)

# Get device and load the TTS model
# This will now work because espeak-ng is visible after the restart
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
print(f"Loading TTS model: {tts_model_name}...")
# `tts_model_name` comes from the configuration cell, which must have been run first.
synthesizer = TTS(tts_model_name).to(device)
print("✅ TTS model loaded successfully.")

# --- The Generator Class ---
class SpeechDatasetGenerator:
    """Generate a keyword-spotting dataset by synthesizing words with a TTS model.

    For every requested word the generator synthesizes `samples_per_word` clips,
    applies random gain / pitch augmentation, forces each clip to a fixed length,
    mixes in background noise and writes the result to `<output_dir>/<word>/`.
    It also writes `validation_list.txt` / `testing_list.txt` and zips the result.

    Args:
        words: Comma-separated string of words to synthesize. Empty entries
            (e.g. from a trailing comma) are ignored.
        samples_per_word: Number of clips to attempt per word.
        synthesizer: Object exposing `tts_to_file(text=..., file_path=...)`.
        output_dir: Directory to (re)create for the dataset. An existing
            directory at this path is deleted.
    """

    def __init__(self, words, samples_per_word, synthesizer, output_dir="speech_commands_dataset"):
        # Skip blank entries: a word named "" would resolve to output_dir itself.
        self.words = [word.strip() for word in words.split(',') if word.strip()]
        self.samples_per_word = samples_per_word
        self.output_dir = Path(output_dir)
        self.synthesizer = synthesizer
        # Standardize on 16kHz for speech commands, which is the most common for these tasks.
        self.target_sample_rate = 16000
        self.duration_ms = 1000
        # Filled by download_background_noise(); empty means "no noise mixing".
        self.background_noises = []

        self.setup_directories()

    def setup_directories(self):
        """Delete any previous output directory and create the per-word and noise folders."""
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
        # parents=True so a nested output path such as "runs/ds1" works.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        for word in self.words:
            (self.output_dir / word).mkdir(exist_ok=True)
        self.noise_dir = self.output_dir / "_background_noise_"
        self.noise_dir.mkdir(exist_ok=True)

    def generate_tts_samples(self, word):
        """Synthesize, augment and save up to `samples_per_word` clips for `word`.

        Generation is best-effort: a failing sample is reported and skipped, and
        the number of skipped samples is summarized at the end.
        """
        print(f"Generating {self.samples_per_word} samples for word: '{word}'")
        word_path = self.output_dir / word
        # Scratch file the synthesizer writes to; removed after every attempt.
        temp_file = Path(f"temp_{word}.wav")
        failures = 0
        with tqdm(total=self.samples_per_word, desc=f"Generating '{word}'") as pbar:
            for i in range(self.samples_per_word):
                try:
                    self.synthesizer.tts_to_file(text=word, file_path=str(temp_file))

                    audio = AudioSegment.from_wav(temp_file)
                    # Resample to the target rate and ensure mono
                    audio = audio.set_frame_rate(self.target_sample_rate).set_channels(1)
                    audio = self.apply_audio_effects(audio)
                    audio = self._fit_to_duration(audio)
                    audio = self.mix_with_background_noise(audio)
                    audio = audio.normalize()
                    output_file = word_path / f"{word}_{i:04d}.wav"
                    # export() returns the open output file; close it so handles do not pile up.
                    audio.export(output_file, format="wav").close()
                    pbar.update(1)
                except Exception as e:
                    failures += 1
                    print(f"\nError generating sample {i} for '{word}': {e}. Skipping.")
                finally:
                    if temp_file.exists():
                        temp_file.unlink()
        if failures:
            print(f"⚠ {failures} of {self.samples_per_word} samples for '{word}' failed and were skipped.")

    def _fit_to_duration(self, audio):
        """Center-crop or zero-pad `audio` to exactly `duration_ms` worth of frames.

        Works on frame counts rather than whole milliseconds so every clip ends
        up with the same number of samples.
        """
        frame_width = audio.frame_width
        target_bytes = int(audio.frame_rate * self.duration_ms / 1000) * frame_width
        raw = audio.raw_data
        if len(raw) > target_bytes:
            # Keep the start offset aligned to a whole frame.
            start = ((len(raw) - target_bytes) // (2 * frame_width)) * frame_width
            raw = raw[start:start + target_bytes]
        elif len(raw) < target_bytes:
            # Zero bytes are appended as silence at the end of the clip.
            raw = raw + b"\x00" * (target_bytes - len(raw))
        return AudioSegment(
            raw,
            frame_rate=audio.frame_rate,
            sample_width=audio.sample_width,
            channels=audio.channels,
        )

    def apply_audio_effects(self, audio):
        """Apply a random gain and, with probability 0.6, a random pitch shift.

        Returns a new 16-bit, single-channel AudioSegment at the input's frame rate.
        The caller is expected to pass mono audio.
        """
        audio += random.uniform(-6, 2)
        # The int16 conversion below requires 16-bit samples.
        audio = audio.set_sample_width(2)
        samples = np.array(audio.get_array_of_samples()).astype(np.float32)
        if random.random() < 0.6:
            n_steps = random.uniform(-1.5, 1.5)
            samples = librosa.effects.pitch_shift(y=samples, sr=audio.frame_rate, n_steps=n_steps)
        # Clip before casting: out-of-range floats would otherwise wrap around in int16.
        samples = np.clip(samples, -32768, 32767)
        return AudioSegment(samples.astype(np.int16).tobytes(), frame_rate=audio.frame_rate, sample_width=2, channels=1)

    def download_background_noise(self):
        """Download the Speech Commands archive and keep only its background-noise files.

        Only members of the archive's `_background_noise_` folder are extracted
        (into `self.noise_dir`), so recordings from the archive can never end up
        in the generated word folders. The downloaded archive is always removed.

        Raises:
            requests.RequestException: If the download fails.
        """
        import tarfile  # local import: only needed by this method

        print("Downloading official background noise files...")
        url = "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz"
        archive_path = self.output_dir / "speech_commands.tar.gz"
        noise_prefix = f"{self.noise_dir.name}/"
        try:
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                total_bytes = int(r.headers.get("content-length", 0)) or None
                with open(archive_path, 'wb') as f, tqdm(
                    total=total_bytes, unit="B", unit_scale=True, desc="Downloading archive"
                ) as pbar:
                    for chunk in r.iter_content(chunk_size=1 << 20):
                        f.write(chunk)
                        pbar.update(len(chunk))

            with tarfile.open(archive_path) as tar:
                for member in tar:
                    member_name = member.name
                    if member_name.startswith("./"):
                        member_name = member_name[2:]
                    if not member.isfile() or not member_name.startswith(noise_prefix):
                        continue
                    # Write under the basename only, so archive paths cannot escape noise_dir.
                    target = self.noise_dir / Path(member_name).name
                    with tar.extractfile(member) as src, open(target, 'wb') as dst:
                        shutil.copyfileobj(src, dst)
        finally:
            if archive_path.exists():
                archive_path.unlink()

        # Sorted so that seeded random.choice picks the same file on every filesystem.
        self.background_noises = [AudioSegment.from_wav(f) for f in sorted(self.noise_dir.glob("*.wav"))]
        print(f"✓ Loaded {len(self.background_noises)} background noise files.")

    def mix_with_background_noise(self, audio):
        """Overlay a random background-noise snippet at a random SNR of 5-20 dB.

        Returns `audio` unchanged when no noise is loaded, and in roughly 10% of
        calls so that some clean samples remain.
        """
        noises = getattr(self, 'background_noises', None)
        if not noises or random.random() < 0.1:
            return audio
        noise = random.choice(noises)
        if len(noise) > self.duration_ms:
            start_pos = random.randint(0, len(noise) - self.duration_ms)
            noise_snippet = noise[start_pos:start_pos + self.duration_ms]
        else:
            noise_snippet = noise
        snr_db = random.uniform(5, 20)
        # Scale the NOISE so it sits snr_db below the speech level. (overlay's
        # gain_during_overlay would instead change the level of the speech.)
        speech_level = audio.dBFS
        noise_level = noise_snippet.dBFS
        if np.isfinite(speech_level) and np.isfinite(noise_level):
            noise_gain = (speech_level - snr_db) - noise_level
        else:
            # One of the signals is pure silence (dBFS is -inf); fall back to a fixed cut.
            noise_gain = -snr_db
        return audio.overlay(noise_snippet.apply_gain(noise_gain))

    def create_split_lists(self):
        """Write `validation_list.txt` and `testing_list.txt` (10% of the files each).

        Files that appear in neither list form the training set.
        """
        # Sorted before shuffling so the seeded shuffle does not depend on glob order.
        all_files = [f"{w}/{f.name}" for w in self.words for f in sorted((self.output_dir / w).glob('*.wav'))]
        random.shuffle(all_files)
        val_idx = int(0.8 * len(all_files))
        test_idx = int(0.9 * len(all_files))
        (self.output_dir / "validation_list.txt").write_text('\n'.join(all_files[val_idx:test_idx]))
        (self.output_dir / "testing_list.txt").write_text('\n'.join(all_files[test_idx:]))

    def generate_dataset(self):
        """Run the full pipeline: noise download, synthesis, split lists and zip archive."""
        print("="*50)
        print("Starting Local TTS Dataset Generation Pipeline")
        print("="*50)
        self.download_background_noise()
        for word in self.words:
            self.generate_tts_samples(word)
        self.create_split_lists()
        print("\nCreating zip file...")
        # Creates "<output_dir name>.zip" in the current working directory.
        shutil.make_archive(self.output_dir.name, 'zip', self.output_dir)
        print("🎉 DATASET GENERATION COMPLETE! 🎉")

# --- Verification Functions (Corrected) ---
def load_dataset_for_tensorflow(dataset_path: Path):
    """Build a `tf.data.Dataset` of `(waveform, label)` pairs from a dataset folder.

    Every `<dataset_path>/<label>/*.wav` file is included, except files whose
    path contains `_background_noise_`. The label is the name of the directory
    that holds the file.

    Returns:
        The mapped dataset, or None (after printing an error) when the path
        does not exist or no command audio files are found.
    """
    if not dataset_path.exists():
        print(f"Error: Dataset path '{dataset_path}' does not exist.")
        return None

    def _to_waveform_and_label(wav_path):
        # The second-to-last path component is the word directory, i.e. the label.
        label = tf.strings.split(wav_path, os.path.sep)[-2]
        decoded, _ = tf.audio.decode_wav(tf.io.read_file(wav_path), desired_channels=1)
        # Drop the trailing channel axis.
        return tf.squeeze(decoded, axis=-1), label

    wav_paths = tf.io.gfile.glob(str(dataset_path / '*/*.wav'))
    command_paths = list(filter(lambda p: '_background_noise_' not in p, wav_paths))

    if len(command_paths) == 0:
        print("Error: No command audio files found. Check the directory structure.")
        return None

    return tf.data.Dataset.from_tensor_slices(command_paths).map(
        _to_waveform_and_label, num_parallel_calls=tf.data.AUTOTUNE
    )

def verify_dataset(dataset_path_str: str):
    """Print a per-word file count and check the dataset can be read by TensorFlow.

    Reports the number of `.wav` files in every sub-directory whose name does
    not start with `_`, then loads the dataset and prints label, shape and
    duration for up to three samples. Any problem is printed, not raised.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Verifying generated dataset...")
    print(banner)

    dataset_path = Path(dataset_path_str)
    if not dataset_path.exists():
        print(f"✗ Verification failed: Directory '{dataset_path}' not found.")
        return

    for entry in dataset_path.iterdir():
        # Only word folders count; skip files and "_"-prefixed helper folders.
        if not entry.is_dir() or entry.name.startswith('_'):
            continue
        file_count = sum(1 for _ in entry.glob('*.wav'))
        print(f"✓ Word '{entry.name}': Found {file_count} audio files.")

    try:
        ds = load_dataset_for_tensorflow(dataset_path)
        if ds is None:
            raise ValueError("Dataset could not be loaded.")
        print("\n--- TensorFlow Compatibility Check ---")
        for waveform, label in ds.take(3):
            label_text = label.numpy().decode('utf-8')
            n_samples = waveform.shape[0]
            # Duration is computed for a 16 kHz sample rate.
            print(f"Sample - Label: {label_text:<10} "
                  f"Shape: {waveform.shape}, "
                  f"Duration: {n_samples / 16000:.2f}s")
        print("\n✅ Dataset verification successful.")
    except Exception as e:
        print(f"\n✗ Error during dataset verification: {e}")

Using device: cpu
Loading TTS model: tts_models/en/ljspeech/vits...
 > tts_models/en/ljspeech/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
✅ TTS model loaded successfully.


In [None]:
# Step 1: build the generator from the configuration cell's values and run the
# whole pipeline (noise download, synthesis, split lists, zip archive).
generator = SpeechDatasetGenerator(
    words_to_generate,
    samples_per_word,
    synthesizer,
    output_dir=output_directory_name,
)
generator.generate_dataset()

# Step 2: check the files that were just written.
verify_dataset(output_directory_name)

Starting Local TTS Dataset Generation Pipeline
Downloading official background noise files...


Downloading archive: 0it [00:00, ?it/s]

✓ Loaded 6 background noise files.
Generating 1000 samples for word: 'talking'


Generating 'talking':   0%|          | 0/1000 [00:00<?, ?it/s]

 > Text splitted to sentences.
['talking']
 > Processing time: 0.5490784645080566
 > Real-time factor: 0.513712667277777

Error generating sample 0 for 'talking': numpy._core.multiarray failed to import. Skipping.
 > Text splitted to sentences.
['talking']
 > Processing time: 0.2931070327758789
 > Real-time factor: 0.26557404966749387
 > Text splitted to sentences.
['talking']
 > Processing time: 0.21846747398376465
 > Real-time factor: 0.2043961219170914

Error generating sample 2 for 'talking': numpy._core.multiarray failed to import. Skipping.
 > Text splitted to sentences.
['talking']
 > Processing time: 0.20738554000854492
 > Real-time factor: 0.19402796831247518

Error generating sample 3 for 'talking': numpy._core.multiarray failed to import. Skipping.
 > Text splitted to sentences.
['talking']
 > Processing time: 0.23241519927978516
 > Real-time factor: 0.20839114932170066

Error generating sample 4 for 'talking': numpy._core.multiarray failed to import. Skipping.
 > Text split