This notebook is designed to be run in Google Colab, which provides a pre-configured Python environment with many common data science and machine learning libraries already installed.
Package Installation Syntax:
In Colab, you can install additional Python packages by running a shell command prefixed with `!` at the start of a cell, for example `!pip install datasets`.
This is different from Jupyter notebooks run locally, where you might use %pip install or run pip in a terminal.

Some imports may not have explicit installation commands in the notebook because they are already available in the Colab environment by default.

In [None]:
# Install the third-party packages used by the cells below (run through the notebook's `!` shell escape).
!pip install datasets torchaudio torch

In [None]:
import csv
import os

import torch
import torchaudio
from datasets import load_dataset

# Export a fixed-size, reproducibly shuffled sample of the Common Voice test
# split for each language: one WAV file per sample plus a CSV that maps each
# file name to its transcript.
#
# Output layout (per language code <lang>):
#   /content/tests/<lang>_gold_wavs/sample_<i>.wav
#   /content/tests/<lang>_gold.csv        (columns: file, text)

languages = ["en", "bg", "uk", "ru", "ar", "it", "pl", "pt"]
maxTestRows = 50_000  # Upper bound on test-split rows loaded per language
rowsExported = 1_000  # Upper bound on samples written to disk per language

for lang in languages:
    # Load at most the first `maxTestRows` rows of the test split
    test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", lang, split=f"test[:{maxTestRows}]")

    # Fixed seed so the same samples are selected on every run
    test_dataset = test_dataset.shuffle(seed=42)

    # The loaded split may hold fewer rows than `rowsExported`
    subset_n = min(rowsExported, len(test_dataset))
    subset_dataset = test_dataset.select(range(subset_n))

    wav_dir = f"/content/tests/{lang}_gold_wavs"
    os.makedirs(wav_dir, exist_ok=True)

    # Open CSV file for writing; newline="" lets the csv module control line endings
    with open(f"/content/tests/{lang}_gold.csv", mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["file", "text"])
        writer.writeheader()

        print(f"Starting file save loop for {lang}")
        for i, sample in enumerate(subset_dataset):
            # Report progress against the real number of exported samples,
            # which can be smaller than `rowsExported`
            print(f"Processing file {i+1}/{subset_n} for {lang}")
            filename = f"sample_{i}.wav"
            filepath = f"{wav_dir}/{filename}"

            # Save audio file. unsqueeze(0) adds a leading dimension so the
            # tensor is 2-D for torchaudio.save (assumes the decoded array is
            # 1-D mono audio -- TODO confirm for this dataset).
            audio = sample["audio"]
            waveform = torch.tensor(audio["array"]).unsqueeze(0)
            torchaudio.save(filepath, waveform, audio["sampling_rate"])

            # Write to CSV; only the file name is stored, not the full path
            writer.writerow({"file": filename, "text": sample["sentence"]})

In [None]:
import shutil

# Bundle everything under /content/tests into /content/all_tests.zip
# (make_archive appends the ".zip" extension to the base name itself).
shutil.make_archive(
    base_name="/content/all_tests",
    format="zip",
    root_dir="/content/tests",
)

# Hand the finished archive to the browser through Colab's download helper
from google.colab import files
files.download("/content/all_tests.zip")