In [1]:
import tarfile
import os

def extract_dataset(tar_path, extract_to):
    """Extract one language folder from a ``.tar.gz`` archive, flattened into ``extract_to``.

    The language code is taken from the archive file name: the text after the
    last ``-`` with ``.tar.gz`` removed. The first member whose name contains
    ``/<lang_code>/`` determines the root prefix (everything up to and
    including that path component). Every member below that prefix is
    extracted with the prefix stripped, so the contents of the language folder
    end up directly inside ``extract_to``.

    Args:
        tar_path: Path to the gzip-compressed tar archive.
        extract_to: Destination directory; created if it does not exist.

    Returns:
        None. Prints a message on success, or when no ``/<lang_code>/`` folder
        can be found in the archive (in which case nothing is extracted).
    """
    import posixpath  # tar member names always use '/' separators, on every OS

    lang_code = os.path.basename(tar_path).split('-')[-1].replace('.tar.gz', '')
    marker = f"/{lang_code}/"

    with tarfile.open(tar_path, 'r:gz') as tar:
        members = tar.getmembers()

        # Locate the language folder by the position of "/<lang>/" itself.
        # Splitting on "<lang>/" would also match a parent folder whose name
        # merely ends with the language code (e.g. "garden/en/...").
        root_prefix = None
        for m in members:
            marker_pos = m.name.find(marker)
            if marker_pos != -1:
                root_prefix = m.name[:marker_pos + len(marker)]
                break

        if root_prefix is None:
            print(f"Couldn't determine folder structure for: {tar_path}")
            return

        os.makedirs(extract_to, exist_ok=True)
        # Trailing separator so "<dest>-other" is not mistaken for a child of "<dest>".
        dest_root = os.path.join(os.path.realpath(extract_to), "")
        # Use the stdlib's safe extraction filter where this Python provides it.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

        for member in members:
            if not member.name.startswith(root_prefix):
                continue

            relative_name = posixpath.normpath(member.name[len(root_prefix):])
            if relative_name == '.':
                continue

            # Refuse members that would land outside the destination ("../", absolute paths).
            target = os.path.realpath(os.path.join(dest_root, relative_name))
            if not target.startswith(dest_root):
                print(f"Skipping unsafe member path: {member.name}")
                continue

            member.name = relative_name
            tar.extract(member, path=extract_to, **extract_kwargs)

        print(f"Extracted {lang_code.upper()} to: {extract_to} (flattened)")


In [2]:
# Unpack the English archive; the relative archive path is resolved against
# the notebook's current working directory.
extract_dataset(
    tar_path="cv-corpus-21.0-delta-2025-03-14-en.tar.gz",
    extract_to=r"C:\Users\WAGHMARE\Desktop\Research Project\commonvoice_en",
)

In [3]:
# Unpack the Hindi archive; the relative archive path is resolved against
# the notebook's current working directory.
extract_dataset(
    tar_path="cv-corpus-21.0-2025-03-14-hi.tar.gz",
    extract_to=r"C:\Users\WAGHMARE\Desktop\Research Project\commonvoice_hi",
)

In [4]:
pip install pydub tqdm

In [5]:
import os

# Root of the extracted Hindi dataset; every other path is derived from it.
dataset_dir = r"C:\Users\WAGHMARE\Desktop\Research Project\commonvoice_hi"
clips_folder = os.path.join(dataset_dir, "clips")
validated_tsv = os.path.join(dataset_dir, "validated.tsv")
output_filtered_tsv = os.path.join(dataset_dir, "validated_filtered.tsv")

# Set membership keeps the per-row lookup O(1).
existing_files = set(os.listdir(clips_folder))

with open(validated_tsv, 'r', encoding='utf-8') as f:
    lines = f.readlines()

if not lines:
    raise ValueError(f"{validated_tsv} is empty; expected a header row.")

header = lines[0]

# Find the clip-filename column from the header instead of assuming its
# position; fall back to the second column if no 'path' header is present.
columns = header.rstrip('\r\n').split('\t')
path_index = columns.index('path') if 'path' in columns else 1

valid_lines = [header]
for line in lines[1:]:
    # Strip only the line terminator: a full strip() would also drop leading
    # or trailing tabs and shift the columns when an edge field is empty.
    parts = line.rstrip('\r\n').split('\t')
    if len(parts) > path_index and parts[path_index] in existing_files:
        valid_lines.append(line)

with open(output_filtered_tsv, 'w', encoding='utf-8') as f:
    f.writelines(valid_lines)

print(f"Filtered validated.tsv to {len(valid_lines)-1} entries with existing files.")

✅ Filtered validated.tsv to 10979 entries with existing files.


In [6]:
import os

# Root of the extracted English dataset; every other path is derived from it.
dataset_dir = r"C:\Users\WAGHMARE\Desktop\Research Project\commonvoice_en"
clips_folder = os.path.join(dataset_dir, "clips")
validated_tsv = os.path.join(dataset_dir, "validated.tsv")
output_filtered_tsv = os.path.join(dataset_dir, "validated_filtered.tsv")

# Set membership keeps the per-row lookup O(1).
existing_files = set(os.listdir(clips_folder))

with open(validated_tsv, 'r', encoding='utf-8') as f:
    lines = f.readlines()

if not lines:
    raise ValueError(f"{validated_tsv} is empty; expected a header row.")

header = lines[0]

# Find the clip-filename column from the header instead of assuming its
# position; fall back to the second column if no 'path' header is present.
columns = header.rstrip('\r\n').split('\t')
path_index = columns.index('path') if 'path' in columns else 1

valid_lines = [header]
for line in lines[1:]:
    # Strip only the line terminator: a full strip() would also drop leading
    # or trailing tabs and shift the columns when an edge field is empty.
    parts = line.rstrip('\r\n').split('\t')
    if len(parts) > path_index and parts[path_index] in existing_files:
        valid_lines.append(line)

with open(output_filtered_tsv, 'w', encoding='utf-8') as f:
    f.writelines(valid_lines)

print(f"Filtered validated.tsv to {len(valid_lines)-1} entries with existing files.")

✅ Filtered validated.tsv to 249 entries with existing files.


In [7]:
import os
from pydub import AudioSegment
from pydub.utils import which

# Point pydub at a specific ffmpeg binary and expose its folder on PATH.
ffmpeg_path = r"C:\Users\WAGHMARE\Downloads\ffmpeg-7.1.1-essentials_build\bin\ffmpeg.exe"
ffmpeg_dir = os.path.dirname(ffmpeg_path)
AudioSegment.converter = ffmpeg_path

# Append the folder only once, so re-running this cell does not keep growing
# PATH; .get() also avoids a KeyError when PATH is not set at all.
current_path = os.environ.get("PATH", "")
if ffmpeg_dir not in current_path.split(os.pathsep):
    os.environ["PATH"] = current_path + os.pathsep + ffmpeg_dir

print("FFmpeg manually set to:", AudioSegment.converter)
print("FFmpeg found by which():", which("ffmpeg"))

FFmpeg manually set to: C:\Users\WAGHMARE\Downloads\ffmpeg-7.1.1-essentials_build\bin\ffmpeg.exe
FFmpeg found by which(): C:\Users\WAGHMARE\Downloads\ffmpeg-7.1.1-essentials_build\bin\ffmpeg.exe




In [8]:
import os
import pandas as pd
import shutil
from pydub import AudioSegment
from tqdm import tqdm

def prepare_data(language_folder, output_folder, max_samples_per_speaker=10):
    """Convert a capped number of MP3 clips per speaker to WAV, grouped by speaker.

    Reads ``validated_filtered.tsv`` from ``language_folder``, keeps at most
    ``max_samples_per_speaker`` rows per ``client_id`` (in file order), and
    converts each referenced clip from ``<language_folder>/clips`` to
    ``<output_folder>/<client_id>/<clip name>.wav``.

    Args:
        language_folder: Dataset folder holding ``validated_filtered.tsv`` and ``clips``.
        output_folder: Destination root; one sub-folder is created per speaker.
        max_samples_per_speaker: Maximum number of clips converted per speaker.

    Returns:
        None. Problems (missing TSV, missing columns, missing clips folder,
        missing or unconvertible clips) are reported with ``print`` instead of
        being raised; per-clip failures do not stop the run.
    """
    import csv  # local import: only needed for the quoting constant below

    validated_file = os.path.join(language_folder, "validated_filtered.tsv")

    if not os.path.exists(validated_file):
        print(f"Error: {validated_file} does not exist!")
        return

    # QUOTE_NONE: a '"' inside a sentence is plain text, not a field delimiter.
    # With default quoting an unbalanced quote makes pandas swallow the
    # following rows into one field. dtype=str keeps ids and file names as
    # text so they can be used as path components.
    df = pd.read_csv(validated_file, sep="\t", quoting=csv.QUOTE_NONE, dtype=str)

    if 'client_id' not in df.columns or 'path' not in df.columns:
        print("Error: validated_filtered.tsv missing required columns.")
        return

    # Drop rows with missing 'path' or 'client_id'
    df = df.dropna(subset=['path', 'client_id'])

    # Keep the first N rows of every speaker, preserving file order
    grouped = df.groupby("client_id").head(max_samples_per_speaker)

    print(f"Total unique speakers in {language_folder}: {df['client_id'].nunique()}")
    print(f"Preparing data for {grouped['client_id'].nunique()} speakers")

    clips_folder = os.path.join(language_folder, "clips")

    if not os.path.exists(clips_folder):
        print(f"Error: Clips folder not found at {clips_folder}")
        return

    os.makedirs(output_folder, exist_ok=True)

    for _, row in tqdm(grouped.iterrows(), total=len(grouped), desc="Converting MP3 to WAV"):
        client_id = row['client_id']
        filename = row['path']
        mp3_path = os.path.join(clips_folder, filename)

        if not os.path.exists(mp3_path):
            print(f"File missing: {mp3_path}")
            continue

        speaker_folder = os.path.join(output_folder, client_id)
        os.makedirs(speaker_folder, exist_ok=True)

        # Swap only the extension; str.replace would rewrite every ".mp3"
        # occurring anywhere in the name.
        wav_filename = os.path.splitext(filename)[0] + ".wav"
        wav_path = os.path.join(speaker_folder, wav_filename)

        try:
            audio = AudioSegment.from_mp3(mp3_path)
            audio.export(wav_path, format="wav")
        except Exception as e:
            # Best effort: report the failing clip and continue with the rest.
            print(f"Error converting {mp3_path}: {e}")

    print("Data preparation complete.")

In [11]:
# Convert up to 10 clips per speaker from the Hindi dataset into per-speaker
# WAV folders under processed_hi.
prepare_data(
    language_folder=r"C:\Users\WAGHMARE\Desktop\Research Project\commonvoice_hi",
    output_folder=r"C:\Users\WAGHMARE\Desktop\Research Project\processed_hi",
    max_samples_per_speaker=10
)

👤 Total unique speakers in C:\Users\WAGHMARE\Desktop\Research Project\commonvoice_hi: 351
🎯 Preparing data for 351 speakers


🔄 Converting MP3 to WAV: 100%|████████████████████████████████████████████████████| 1891/1891 [05:10<00:00,  6.09it/s]

✅ Data preparation complete.





In [12]:
# Convert up to 10 clips per speaker from the English dataset into per-speaker
# WAV folders under processed_en.
prepare_data(
    language_folder=r"C:\Users\WAGHMARE\Desktop\Research Project\commonvoice_en",
    output_folder=r"C:\Users\WAGHMARE\Desktop\Research Project\processed_en",
    max_samples_per_speaker=10
)

👤 Total unique speakers in C:\Users\WAGHMARE\Desktop\Research Project\commonvoice_en: 47
🎯 Preparing data for 47 speakers


🔄 Converting MP3 to WAV: 100%|██████████████████████████████████████████████████████| 125/125 [00:23<00:00,  5.31it/s]

✅ Data preparation complete.



