In [1]:
import os
import pandas as pd
import shutil
import random

In [None]:
TSV_INPUT_PATH = "../data_filtered_data_labeled.tsv"
FILTERED_DIR = "../data_filtered"
TSV_OUTPUT_PATH = "../stats/raw_data_labeled.tsv"

In [3]:
def filter_tsv_by_audio_existence(tsv_path, filtered_dir, output_path):
    """
    Read a TSV file, check if audio files in 'path' column exist in filtered_dir,
    and write a new TSV with only rows where the audio file exists.
    """
    try:
        df = pd.read_csv(tsv_path, sep="\t")
    except Exception as e:
        print(f"Error reading TSV file: {str(e)}")
        return

    if "path" not in df.columns:
        print("Error: 'path' column not found in TSV file.")
        return

    valid_rows = []

    for index, row in df.iterrows():
        audio_path = os.path.join(filtered_dir, row["path"])

        if os.path.exists(audio_path):
            valid_rows.append(row)
        else:
            print(f"Audio file not found, skipping: {row['path']}")

    if valid_rows:
        filtered_df = pd.DataFrame(valid_rows)

        try:
            filtered_df.to_csv(output_path, sep="\t", index=False)
            print(f"Filtered TSV saved to {output_path}")
            print(f"Original rows: {len(df)}, Filtered rows: {len(filtered_df)}")
        except Exception as e:
            print(f"Error writing output TSV: {str(e)}")
    else:
        print("No valid rows found. No output TSV created.")

In [4]:
if not os.path.exists(FILTERED_DIR):
    print(f"Filtered directory does not exist: {FILTERED_DIR}")

else:
    filter_tsv_by_audio_existence(TSV_INPUT_PATH, FILTERED_DIR, TSV_OUTPUT_PATH)

Audio file not found, skipping: common_voice_en_529569.mp3
Audio file not found, skipping: common_voice_en_529573.mp3
Audio file not found, skipping: common_voice_en_570619.mp3
Audio file not found, skipping: common_voice_en_570621.mp3
Audio file not found, skipping: common_voice_en_570622.mp3
Audio file not found, skipping: common_voice_en_570623.mp3
Audio file not found, skipping: common_voice_en_508243.mp3
Audio file not found, skipping: common_voice_en_54973.mp3
Audio file not found, skipping: common_voice_en_82281.mp3
Audio file not found, skipping: common_voice_en_541573.mp3
Audio file not found, skipping: common_voice_en_84713.mp3
Audio file not found, skipping: common_voice_en_589235.mp3
Audio file not found, skipping: common_voice_en_589236.mp3
Audio file not found, skipping: common_voice_en_589238.mp3
Audio file not found, skipping: common_voice_en_589239.mp3
Audio file not found, skipping: common_voice_en_589240.mp3
Audio file not found, skipping: common_voice_en_453064.mp3


In [7]:
def count_label_distribution(tsv_path):
    """
    Read a TSV file and count the number of patterns for each label (0 to 3).
    """
    # Read TSV file
    try:
        df = pd.read_csv(tsv_path, sep="\t")
    except Exception as e:
        print(f"Error reading TSV file: {str(e)}")
        return

    # Check if 'label' column exists
    if "label" not in df.columns:
        print("Error: 'label' column not found in TSV file.")
        return

    # Count occurrences of each label
    label_counts = df["label"].value_counts().sort_index()

    # Ensure all labels (0 to 3) are represented, even if count is 0
    for label in range(4):
        if label not in label_counts.index:
            label_counts[label] = 0

    # Print distribution
    print("\nLabel Distribution:")
    print("-------------------")
    for label in range(4):
        count = label_counts.get(label, 0)
        print(f"Label {label}: {count} patterns")
    print(f"Total patterns: {len(df)}")

In [9]:
count_label_distribution(TSV_INPUT_PATH)


Label Distribution:
-------------------
Label 0: 119700 patterns
Label 1: 19341 patterns
Label 2: 17292 patterns
Label 3: 15825 patterns
Total patterns: 172158


In [16]:
AUDIO_DIR = "./filtered"
OUTPUT_BASE_DIR = "./data_batches"
SAMPLE_SIZE = 15825

# Define output directories for each label
LABEL_DIRS = {
    0: os.path.join(OUTPUT_BASE_DIR, "Male_Twenties"),
    1: os.path.join(OUTPUT_BASE_DIR, "Female_Twenties "),
    2: os.path.join(OUTPUT_BASE_DIR, "Male_Fifties"),
    3: os.path.join(OUTPUT_BASE_DIR, "Female_Fifties"),
}

In [None]:
def setup_output_directories():
    """Create output directories for each label if they don't exist."""
    for label, label_dir in LABEL_DIRS.items():
        normalized_dir = os.path.normpath(label_dir)
        os.makedirs(normalized_dir, exist_ok=True)
        print(f"Created/Verified directory for label {label}: {normalized_dir}")


def copy_audio_files(df, label, output_dir, sample_size=None):
    """
    Copy audio files for a given label to the specified output directory.
    If sample_size is provided, randomly select that many files.
    """
    # Normalize output directory path
    output_dir = os.path.normpath(output_dir)

    # Filter rows for the given label
    label_df = df[df["label"] == label]
    print(f"Label {label}: Found {len(label_df)} patterns in TSV")

    # If sample_size is specified, randomly sample the rows
    if sample_size and len(label_df) > sample_size:
        label_df = label_df.sample(
            n=sample_size, random_state=42
        )  # Fixed seed for reproducibility
        print(f"Label {label}: Sampled {sample_size} patterns")
    elif sample_size and len(label_df) < sample_size:
        print(
            f"Warning: Only {len(label_df)} patterns available for label {label}, less than requested {sample_size}"
        )

    # Copy audio files
    copied_files = 0
    missing_files = 0
    for _, row in label_df.iterrows():
        audio_path = os.path.normpath(os.path.join(AUDIO_DIR, row["path"]))
        output_path = os.path.normpath(os.path.join(output_dir, row["path"]))

        # Ensure the audio file exists
        if os.path.exists(audio_path):
            # Handle duplicate filenames
            base, ext = os.path.splitext(row["path"])
            counter = 1
            while os.path.exists(output_path):
                output_path = os.path.normpath(
                    os.path.join(output_dir, f"{base}_{counter}{ext}")
                )
                counter += 1

            try:
                shutil.copy2(audio_path, output_path)  # Copy file, preserving metadata
                copied_files += 1
            except Exception as e:
                print(f"Error copying {audio_path}: {str(e)}")
        else:
            print(f"Audio file not found: {audio_path}")
            missing_files += 1

    print(f"Label {label}: {copied_files} files copied, {missing_files} files missing")
    return copied_files


def process_tsv(tsv_path, audio_dir):
    """
    Process TSV file, select patterns by label, and copy audio files to respective folders.
    """
    # Normalize audio directory path
    audio_dir = os.path.normpath(audio_dir)

    # Verify audio directory exists
    if not os.path.exists(audio_dir):
        print(f"Error: Audio directory does not exist: {audio_dir}")
        return

    # Read TSV file
    try:
        df = pd.read_csv(tsv_path, sep="\t")
        print(f"TSV loaded: {len(df)} rows")
    except Exception as e:
        print(f"Error reading TSV file: {str(e)}")
        return

    # Check for required columns
    if "label" not in df.columns or "path" not in df.columns:
        print("Error: 'label' or 'path' column not found in TSV file.")
        return

    # Print label distribution
    print("\nLabel distribution in TSV:")
    for label in range(4):
        count = len(df[df["label"] == label])
        print(f"Label {label}: {count} patterns")

    # Create output directories
    setup_output_directories()

    # Process each label
    print("\nProcessing labels...")
    for label in range(4):
        output_dir = LABEL_DIRS[label]
        sample_size = None if label == 3 else SAMPLE_SIZE  # No sampling for label 3

        print(f"\nProcessing label {label}...")
        copied_files = copy_audio_files(df, label, output_dir, sample_size)
        print(f"Label {label}: Copied {copied_files} audio files to {output_dir}")

In [20]:
# Ensure audio directory exists
if not os.path.exists(AUDIO_DIR):
    print(f"Audio directory does not exist: {AUDIO_DIR}")


else:
    process_tsv(TSV_INPUT_PATH, AUDIO_DIR)

Created/Verified directory for label 0: data_batches\Male_Twenties
Created/Verified directory for label 1: data_batches\Female_Twenties 
Created/Verified directory for label 2: data_batches\Male_Fifties
Created/Verified directory for label 3: data_batches\Female_Fifties

Processing labels...

Processing label 0...
Label 0: 15825 files copied, 0 files missing
Label 0: Copied 15825 audio files to ./data_batches\Male_Twenties

Processing label 1...
Error copying filtered\common_voice_en_311731.mp3: [WinError 3] The system cannot find the path specified
Error copying filtered\common_voice_en_18667542.mp3: [WinError 3] The system cannot find the path specified
Error copying filtered\common_voice_en_19634135.mp3: [WinError 3] The system cannot find the path specified
Error copying filtered\common_voice_en_19538473.mp3: [WinError 3] The system cannot find the path specified
Error copying filtered\common_voice_en_19537203.mp3: [WinError 3] The system cannot find the path specified
Error copyin

In [23]:
OUTPUT_DIR = "./data_batches/Female_Twenties"


def process_label_1_patterns(tsv_path, audio_dir, output_dir, sample_size):
    """
    Process a random sample of up to sample_size patterns with label 1 from the TSV file,
    copying their audio files from audio_dir to output_dir.

    Args:
        tsv_path (str): Path to the TSV file
        audio_dir (str): Directory containing audio files
        output_dir (str): Directory to copy audio files to
        sample_size (int): Number of patterns to randomly sample

    Returns:
        tuple: (copied_files, missing_files) - Count of successfully copied and missing files
    """
    # Normalize paths for cross-platform compatibility
    tsv_path = os.path.normpath(tsv_path)
    audio_dir = os.path.normpath(audio_dir)
    output_dir = os.path.normpath(output_dir)

    # Verify audio directory exists
    if not os.path.exists(audio_dir):
        print(f"Error: Audio directory does not exist: {audio_dir}")
        return 0, 0

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created/Verified output directory: {output_dir}")

    # Read TSV file
    try:
        df = pd.read_csv(tsv_path, sep="\t")
        print(f"TSV loaded: {len(df)} rows")
    except Exception as e:
        print(f"Error reading TSV file: {str(e)}")
        return 0, 0

    # Check for required columns
    if "label" not in df.columns or "path" not in df.columns:
        print("Error: 'label' or 'path' column not found in TSV file.")
        return 0, 0

    # Filter for label 1 patterns
    label_1_df = df[df["label"] == 1]
    print(f"Label 1 (Female_Twenties): Found {len(label_1_df)} patterns in TSV")

    if len(label_1_df) == 0:
        print("No patterns found for label 1. Exiting.")
        return 0, 0

    # Randomly sample up to sample_size patterns
    if len(label_1_df) > sample_size:
        label_1_df = label_1_df.sample(n=sample_size, random_state=42)
        print(f"Label 1: Sampled {sample_size} patterns")
    elif len(label_1_df) < sample_size:
        print(
            f"Warning: Only {len(label_1_df)} patterns available, less than requested {sample_size}"
        )

    # Copy audio files
    copied_files = 0
    missing_files = 0

    for _, row in label_1_df.iterrows():
        audio_path = os.path.normpath(os.path.join(audio_dir, row["path"]))
        output_path = os.path.normpath(os.path.join(output_dir, row["path"]))

        # Check if audio file exists
        if os.path.exists(audio_path):
            # Handle duplicate filenames
            base, ext = os.path.splitext(row["path"])
            counter = 1
            while os.path.exists(output_path):
                output_path = os.path.normpath(
                    os.path.join(output_dir, f"{base}_{counter}{ext}")
                )
                counter += 1

            try:
                shutil.copy2(audio_path, output_path)  # Copy file, preserving metadata
                copied_files += 1
                print(f"Copied: {audio_path} -> {output_path}")
            except Exception as e:
                print(f"Error copying {audio_path}: {str(e)}")
        else:
            print(f"Audio file not found: {audio_path}")
            missing_files += 1

    print(f"\nSummary for Label 1 (Female_Twenties):")
    print(f"Total patterns processed: {len(label_1_df)}")
    print(f"Files copied: {copied_files}")
    print(f"Files missing: {missing_files}")

    return copied_files, missing_files


copied, missing = process_label_1_patterns(
    TSV_INPUT_PATH, AUDIO_DIR, OUTPUT_DIR, SAMPLE_SIZE
)
print(f"\nFinal Summary: {copied} files copied, {missing} files missing")

Created/Verified output directory: data_batches\Female_Twenties
TSV loaded: 172158 rows
Label 1 (Female_Twenties): Found 19341 patterns in TSV
Label 1: Sampled 15825 patterns
Copied: filtered\common_voice_en_311731.mp3 -> data_batches\Female_Twenties\common_voice_en_311731.mp3
Copied: filtered\common_voice_en_18667542.mp3 -> data_batches\Female_Twenties\common_voice_en_18667542.mp3
Copied: filtered\common_voice_en_19634135.mp3 -> data_batches\Female_Twenties\common_voice_en_19634135.mp3
Copied: filtered\common_voice_en_19538473.mp3 -> data_batches\Female_Twenties\common_voice_en_19538473.mp3
Copied: filtered\common_voice_en_19537203.mp3 -> data_batches\Female_Twenties\common_voice_en_19537203.mp3
Copied: filtered\common_voice_en_17653227.mp3 -> data_batches\Female_Twenties\common_voice_en_17653227.mp3
Copied: filtered\common_voice_en_19635928.mp3 -> data_batches\Female_Twenties\common_voice_en_19635928.mp3
Copied: filtered\common_voice_en_18664651.mp3 -> data_batches\Female_Twenties\co