In [7]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
InTheWildPrep.py - Script for preparing the InTheWild audio dataset

This script:
1. Unzips the dataset (if zipped)
2. Generates comprehensive metadata if none exists
3. Processes the metadata CSV file
4. Prepares the dataset for machine learning use
"""

import numpy as np
import pandas as pd
import os
import shutil
import zipfile

def unzip_dataset(zip_path, extract_to):
    """
    Extracts a zip archive into a target directory

    Parameters:
    - zip_path (str): Location of the archive to extract
    - extract_to (str): Directory that receives the archive contents

    Returns:
    - bool: True when extraction finished; False when the archive is
      missing or any error occurred (the error is printed, not raised)
    """
    try:
        archive_present = os.path.exists(zip_path)
        if archive_present:
            print(f"Unzipping {zip_path} to {extract_to}...")
            with zipfile.ZipFile(zip_path, mode='r') as archive:
                archive.extractall(path=extract_to)
            print("Unzip completed successfully!")
        else:
            print(f"Zip file not found at {zip_path}")
        # Both branches report through the same flag: True only after extraction
        return archive_present
    except Exception as exc:
        print(f"Error unzipping file: {exc}")
        return False

def move_file_outside_folder(folder_path, file_name, destination_path):
    """
    Relocates `file_name` from `folder_path` into `destination_path`

    Returns True once the move succeeds; prints a message and returns
    False when shutil.move raises FileNotFoundError.
    """
    origin = os.path.join(folder_path, file_name)
    target = os.path.join(destination_path, file_name)
    try:
        shutil.move(origin, target)
    except FileNotFoundError:
        print(f"File not found: {origin}")
        return False
    return True
    
def generate_metadata(dataset_root):
    """
    Scans the dataset tree and writes a metadata CSV with train/val/test splits

    The expected layout is <dataset_root>/<split>/<label>/<audio file>, where
    split is "training", "validation" or "testing" and label is "real" or
    "fake". Folders that are missing are reported and skipped. Files are
    matched on a .wav/.flac/.mp3 extension, ignoring case.

    Parameters:
    - dataset_root (str): Root directory of the dataset

    Returns:
    - str: Path to the generated metadata file
      (<dataset_root>/generated_metadata.csv)
    """
    # Explicit folder name -> split name map; stripping "ing" from the folder
    # name only happened to work for these three values.
    split_names = {"training": "train", "validation": "validation", "testing": "test"}
    audio_extensions = (".wav", ".flac", ".mp3")
    columns = ["file", "path", "label", "split"]
    metadata = []

    for split_folder, split_name in split_names.items():
        for label in ("real", "fake"):
            folder_path = os.path.join(dataset_root, split_folder, label)

            # isdir (not exists) so a stray file with this name cannot crash listdir
            if not os.path.isdir(folder_path):
                print(f"Warning: {folder_path} not found. Skipping...")
                continue

            # Sorted so the CSV row order does not depend on the filesystem
            for file_name in sorted(os.listdir(folder_path)):
                if file_name.lower().endswith(audio_extensions):
                    metadata.append({
                        "file": file_name,
                        "path": os.path.join(folder_path, file_name),
                        "label": label,
                        "split": split_name,
                    })

    # Passing the columns keeps the CSV header and the groupby below valid
    # even when no audio file was found (otherwise groupby raises KeyError).
    df = pd.DataFrame(metadata, columns=columns)
    output_csv = os.path.join(dataset_root, "generated_metadata.csv")
    df.to_csv(output_csv, index=False)

    print(f"\nMetadata generation complete. Saved to {output_csv}")
    print(f"Total entries: {len(df)}")
    print("\nCounts by split and label:")
    print(df.groupby(["split", "label"]).size())

    return output_csv

def process_metadata(csv_path):
    """
    Normalises a metadata CSV and writes the result next to it

    Steps applied (each only when the column exists):
    - label: 'spoof' -> 'fake'; 'bona-fide' / 'bonafide' -> 'real'
    - speaker: column dropped
    - split: 'training' -> 'train', 'testing' -> 'test'; other values kept

    Parameters:
    - csv_path (str): Path to the metadata CSV to process

    Returns:
    - str or None: Path to 'processed_metadata.csv' in the same directory as
      csv_path, or None if reading, processing or writing failed (the error
      is printed, not raised)
    """
    label_names = {'spoof': 'fake', 'bona-fide': 'real', 'bonafide': 'real'}
    split_names = {'training': 'train', 'testing': 'test'}

    try:
        df = pd.read_csv(csv_path)

        # Standardize label names if needed
        if 'label' in df.columns:
            df['label'] = df['label'].replace(label_names)

        # Clean up speaker column if exists
        if 'speaker' in df.columns:
            df = df.drop(columns=['speaker'])

        # Whole-value mapping: a substring replace of "ing" would also mangle
        # values that merely contain those letters, and .str fails outright
        # on a non-string column.
        if 'split' in df.columns:
            df['split'] = df['split'].replace(split_names)

        output_csv_path = os.path.join(os.path.dirname(csv_path), 'processed_metadata.csv')
        df.to_csv(output_csv_path, index=False)
        return output_csv_path
    except Exception as e:
        # Best-effort by design: callers check for None instead of catching
        print(f"Error processing metadata: {e}")
        return None

def main():
    """
    Runs the preparation steps for the dataset under `base_path`

    Step 0: unzip the archive when the zip file is present.
    Step 1: process <base_path>/meta.csv when it exists; otherwise generate
            metadata from the extracted dataset folder.

    Failures are reported on stdout; nothing is raised or returned.
    """
    base_path = "./data/ASVspoofBalanced"
    zip_file_path = os.path.join(base_path, "BalancedASVspoof2021PA.zip")
    dataset_folder = os.path.join(base_path, "BalancedASVspoof2021PA")

    print("Starting dataset preparation...")

    # Step 0: Unzip dataset if needed
    print("\nStep 0: Checking for zipped dataset...")
    if os.path.exists(zip_file_path):
        if not unzip_dataset(zip_file_path, base_path):
            # unzip_dataset already printed the cause; keep going so a
            # previously extracted copy can still be used.
            print("Unzip failed, continuing with any previously extracted data")
    else:
        print("No zip file found, assuming dataset is already extracted")

    # Step 1: Check for existing metadata or generate new
    print("\nStep 1: Handling metadata...")
    meta_path = os.path.join(base_path, "meta.csv")

    if os.path.exists(meta_path):
        print("Found existing metadata file, processing...")
        processed_meta_path = process_metadata(meta_path)
        # process_metadata signals failure with None; do not report it as a path
        if processed_meta_path is None:
            print(f"Error: Could not process metadata at {meta_path}")
        else:
            print(f"Processed metadata saved to: {processed_meta_path}")
    else:
        print("No existing metadata found, generating new metadata...")
        if os.path.exists(dataset_folder):
            generated_meta_path = generate_metadata(dataset_folder)
            print(f"Generated metadata saved to: {generated_meta_path}")
        else:
            print(f"Error: Dataset folder not found at {dataset_folder}")

    print("\nDataset preparation completed!")

if __name__ == "__main__":
    main()

Starting dataset preparation...

Step 0: Checking for zipped dataset...
No zip file found, assuming dataset is already extracted

Step 1: Handling metadata...
No existing metadata found, generating new metadata...

Metadata generation complete. Saved to ./data/ASVspoofBalanced/generated_metadata.csv
Total entries: 110468

Counts by split and label:
label
fake    50000
real    60468
dtype: int64
Generated metadata saved to: ./data/ASVspoofBalanced/generated_metadata.csv

Dataset preparation completed!


In [14]:
import os
import shutil

def move_all_files_to_parent(folder_path, verbose=True):
    """
    Moves all files from a subfolder into its parent directory.

    Subdirectories are left in place. A file is skipped (never overwritten)
    when the parent already contains an entry with the same name.

    Parameters:
    - folder_path (str): The path to the subfolder whose contents should be moved.
      A trailing path separator is accepted.
    - verbose (bool): Print one "Moved: ..." line per file (default True).
      Skipped files and the final summary are always printed.
    """
    if not os.path.isdir(folder_path):
        print(f"Folder not found: {folder_path}")
        return

    # abspath drops a trailing separator and resolves "." / "..". Without it,
    # dirname("a/b/") is "a/b" -- the folder itself -- and nothing really moves.
    parent_path = os.path.dirname(os.path.abspath(folder_path))
    moved_files = 0
    skipped_files = 0

    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        dest_path = os.path.join(parent_path, filename)

        # Only move files (not subdirectories)
        if not os.path.isfile(file_path):
            continue

        # Refuse to clobber: on POSIX a move onto an existing file silently
        # replaces it, which would lose data when names collide.
        if os.path.lexists(dest_path):
            skipped_files += 1
            print(f"Skipped (already exists in parent): {filename}")
            continue

        shutil.move(file_path, dest_path)
        moved_files += 1
        if verbose:
            print(f"Moved: {filename}")

    print(f"\nTotal files moved: {moved_files} from '{folder_path}' → '{parent_path}'")
    if skipped_files:
        print(f"Total files skipped: {skipped_files}")

# Flatten each split folder in turn: validation, then testing, then training
for subfolder_to_flatten in (
    'data/FOR/FOR_dataset/validation',
    'data/FOR/FOR_dataset/testing',
    'data/FOR/FOR_dataset/training',
):
    move_all_files_to_parent(subfolder_to_flatten)

Moved: recording15419.wav_norm_mono.wav
Moved: recording16738.wav_norm_mono.wav
Moved: recording16713.wav_norm_mono.wav
Moved: recording15331.wav_norm_mono.wav
Moved: recording15779.wav_norm_mono.wav
Moved: recording14870.wav_norm_mono.wav
Moved: recording16695.wav_norm_mono.wav
Moved: recording15935.wav_norm_mono.wav
Moved: recording16493.wav_norm_mono.wav
Moved: recording14250.wav_norm_mono.wav
Moved: recording16649.wav_norm_mono.wav
Moved: recording15563.wav_norm_mono.wav
Moved: recording14149.wav_norm_mono.wav
Moved: recording16248.wav_norm_mono.wav
Moved: recording14361.wav_norm_mono.wav
Moved: recording15031.wav_norm_mono.wav
Moved: recording15684.wav_norm_mono.wav
Moved: recording14935.wav_norm_mono.wav
Moved: recording15959.wav_norm_mono.wav
Moved: recording14713.wav_norm_mono.wav
Moved: recording14920.wav_norm_mono.wav
Moved: recording16544.wav_norm_mono.wav
Moved: recording15756.wav_norm_mono.wav
Moved: recording16223.wav_norm_mono.wav
Moved: recording14247.wav_norm_mono.wav


In [10]:
import shutil
import os

def delete_folder(folder_path):
    """
    Recursively removes a directory and everything inside it.

    Prints the outcome instead of raising: a confirmation on success, the
    error text if removal fails, or a notice when the path is missing or is
    not a directory.

    Parameters:
    - folder_path (str): The full path to the folder to delete.
    """
    # isdir() is already False for a missing path, so it covers both checks
    if not os.path.isdir(folder_path):
        print(f"Folder does not exist or is not a directory: {folder_path}")
        return

    try:
        shutil.rmtree(folder_path)
        print(f"Deleted folder: {folder_path}")
    except Exception as exc:
        print(f"Error deleting folder: {exc}")

# Remove the per-label validation folders, fake first and then real
for folder_to_delete in ('data/FOR/validation/fake', 'data/FOR/validation/real'):
    delete_folder(folder_to_delete)

Folder does not exist or is not a directory: data/FOR/validation/fake
Folder does not exist or is not a directory: data/FOR/validation/real


In [12]:
import pandas as pd

# Metadata table to update; note that it is rewritten in place at the end
csv_path = './data/FOR/FOR_metadata.csv'
df = pd.read_csv(csv_path)

# Column names as loaded
print("Before renaming:", list(df.columns))

# Rename 'filename' -> 'file' (a no-op if the column is already named 'file')
df = df.rename(columns={'filename': 'file'})

# Column names after the rename
print("After renaming:", list(df.columns))

# Overwrite the source CSV with the renamed columns.
# To keep the original file untouched, pass a different output path instead.
df.to_csv(path_or_buf=csv_path, index=False)


Before renaming: ['filename', 'path', 'label', 'split']
After renaming: ['file', 'path', 'label', 'split']
