In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
InTheWildPrep.py - Script for preparing the InTheWild audio dataset

This script:
1. Unzips the dataset (if zipped)
2. Organizes audio files into 'real' and 'fake' folders based on metadata
3. Processes the metadata CSV file
4. Prepares the dataset for machine learning use
"""

import numpy as np
import pandas as pd
import os
import shutil
import zipfile

def unzip_dataset(zip_path, extract_to):
    """
    Extracts a zip archive into a target directory, if the archive exists.

    Any error raised while checking for or extracting the archive is
    reported on stdout and turned into a False return value.

    Parameters:
    - zip_path (str): Location of the zip archive
    - extract_to (str): Directory that receives the extracted contents

    Returns:
    - bool: True when extraction finished, False when the archive is
      missing or extraction failed
    """
    try:
        archive_present = os.path.exists(zip_path)
        if archive_present:
            print(f"Unzipping {zip_path} to {extract_to}...")
            with zipfile.ZipFile(zip_path, 'r') as archive:
                archive.extractall(extract_to)
            print("Unzip completed successfully!")
        else:
            print(f"Zip file not found at {zip_path}")
        return archive_present
    except Exception as err:
        # Best-effort: report the problem and let the caller decide what to do.
        print(f"Error unzipping file: {err}")
        return False

def move_file_outside_folder(folder_path, file_name, destination_path):
    """
    Moves a single file from one folder into a destination directory.

    Parameters:
    - folder_path (str): Folder that currently holds the file
    - file_name (str): Name of the file to move
    - destination_path (str): Directory the file is moved into

    Returns:
    - bool: True if the move succeeded, False if a FileNotFoundError
      was raised during the move
    """
    origin = os.path.join(folder_path, file_name)
    target = os.path.join(destination_path, file_name)
    try:
        shutil.move(origin, target)
    except FileNotFoundError:
        print(f"File not found: {origin}")
        return False
    return True

def process_metadata(csv_path):
    """
    Rewrites the metadata CSV with normalized labels.

    The 'speaker' column is removed when present, and the label values
    'spoof' / 'bona-fide' are renamed to 'fake' / 'real'. The result is
    written as 'modified_meta.csv' next to the input file.

    Parameters:
    - csv_path (str): Path to the metadata CSV (must contain a 'label' column)

    Returns:
    - str: Path of the written 'modified_meta.csv'
    """
    label_map = {'spoof': 'fake', 'bona-fide': 'real'}

    meta = pd.read_csv(csv_path)
    # errors='ignore' makes the drop a no-op when there is no 'speaker' column.
    meta = meta.drop(columns=['speaker'], errors='ignore')
    meta['label'] = meta['label'].replace(label_map)

    target_dir = os.path.dirname(csv_path)
    output_csv_path = os.path.join(target_dir, 'modified_meta.csv')
    meta.to_csv(output_csv_path, index=False)
    return output_csv_path

def main(base_path="./data/InTheWild", zip_name="ITWdownload.zip"):
    """
    Runs the dataset preparation steps in order.

    Steps:
    0. Unzip the archive into base_path if the zip file exists.
    1. Move 'meta.csv' from the 'release_in_the_wild' folder up to base_path.
    2. Process 'meta.csv' into 'modified_meta.csv'.

    Each step is skipped (with a message) when its input is missing, so the
    function can be re-run on a partially prepared directory.

    Parameters:
    - base_path (str): Directory that holds the zip file and the extracted
      dataset. Defaults to the original hard-coded location.
    - zip_name (str): File name of the dataset archive inside base_path.
    """
    zip_file_path = os.path.join(base_path, zip_name)
    dataset_folder = os.path.join(base_path, "release_in_the_wild")

    print("Starting dataset preparation...")

    # Step 0: Unzip dataset if needed
    print("\nStep 0: Checking for zipped dataset...")
    if os.path.exists(zip_file_path):
        if unzip_dataset(zip_file_path, base_path):
            # Show what the archive produced. The listing is done inline
            # because no directory-listing helper is defined in this file.
            print(f"Contents of {base_path}:")
            for entry in sorted(os.listdir(base_path)):
                print(f"  {entry}")
    else:
        print("No zip file found, assuming dataset is already extracted")

    # Step 1: Move meta.csv if needed
    print("\nStep 1: Moving metadata file...")
    if os.path.exists(os.path.join(dataset_folder, "meta.csv")):
        move_success = move_file_outside_folder(
            folder_path=dataset_folder,
            file_name="meta.csv",
            destination_path=base_path
        )
        if not move_success:
            print(f"Could not move meta.csv out of {dataset_folder}")

    # Step 2: Process metadata
    print("\nStep 2: Processing metadata...")
    meta_path = os.path.join(base_path, "meta.csv")
    if os.path.exists(meta_path):
        modified_meta_path = process_metadata(meta_path)
        print(f"Modified metadata saved to: {modified_meta_path}")
    else:
        print(f"Metadata file not found at {meta_path}")

    print("\nDataset preparation completed!")

# Run the preparation steps only when this code is executed directly
# (as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

In [2]:
import os
import shutil

def move_files_back_to_root(folder_path):
    """
    Moves all .wav files from 'real' and 'fake' subfolders back to the parent folder.

    A file is never moved onto an existing path: if the parent folder already
    contains an entry with the same name (for example because 'real' and
    'fake' both hold a file with that name), the file is left where it is
    and reported, so no audio is silently overwritten. Subfolders that end
    up empty are removed afterwards.

    Args:
        folder_path (str): Path to the directory containing 'real' and 'fake' subfolders
    """
    # Define subfolders to process
    subfolders = ['real', 'fake']

    # Counters for the summary line
    moved_files = 0
    total_files = 0

    for subfolder in subfolders:
        subfolder_path = os.path.join(folder_path, subfolder)

        # isdir (not exists): a stray file named 'real'/'fake' must not reach listdir
        if not os.path.isdir(subfolder_path):
            print(f"⚠️ Subfolder '{subfolder}' not found in {folder_path}")
            continue

        print(f"\nProcessing '{subfolder}' folder...")

        for filename in os.listdir(subfolder_path):
            if not filename.lower().endswith('.wav'):
                continue
            total_files += 1

            src = os.path.join(subfolder_path, filename)
            dst = os.path.join(folder_path, filename)

            # shutil.move would replace an existing file at dst; skip instead.
            if os.path.lexists(dst):
                print(f"✗ Skipped {filename}: already exists in {folder_path}")
                continue

            try:
                shutil.move(src, dst)
            except OSError as e:
                # Best-effort: report the failure and keep going with the rest.
                print(f"✗ Failed to move {filename}: {str(e)}")
            else:
                moved_files += 1

    # Summary
    print(f"\n✅ Done! Moved {moved_files}/{total_files} files back to {folder_path}")

    # Remove subfolders that are now empty
    for subfolder in subfolders:
        subfolder_path = os.path.join(folder_path, subfolder)
        if os.path.isdir(subfolder_path) and not os.listdir(subfolder_path):
            os.rmdir(subfolder_path)
            print(f"♻️ Removed empty folder: {subfolder_path}")

# Example usage: when this cell/script is executed directly, move the .wav
# files from the 'real' and 'fake' subfolders of the dataset folder back
# into the dataset folder itself.
if __name__ == "__main__":
    base_path = "./data/InTheWild"
    dataset_folder = os.path.join(base_path, "release_in_the_wild") # Change this to your folder
    move_files_back_to_root(dataset_folder)


Processing 'real' folder...

Processing 'fake' folder...

✅ Done! Moved 18687/18687 files back to ./data/InTheWild/release_in_the_wild
