# In [None]:
import soundfile as sf
import numpy as np
import pandas as pd
import os
from pathlib import Path

def _chunk_spans(total_samples, samples_per_chunk, step_size):
    """
    Compute the sample ranges of every chunk for one audio file.

    Parameters:
    - total_samples: Number of frames in the audio file.
    - samples_per_chunk: Length of each chunk in samples (must be >= 1).
    - step_size: Distance in samples between consecutive chunk starts (must be >= 1).

    Returns:
    - List of (start_sample, end_sample, is_tail) tuples. Regular chunks start at
      multiples of step_size; if they leave samples at the end uncovered, one extra
      full-length chunk aligned to the end of the audio is appended with is_tail=True.
      Audio shorter than one chunk yields an empty list.
    """
    spans = [
        (start, start + samples_per_chunk, False)
        for start in range(0, total_samples - samples_per_chunk + 1, step_size)
    ]
    # Only add the end-aligned chunk when the regular chunks really stop short of
    # the end; checking coverage (rather than len % step) stays correct with overlap.
    if spans and spans[-1][1] < total_samples:
        spans.append((total_samples - samples_per_chunk, total_samples, True))
    return spans


def split_audio_and_create_metadata(audio_dir, output_dir, class_labels, chunk_length=6, overlap=0):
    """
    Split all WAV files into chunks and create a single metadata file, with chunks stored in road folders named after the audio files.

    Every chunk has exactly chunk_length seconds. When the regular chunks do not reach
    the end of a file, one additional chunk aligned to the end of the audio is written
    (it overlaps the previous chunk). Files shorter than chunk_length produce no chunks.
    Problems are reported with print() and the affected file/chunk is skipped; the
    function does not raise for them.

    Parameters:
    - audio_dir: Directory containing WAV audio files.
    - output_dir: Directory to save road folders, audio chunks, and metadata.
    - class_labels: Dictionary mapping audio filenames to (class, classID) tuples, or path to a CSV
      with 'filename', 'class' and 'classID' columns. None is treated as an empty dictionary.
    - chunk_length: Length of each chunk in seconds (default: 6). Must be > 0.
    - overlap: Overlap between chunks in seconds (default: 0). Must satisfy 0 <= overlap < chunk_length.

    Returns:
    - None. Chunks are written under output_dir/<road>/ and the metadata to output_dir/metadata.csv.
    """

    # Reject settings that would give a zero or negative step before touching the disk
    if chunk_length <= 0 or overlap < 0 or overlap >= chunk_length:
        print(
            f"Error: invalid chunk settings (chunk_length={chunk_length}, overlap={overlap}). "
            "Require chunk_length > 0 and 0 <= overlap < chunk_length."
        )
        return

    # Verify audio_dir exists
    if not os.path.exists(audio_dir):
        print(f"Error: audio_dir '{audio_dir}' does not exist.")
        return

    # Create output directory if it doesn't exist
    try:
        os.makedirs(output_dir, exist_ok=True)
        print(f"Output directory created or exists: {output_dir}")
    except Exception as e:
        print(f"Error creating output_dir '{output_dir}': {e}")
        return

    # Initialize metadata list
    metadata = []

    # Handle class labels
    if class_labels is None:
        class_labels = {}
    elif isinstance(class_labels, (str, os.PathLike)):
        labels_csv = class_labels
        try:
            label_df = pd.read_csv(labels_csv)
            class_labels = dict(
                zip(label_df['filename'], zip(label_df['class'], label_df['classID']))
            )
            print(f"Loaded class labels from CSV: {class_labels}")
        except Exception as e:
            print(f"Error loading class_labels CSV '{labels_csv}': {e}")
            return

    # Find all audio files in audio_dir (sorted so metadata row order is reproducible)
    audio_files = [str(p) for p in sorted(Path(audio_dir).glob("*.wav"))]
    print(f"Found {len(audio_files)} WAV files in {audio_dir}: {audio_files}")

    if not audio_files:
        print("No WAV files found. Exiting.")
        return

    # Process each audio file
    for audio_path in audio_files:
        audio_filename = os.path.basename(audio_path)
        print(f"Processing file: {audio_filename}")

        # Get class label and class id from dictionary
        class_label, class_id = class_labels.get(audio_filename, ("unknown", -1))
        print(f"Assigned class_label: {class_label}, class_id: {class_id}")

        # Generate fsID (e.g., first part of filename like 100032 in 100032-3.wav)
        fsID = audio_filename.split('-')[0] if '-' in audio_filename else audio_filename.split('.')[0]
        print(f"fsID: {fsID}")

        # Define road name as the filename without extension
        road_name = os.path.splitext(audio_filename)[0]
        road_dir = os.path.join(output_dir, road_name)
        try:
            os.makedirs(road_dir, exist_ok=True)
            print(f"Created road directory: {road_dir}")
        except Exception as e:
            print(f"Error creating road_dir '{road_dir}': {e}")
            continue

        # Load audio file with soundfile
        try:
            audio, sr = sf.read(audio_path)
            audio_duration = len(audio) / sr
            print(f"Loaded audio with soundfile: {audio_path}, duration: {audio_duration}s, sample rate: {sr}Hz")
        except Exception as e:
            print(f"Error loading audio file '{audio_path}' with soundfile: {e}")
            continue

        # Calculate number of samples per chunk
        samples_per_chunk = int(chunk_length * sr)
        step_size = int((chunk_length - overlap) * sr)
        print(f"Samples per chunk: {samples_per_chunk}, step size: {step_size}")

        # Sub-sample durations truncate to 0 here, which would break range()
        if samples_per_chunk < 1 or step_size < 1:
            print(f"Error: chunk settings round to zero samples at {sr}Hz for '{audio_path}'. Skipping file.")
            continue

        # Split audio into chunks; the index restarts at 0 for every file and the
        # end-aligned tail chunk (if any) simply takes the next index.
        spans = _chunk_spans(len(audio), samples_per_chunk, step_size)
        for chunk_index, (start_sample, end_sample, is_tail) in enumerate(spans):
            kind = "last chunk" if is_tail else "chunk"

            # Extract chunk
            chunk = audio[start_sample:end_sample]

            # Define chunk filename
            slice_file_name = f"{fsID}-{class_id}-{chunk_index}.wav"
            chunk_path = os.path.join(road_dir, slice_file_name)

            # Save chunk as WAV file; a failed write gets no metadata row
            try:
                sf.write(chunk_path, chunk, sr)
                print(f"Saved {kind}: {chunk_path}")
            except Exception as e:
                print(f"Error saving {kind} '{chunk_path}': {e}")
                continue

            # Append to metadata (start/end are in seconds)
            metadata.append({
                "slice_file_name": slice_file_name,
                "fsID": fsID,
                "start": start_sample / sr,
                "end": end_sample / sr,
                "road": road_name,
                "classID": class_id,
                "class": class_label
            })
            print(f"Added metadata for {kind}: {slice_file_name}")

    # Create metadata DataFrame and save to CSV
    if metadata:
        metadata_df = pd.DataFrame(metadata)
        metadata_path = os.path.join(output_dir, "metadata.csv")
        try:
            metadata_df.to_csv(metadata_path, index=False)
            print(f"Metadata saved to {metadata_path}")
            print(f"Created {len(metadata)} chunks from {len(audio_files)} files.")
        except Exception as e:
            print(f"Error saving metadata to '{metadata_path}': {e}")
    else:
        print("No metadata to save. No chunks were created.")

# Example usage: chunk every WAV in the noise-data folder into 6-second, non-overlapping slices
audio_dir = "data//noiseData"  # Folder holding the source WAV recordings
output_dir = "data/DATA/output_chunks"  # Destination for road folders, chunks, and metadata
class_labels = {}  # No labels supplied: files get the "unknown" label, to be edited manually
chunk_length = 6  # Seconds per chunk
overlap = 0  # Seconds shared between consecutive chunks

split_audio_and_create_metadata(
    audio_dir=audio_dir,
    output_dir=output_dir,
    class_labels=class_labels,
    chunk_length=chunk_length,
    overlap=overlap,
)