In [1]:
import os
import librosa
import shutil
from tqdm import tqdm
import numpy as np

In [None]:
# Directory scanned for candidate audio files.
BASE_INPUT_DIR = "../data_raw"
# Directory that receives copies of the files that pass validation.
OUTPUT_DIR = "../data_filtered"
# Filename suffixes treated as audio; compared against the lower-cased filename.
SUPPORTED_EXTENSIONS = (".wav", ".mp3")

In [3]:
# Make sure the destination directory exists before anything is copied into it.
# exist_ok=True keeps this cell safe to re-run when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
def is_audio_valid(file_path):
    """Report whether ``file_path`` decodes to usable audio.

    The file is loaded with ``librosa.load`` at its native sampling rate
    (``sr=None``). It is rejected when the decoded signal has fewer than
    100 samples or when every sample is exactly zero. Any exception raised
    while loading or checking is printed and treated as a rejection.

    Args:
        file_path: Path of the audio file to check.

    Returns:
        bool: True when the file loads and passes both checks, otherwise False.
    """
    try:
        samples, _ = librosa.load(file_path, sr=None)
        long_enough = len(samples) >= 100
        # Only inspect the sample values once the length check has passed.
        return long_enough and not np.all(samples == 0)
    except Exception as err:
        print(f"Error loading {file_path}: {str(err)}")
        return False

In [5]:
def _unique_destination(output_dir, filename):
    """Return a path inside ``output_dir`` for ``filename`` that does not exist yet.

    If ``filename`` is already taken, ``_1``, ``_2``, ... is appended to the
    stem until an unused name is found.
    """
    candidate = os.path.join(output_dir, filename)
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(output_dir, f"{base}_{counter}{ext}")
        counter += 1
    return candidate


def process_batch(batch_dir, output_dir, min_size_bytes=1024):
    """Copy every valid audio file from ``batch_dir`` into ``output_dir``.

    Only regular files directly inside ``batch_dir`` whose lower-cased name
    ends with one of ``SUPPORTED_EXTENSIONS`` are considered. Files smaller
    than ``min_size_bytes`` are recorded as corrupted without being decoded;
    the rest are checked with ``is_audio_valid``. Valid files are copied with
    ``shutil.copy2``; on a name collision in ``output_dir`` a numeric suffix
    is appended instead of overwriting.

    Args:
        batch_dir: Directory to scan (not recursive).
        output_dir: Destination directory; created if it does not exist.
        min_size_bytes: Files smaller than this many bytes are treated as
            corrupted. Defaults to 1024.

    Returns:
        tuple[list[str], list[str]]: ``(valid_files, corrupted_files)``, both
        holding source paths in sorted filename order.
    """
    valid_files = []
    corrupted_files = []

    # isdir also rejects a regular file passed by mistake, which would
    # otherwise make os.listdir raise.
    if not os.path.isdir(batch_dir):
        print(f"Directory {batch_dir} does not exist.")
        return valid_files, corrupted_files

    # Do not rely on the caller having created the destination beforehand.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir order is arbitrary; sorting makes it reproducible which file
    # receives a collision suffix. Directories with audio-like names are skipped.
    files = sorted(
        name
        for name in os.listdir(batch_dir)
        if name.lower().endswith(SUPPORTED_EXTENSIONS)
        and os.path.isfile(os.path.join(batch_dir, name))
    )

    for file in tqdm(files, desc=f"Processing {os.path.basename(batch_dir)}"):
        file_path = os.path.join(batch_dir, file)

        # Cheap pre-filter: a tiny file is rejected without decoding it.
        if os.path.getsize(file_path) < min_size_bytes:
            corrupted_files.append(file_path)
            continue

        if is_audio_valid(file_path):
            shutil.copy2(file_path, _unique_destination(output_dir, file))
            valid_files.append(file_path)
        else:
            corrupted_files.append(file_path)

    return valid_files, corrupted_files

In [6]:
# Run the filter over the raw-data directory and accumulate the results.
all_valid_files = []
all_corrupted_files = []

# Only a single directory is processed here; the accumulator lists are kept
# so further directories can be added without changing the reporting below.
batch_dir = BASE_INPUT_DIR
print("\nProcessing data...")
valid_files, corrupted_files = process_batch(batch_dir, OUTPUT_DIR)
all_valid_files.extend(valid_files)
all_corrupted_files.extend(corrupted_files)
# Print summary for this directory
print("Summary:")
print(f"Valid files: {len(valid_files)}")
print(f"Corrupted files: {len(corrupted_files)}")

# Print final summary
print("\nFinal Summary:")
print(f"Total valid files copied: {len(all_valid_files)}")
print(f"Total corrupted files: {len(all_corrupted_files)}")

# Save the lists of valid and corrupted source paths next to the copied files.
# An explicit UTF-8 encoding keeps paths with non-ASCII characters from
# failing under a platform default encoding that cannot represent them.
with open(os.path.join(OUTPUT_DIR, "valid_files.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(all_valid_files))
with open(os.path.join(OUTPUT_DIR, "corrupted_files.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(all_corrupted_files))


Processing data...


Processing data: 100%|██████████| 209791/209791 [19:30<00:00, 179.30it/s]  

Summary:
Valid files: 172158
Corrupted files: 37633

Final Summary:
Total valid files copied: 172158
Total corrupted files: 37633



