In [9]:
import os
import json
import random
import shutil
from pathlib import Path
import torch
import torchaudio

# Configuration
MOISESDB_ROOT = Path("./moisesdb/moisesdb_v0.1")  # dataset root; scanned for track folders
OUTPUT_ROOT = Path("./data_moisesdb")  # destination of the train/valid pairs
TARGET_STEM = "guitar"  # name of the stem sub-folder used as separation target
SAMPLE_RATE = 44100  # Hz; loaded audio is resampled to this rate
DURATION_SEC = 6.0  # length of every saved excerpt, in seconds
MAX_SAMPLES = int(SAMPLE_RATE * DURATION_SEC)  # excerpt length in frames

# 1. Verify dataset path exists
if not MOISESDB_ROOT.exists():
    print(f"❌ Error: Dataset path not found at {MOISESDB_ROOT}")
    # `exit` is an interactive helper injected by the `site` module; it is not
    # guaranteed to exist and may not stop a notebook cell before the
    # destructive cleanup below runs.  Raising SystemExit aborts immediately
    # and still yields exit status 1 when run as a script.
    raise SystemExit(1)

# 2. Prepare output directory
# NOTE: any previous output is deleted so stale tracks never mix with new ones.
if OUTPUT_ROOT.exists():
    shutil.rmtree(OUTPUT_ROOT)
(OUTPUT_ROOT / "train").mkdir(parents=True, exist_ok=True)
(OUTPUT_ROOT / "valid").mkdir(parents=True, exist_ok=True)

def load_stem_folder(stem_folder_path):
    """Sum every ``.wav`` / ``.flac`` file of one stem folder into a single waveform.

    Each file is resampled to ``SAMPLE_RATE`` when its native rate differs and
    single-channel files are duplicated into two channels.  Files are added
    sample by sample; when lengths differ the running sum is cut to the
    shorter of the two.  A file that fails at any step is reported on stdout
    and skipped.

    Args:
        stem_folder_path: ``Path`` of the stem directory.

    Returns:
        The summed waveform tensor, or ``None`` when the folder is missing or
        no file could be loaded.
    """
    if not stem_folder_path.exists():
        return None

    # WAV files first, then FLAC -- same order as two consecutive globs.
    sources = [*stem_folder_path.glob("*.wav"), *stem_folder_path.glob("*.flac")]
    mixed = None

    for source in sources:
        try:
            audio, native_rate = torchaudio.load(source)

            # Bring everything to the common project rate.
            if native_rate != SAMPLE_RATE:
                to_project_rate = torchaudio.transforms.Resample(native_rate, SAMPLE_RATE)
                audio = to_project_rate(audio)

            # Duplicate the single channel of mono material.
            if audio.shape[0] == 1:
                audio = torch.cat((audio, audio), dim=0)

            if mixed is None:
                # First successfully loaded file seeds the sum.
                mixed = audio
                continue

            # Only the overlapping part of both signals is kept.
            overlap = min(mixed.shape[1], audio.shape[1])
            mixed = mixed[:, :overlap] + audio[:, :overlap]

        except Exception as e:
            print(f"⚠️  Error loading {source}: {e}")

    return mixed

def process_track(track_folder, output_path):
    """Write a fixed-length ``mixture.wav`` / ``guitar.wav`` pair for one track.

    The target stem (``TARGET_STEM``) is loaded with ``load_stem_folder`` and
    summed with every other stem folder found under ``track_folder``.  Both
    signals are cut to their common length, then looped or truncated to
    exactly ``MAX_SAMPLES`` frames and saved at ``SAMPLE_RATE``.

    Args:
        track_folder: ``Path`` of the track directory holding the stem folders.
        output_path: ``Path`` of the directory to write into (created if missing).

    Returns:
        ``True`` when both files were written; ``False`` when the target stem
        could not be loaded or there is no audio to mix.
    """
    # Load the target stem
    guitar_audio = load_stem_folder(track_folder / TARGET_STEM)
    if guitar_audio is None:
        return False

    # Every stem folder name that is looked for.  The target is listed too, so
    # the skip below is meaningful and changing TARGET_STEM does not silently
    # drop the guitar from the mixture.
    stem_folders = ["guitar", "bass", "drums", "vocals", "other", "other_keys",
                    "piano", "percussion", "wind", "bowed_strings", "other_plucked"]

    mixture_audio = guitar_audio.clone()  # Start with the target itself

    for stem_name in stem_folders:
        if stem_name == TARGET_STEM:
            continue  # already part of the mixture
        stem_audio = load_stem_folder(track_folder / stem_name)
        if stem_audio is None:
            continue
        # Ensure same length for mixing
        min_length = min(mixture_audio.shape[1], stem_audio.shape[1])
        mixture_audio = mixture_audio[:, :min_length] + stem_audio[:, :min_length]

    current_samples = mixture_audio.shape[1]
    if current_samples == 0:
        # Nothing to loop; the repeat count below would divide by zero.
        print(f"⚠️  Skipping {track_folder}: no audio left after aligning stems")
        return False

    # The mixture may have been shortened by a shorter stem.  Trim the target
    # to the same length so that looping wraps both signals at the same frame
    # and the saved pair stays sample-aligned.
    guitar_audio = guitar_audio[:, :current_samples]

    # Handle duration - truncate or loop to exactly MAX_SAMPLES frames
    if current_samples < MAX_SAMPLES:
        # Loop audio to reach target duration
        repeats = (MAX_SAMPLES // current_samples) + 1
        mixture_audio = mixture_audio.repeat(1, repeats)[:, :MAX_SAMPLES]
        guitar_audio = guitar_audio.repeat(1, repeats)[:, :MAX_SAMPLES]
    else:
        # Truncate to target duration
        mixture_audio = mixture_audio[:, :MAX_SAMPLES]
        guitar_audio = guitar_audio[:, :MAX_SAMPLES]

    # Save files
    output_path.mkdir(parents=True, exist_ok=True)
    torchaudio.save(output_path / "mixture.wav", mixture_audio, SAMPLE_RATE)
    torchaudio.save(output_path / "guitar.wav", guitar_audio, SAMPLE_RATE)

    return True

# 3. Find all tracks with guitar folders
print(f"🔍 Scanning for tracks with '{TARGET_STEM}' folders...")
valid_tracks = []

# iterdir() yields entries in arbitrary, filesystem-dependent order.  Sorting
# first makes the seeded shuffle below produce the same split everywhere.
for track_folder in sorted(MOISESDB_ROOT.iterdir()):
    if not track_folder.is_dir():
        continue

    guitar_folder = track_folder / TARGET_STEM
    # is_dir() is already False for a missing path, so no separate exists().
    if guitar_folder.is_dir():
        # Keep the track only if the target folder contains audio files
        audio_files = list(guitar_folder.glob("*.wav")) + list(guitar_folder.glob("*.flac"))
        if audio_files:
            valid_tracks.append(track_folder)

print(f"✅ Found {len(valid_tracks)} tracks with '{TARGET_STEM}' stems")

if not valid_tracks:
    print("❌ No valid tracks found. Exiting.")
    # `exit` is an interactive helper from the `site` module; SystemExit stops
    # execution immediately and keeps exit status 1 for script runs.
    raise SystemExit(1)

# 4. Split into train/validation (80% / 20%, fixed seed)
random.seed(42)
random.shuffle(valid_tracks)
split_idx = int(0.8 * len(valid_tracks))
train_tracks = valid_tracks[:split_idx]
# From here on `valid_tracks` holds only the validation part of the split.
valid_tracks = valid_tracks[split_idx:]

print(f"📊 Dataset split: {len(train_tracks)} train, {len(valid_tracks)} validation")

# 5. Process training tracks
print("🔄 Processing training tracks...")
train_success = 0
for idx, track_folder in enumerate(train_tracks):
    output_path = OUTPUT_ROOT / "train" / f"track_{idx:03d}"
    if process_track(track_folder, output_path):
        train_success += 1
        # Progress line every tenth track (only printed when that track succeeded)
        if (idx + 1) % 10 == 0:
            print(f"  ✅ Processed {idx + 1}/{len(train_tracks)} training tracks")

# 6. Process validation tracks
print("🔄 Processing validation tracks...")
valid_success = 0
for idx, track_folder in enumerate(valid_tracks):
    output_path = OUTPUT_ROOT / "valid" / f"track_{idx:03d}"
    if process_track(track_folder, output_path):
        valid_success += 1

print("✅ Preprocessing complete!")
print(f"Training tracks processed: {train_success}/{len(train_tracks)}")
print(f"Validation tracks processed: {valid_success}/{len(valid_tracks)}")
print(f"Output directory: {OUTPUT_ROOT}")

# Verify output structure by inspecting the first training track, if present
if train_success > 0:
    sample_track = OUTPUT_ROOT / "train" / "track_000"
    if (sample_track / "mixture.wav").exists() and (sample_track / "guitar.wav").exists():
        print("✅ Sample files created successfully")

        # Check audio properties
        mix_info = torchaudio.info(sample_track / "mixture.wav")
        guitar_info = torchaudio.info(sample_track / "guitar.wav")

        print("Sample track info:")
        print(f"  Mixture: {mix_info.sample_rate}Hz, {mix_info.num_channels}ch, {mix_info.num_frames/mix_info.sample_rate:.1f}s")
        print(f"  Guitar: {guitar_info.sample_rate}Hz, {guitar_info.num_channels}ch, {guitar_info.num_frames/guitar_info.sample_rate:.1f}s")


🔍 Scanning for tracks with 'guitar' folders...
✅ Found 222 tracks with 'guitar' stems
📊 Dataset split: 177 train, 45 validation
🔄 Processing training tracks...
  ✅ Processed 10/177 training tracks
  ✅ Processed 20/177 training tracks
  ✅ Processed 30/177 training tracks
  ✅ Processed 40/177 training tracks
  ✅ Processed 50/177 training tracks
  ✅ Processed 60/177 training tracks
  ✅ Processed 70/177 training tracks
  ✅ Processed 80/177 training tracks
  ✅ Processed 90/177 training tracks
  ✅ Processed 100/177 training tracks
  ✅ Processed 110/177 training tracks
  ✅ Processed 120/177 training tracks
  ✅ Processed 130/177 training tracks
  ✅ Processed 140/177 training tracks
  ✅ Processed 150/177 training tracks
  ✅ Processed 160/177 training tracks
  ✅ Processed 170/177 training tracks
🔄 Processing validation tracks...
✅ Preprocessing complete!
Training tracks processed: 177/177
Validation tracks processed: 45/45
Output directory: data_moisesdb
✅ Sample files created successfully
Sample track info:

In [1]:
import os
import random
import shutil
from pathlib import Path
import torch
import torchaudio

# Configuration
MOISESDB_ROOT = Path("./moisesdb/moisesdb_v0.1")  # dataset root; scanned for track folders
OUTPUT_ROOT = Path("./data_moisesdb")  # destination of the train/valid pairs
TARGET_STEM = "guitar"  # name of the stem sub-folder used as separation target
SAMPLE_RATE = 44100  # Hz; loaded audio is resampled to this rate

# 1. Verify dataset path exists
if not MOISESDB_ROOT.exists():
    print(f"❌ Error: Dataset path not found at {MOISESDB_ROOT}")
    # `exit` is an interactive helper injected by the `site` module; it is not
    # guaranteed to exist and may not stop a notebook cell before the
    # destructive cleanup below runs.  Raising SystemExit aborts immediately
    # and still yields exit status 1 when run as a script.
    raise SystemExit(1)

# 2. Prepare output directory
# NOTE: any previous output is deleted so stale tracks never mix with new ones.
if OUTPUT_ROOT.exists():
    shutil.rmtree(OUTPUT_ROOT)
(OUTPUT_ROOT / "train").mkdir(parents=True, exist_ok=True)
(OUTPUT_ROOT / "valid").mkdir(parents=True, exist_ok=True)

def load_stem_folder(stem_folder_path):
    """Combine all ``.wav`` and ``.flac`` files of a stem folder by summation.

    Every file is resampled to ``SAMPLE_RATE`` if needed, and one-channel
    audio is doubled to two channels.  When two signals differ in length, the
    sum keeps only the part both cover.  Any exception raised while handling
    a file is printed and that file is skipped.

    Args:
        stem_folder_path: ``Path`` of the stem directory.

    Returns:
        The summed waveform tensor, or ``None`` if the folder does not exist
        or nothing could be loaded from it.
    """
    if not stem_folder_path.exists():
        return None

    wav_files = list(stem_folder_path.glob("*.wav"))
    flac_files = list(stem_folder_path.glob("*.flac"))

    total = None
    for path in wav_files + flac_files:
        try:
            signal, file_rate = torchaudio.load(path)

            # Resample only when the file's rate differs from the project's.
            if file_rate != SAMPLE_RATE:
                signal = torchaudio.transforms.Resample(file_rate, SAMPLE_RATE)(signal)

            # Mono -> two identical channels.
            is_mono = signal.shape[0] == 1
            if is_mono:
                signal = torch.cat((signal, signal), dim=0)

            if total is not None:
                # Align lengths by cutting both to the shorter one, then add.
                common = min(total.shape[1], signal.shape[1])
                total = total[:, :common] + signal[:, :common]
            else:
                total = signal

        except Exception as e:
            print(f"⚠️  Error loading {path}: {e}")

    return total

def process_track(track_folder, output_path):
    """Write a full-length ``mixture.wav`` / ``guitar.wav`` pair for one track.

    The target stem (``TARGET_STEM``) is loaded with ``load_stem_folder`` and
    summed with every other stem folder found under ``track_folder``.  Both
    signals are cut to the length they have in common and saved at
    ``SAMPLE_RATE`` without any further truncation.

    Args:
        track_folder: ``Path`` of the track directory holding the stem folders.
        output_path: ``Path`` of the directory to write into (created if missing).

    Returns:
        ``True`` when both files were written; ``False`` when the target stem
        could not be loaded or there is no audio to save.
    """
    # Load the target stem
    guitar_audio = load_stem_folder(track_folder / TARGET_STEM)
    if guitar_audio is None:
        return False

    # Every stem folder name that is looked for.  The target is listed too, so
    # the skip below is meaningful and changing TARGET_STEM does not silently
    # drop the guitar from the mixture.
    stem_folders = ["guitar", "bass", "drums", "vocals", "other", "other_keys",
                    "piano", "percussion", "wind", "bowed_strings", "other_plucked"]

    # Start mixture with the target itself
    mixture_audio = guitar_audio.clone()

    for stem_name in stem_folders:
        if stem_name == TARGET_STEM:
            continue  # already part of the mixture
        stem_audio = load_stem_folder(track_folder / stem_name)
        if stem_audio is None:
            continue
        min_length = min(mixture_audio.shape[1], stem_audio.shape[1])
        mixture_audio = mixture_audio[:, :min_length] + stem_audio[:, :min_length]

    # The mixture only ever gets shorter, so one trim at the end keeps the
    # target aligned with it.
    guitar_audio = guitar_audio[:, :mixture_audio.shape[1]]

    if mixture_audio.shape[1] == 0:
        # An empty pair is useless downstream; report it as not processed.
        print(f"⚠️  Skipping {track_folder}: no audio left after aligning stems")
        return False

    # Save files (no truncation, full song)
    output_path.mkdir(parents=True, exist_ok=True)
    torchaudio.save(output_path / "mixture.wav", mixture_audio, SAMPLE_RATE)
    torchaudio.save(output_path / "guitar.wav", guitar_audio, SAMPLE_RATE)
    return True

# 3. Find all tracks with guitar folders
print(f"🔍 Scanning for tracks with '{TARGET_STEM}' folders...")
valid_tracks = []

# iterdir() yields entries in arbitrary, filesystem-dependent order.  Sorting
# first makes the seeded shuffle below produce the same split everywhere.
for track_folder in sorted(MOISESDB_ROOT.iterdir()):
    if not track_folder.is_dir():
        continue

    guitar_folder = track_folder / TARGET_STEM
    # is_dir() is already False for a missing path, so no separate exists().
    if guitar_folder.is_dir():
        audio_files = list(guitar_folder.glob("*.wav")) + list(guitar_folder.glob("*.flac"))
        if audio_files:
            valid_tracks.append(track_folder)

print(f"✅ Found {len(valid_tracks)} tracks with '{TARGET_STEM}' stems")

if not valid_tracks:
    print("❌ No valid tracks found. Exiting.")
    # `exit` is an interactive helper from the `site` module; SystemExit stops
    # execution immediately and keeps exit status 1 for script runs.
    raise SystemExit(1)

# 4. Split into train/validation (80% / 20%, fixed seed)
random.seed(42)
random.shuffle(valid_tracks)
split_idx = int(0.8 * len(valid_tracks))
train_tracks = valid_tracks[:split_idx]
# From here on `valid_tracks` holds only the validation part of the split.
valid_tracks = valid_tracks[split_idx:]

print(f"📊 Dataset split: {len(train_tracks)} train, {len(valid_tracks)} validation")

# 5. Process training tracks
print("🔄 Processing training tracks...")
train_success = 0
for idx, track_folder in enumerate(train_tracks):
    output_path = OUTPUT_ROOT / "train" / f"track_{idx:03d}"
    if process_track(track_folder, output_path):
        train_success += 1
        # Progress line every tenth track (only printed when that track succeeded)
        if (idx + 1) % 10 == 0:
            print(f"  ✅ Processed {idx + 1}/{len(train_tracks)} training tracks")

# 6. Process validation tracks
print("🔄 Processing validation tracks...")
valid_success = 0
for idx, track_folder in enumerate(valid_tracks):
    output_path = OUTPUT_ROOT / "valid" / f"track_{idx:03d}"
    if process_track(track_folder, output_path):
        valid_success += 1

print("✅ Preprocessing complete!")
print(f"Training tracks processed: {train_success}/{len(train_tracks)}")
print(f"Validation tracks processed: {valid_success}/{len(valid_tracks)}")
print(f"Output directory: {OUTPUT_ROOT}")

# Verify output structure by inspecting the first training track, if present
if train_success > 0:
    sample_track = OUTPUT_ROOT / "train" / "track_000"
    if (sample_track / "mixture.wav").exists() and (sample_track / "guitar.wav").exists():
        print("✅ Sample files created successfully")
        mix_info = torchaudio.info(sample_track / "mixture.wav")
        guitar_info = torchaudio.info(sample_track / "guitar.wav")
        print("Sample track info:")
        print(f"  Mixture: {mix_info.sample_rate}Hz, {mix_info.num_channels}ch, {mix_info.num_frames/mix_info.sample_rate:.1f}s")
        print(f"  Guitar: {guitar_info.sample_rate}Hz, {guitar_info.num_channels}ch, {guitar_info.num_frames/guitar_info.sample_rate:.1f}s")


🔍 Scanning for tracks with 'guitar' folders...
✅ Found 222 tracks with 'guitar' stems
📊 Dataset split: 177 train, 45 validation
🔄 Processing training tracks...
  ✅ Processed 10/177 training tracks
  ✅ Processed 20/177 training tracks
  ✅ Processed 30/177 training tracks
  ✅ Processed 40/177 training tracks
  ✅ Processed 50/177 training tracks
  ✅ Processed 60/177 training tracks
  ✅ Processed 70/177 training tracks
  ✅ Processed 80/177 training tracks
  ✅ Processed 90/177 training tracks
  ✅ Processed 100/177 training tracks
  ✅ Processed 110/177 training tracks
  ✅ Processed 120/177 training tracks
  ✅ Processed 130/177 training tracks
  ✅ Processed 140/177 training tracks
  ✅ Processed 150/177 training tracks
  ✅ Processed 160/177 training tracks
  ✅ Processed 170/177 training tracks
🔄 Processing validation tracks...
✅ Preprocessing complete!
Training tracks processed: 177/177
Validation tracks processed: 45/45
Output directory: data_moisesdb
✅ Sample files created successfully
Sample track info: