In [1]:
# IPython shell escape: add the notebook's third-party dependencies to the uv project.
!uv add datasets requests pandas soundfile
# Import one of the freshly added packages so a failed install surfaces in this cell.
import datasets
# Placeholder output; presumably just confirms the cell ran to completion.
print("Foobar")

[2mResolved [1m80 packages[0m [2min 0.20ms[0m[0m
[2mAudited [1m50 packages[0m [2min 0.02ms[0m[0m
Foobar


In [31]:
import os
import json
import requests
import numpy as np
from pathlib import Path
from datasets import load_dataset
from tqdm.notebook import tqdm
import tempfile
import soundfile as sf

# Directory that receives the per-speaker output files and the shared index
output_dir = Path('voices')
output_dir.mkdir(exist_ok=True)

# speakers_index["speakers"] holds one entry per speaker id.
# Resume from the copy on disk when there is one; otherwise begin empty.
index_path = output_dir / 'index.json'
if not index_path.exists():
    speakers_index = {"speakers": {}}
    print("Starting fresh index")
else:
    with index_path.open('r', encoding='utf-8') as f:
        speakers_index = json.load(f)
    print(f"Loaded existing index with {len(speakers_index['speakers'])} speakers")




Loaded existing index with 1000 speakers


In [24]:
def process_audio(audio_dict, speaker_id, metadata,
                  endpoint='http://localhost:5000/v1/audio/encoding'):
    """Encode one audio clip through the encoding endpoint and record the speaker.

    The clip is written to a temporary WAV file, uploaded as a multipart form
    field named ``file``, and on an HTTP 200 response the response body is
    saved to ``output_dir / f"{speaker_id}.npy"``. The speaker is then added to
    the module-level ``speakers_index`` and the index is rewritten to
    ``output_dir / 'index.json'``.

    Args:
        audio_dict: Mapping with an ``'array'`` entry (the samples) and a
            ``'sampling_rate'`` entry.
        speaker_id: Key used for the output file name and the index entry.
        metadata: Mapping whose ``'text'`` entry is stored in the index.
        endpoint: URL of the encoding service. Defaults to the local server
            the notebook was written against.

    Returns:
        True if the endpoint answered 200 and the outputs were written,
        False for any other status code.

    Raises:
        KeyError: If ``audio_dict`` or ``metadata`` lacks an expected key.
        Any exception raised by ``soundfile``, ``requests`` or file I/O is
        propagated; the temporary file is removed in every case.
    """
    audio_array = audio_dict['array']
    sr = audio_dict['sampling_rate']

    # Only reserve a unique path here and close the handle immediately, so
    # soundfile can open the path itself on every platform.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        # Writing happens inside the try so that a failed write still cleans
        # up the temp file.
        sf.write(temp_path, audio_array, sr, format='WAV')

        # Close the upload handle before the temp file is unlinked: an open
        # handle leaks a descriptor per call and blocks deletion on Windows.
        with open(temp_path, 'rb') as wav_file:
            files = {'file': ('audio.wav', wav_file, 'audio/wav')}
            response = requests.post(endpoint, files=files)

        if response.status_code != 200:
            print(f"Failed to process {speaker_id}: {response.status_code}")
            return False

        # The response body is stored verbatim; presumably it is a serialized
        # NumPy array -- confirm against the encoding service.
        output_path = output_dir / f"{speaker_id}.npy"
        with open(output_path, 'wb') as f:
            f.write(response.content)

        # Add to our index
        speakers_index["speakers"][speaker_id] = metadata['text']

        # Persist the index after every success so an interrupted run keeps
        # everything processed so far.
        with open(output_dir / 'index.json', 'w', encoding='utf-8') as f:
            json.dump(speakers_index, f, ensure_ascii=False, indent=4)

        return True

    finally:
        # Cleanup temp file
        os.unlink(temp_path)

In [29]:
def process_language(lang_code, max_speakers=1000):
    """Stream one language of the Emilia dataset and encode one clip per speaker.

    Records are read in streaming mode from ``amphion/Emilia-Dataset`` using
    the file pattern ``"<LANG_CODE upper-cased>/*.tar"``. For each record the
    speaker id is taken from ``item['json']['speaker']`` and the audio from
    ``item['mp3']``; the first clip seen for each new speaker is passed to
    ``process_audio``. Processing stops when ``max_speakers`` speakers have
    succeeded, when ``process_audio`` reports a failure, or when any exception
    is raised while handling a record.

    Deduplication is per call only: the set of seen speakers starts empty, so
    speakers stored by earlier runs are not skipped.

    Args:
        lang_code: Language code; used as the split name and, upper-cased, as
            the directory prefix of the data files.
        max_speakers: Stop after this many speakers were processed
            successfully.

    Returns:
        The number of speakers processed successfully during this call.
    """
    print(f"\nProcessing {lang_code}...")

    # Load dataset for specific language
    path = f"{lang_code.upper()}/*.tar"
    dataset = load_dataset("amphion/Emilia-Dataset",
                           data_files={lang_code: path},
                           split=lang_code,
                           streaming=True)

    processed_speakers = set()
    # The target is known up front, so give the bar a total to show real progress.
    pbar = tqdm(total=max_speakers, desc=f"Speakers processed for {lang_code}")

    try:
        for item in dataset:
            # Reset per record: the error message below must never hit an
            # unbound name (first record) or report the previous speaker.
            speaker_id = "<unknown>"
            try:
                metadata = item['json']
                speaker_id = metadata['speaker']

                # Skip if we already have this speaker
                if speaker_id in processed_speakers:
                    continue

                success = process_audio(item['mp3'], speaker_id, metadata)
                if not success:
                    # If we hit an error, stop processing
                    print(f"Stopping processing due to failure on {speaker_id}")
                    break

                processed_speakers.add(speaker_id)
                pbar.update(1)

                if len(processed_speakers) >= max_speakers:
                    break

            except Exception as e:
                print(f"Error processing {speaker_id}: {e}")
                print("Stopping processing due to error")
                break
    finally:
        # Close the bar even when iterating the stream itself raises.
        pbar.close()

    return len(processed_speakers)


In [30]:
# Encode up to 1000 speakers from the 'ja' split; the result is the number of speakers processed.
speakers_ja = process_language('ja', max_speakers=1000)


Processing ja...


Resolving data files:   0%|          | 0/70 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/70 [00:00<?, ?it/s]

Speakers processed for ja: 0it [00:00, ?it/s]

In [32]:
# Encode up to 1000 speakers from the 'zh' split; the result is the number of speakers processed.
speakers_zh = process_language('zh', max_speakers=1000)


Processing zh...


Resolving data files:   0%|          | 0/920 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/920 [00:00<?, ?it/s]

Speakers processed for zh: 0it [00:00, ?it/s]

In [33]:
# Encode up to 1000 speakers from the 'en' split; the result is the number of speakers processed.
speakers_en = process_language('en', max_speakers=1000)


Processing en...


Resolving data files:   0%|          | 0/1140 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1140 [00:00<?, ?it/s]

Speakers processed for en: 0it [00:00, ?it/s]