# Build Test CSV

- Takes the path to the dataset folder as input.
- Creates one CSV file per speaker, for up to 10 speakers.
- Each CSV file is used as the test set for one of the 10 trained models.

In [3]:
import os
import pandas as pd
from pathlib import Path

def build_speaker_csvs(dataset_path, output_dir='info', max_speakers=10):
    """
    Build one CSV file per speaker from the wav files in a dataset folder.

    Files are grouped by the first two characters of their filename, which
    are expected to be the speaker number (filename format: XXaYYZz.wav or
    XXbYYZz.wav). Each speaker's CSV is named 'S<XX>.csv' and has two
    columns: 'id' (the filename without its extension) and 'file_path'
    (the path to the wav file as a string).

    Parameters:
    - dataset_path: Path to the dataset folder containing wav files
      (only files directly inside the folder are considered)
    - output_dir: Directory where speaker CSV files will be saved;
      created if it does not exist
    - max_speakers: Maximum number of speakers to write CSVs for, taken in
      sorted speaker-ID order. Pass None to write a CSV for every speaker.

    Returns:
    - Dictionary with speaker IDs as keys and their CSV file paths as values

    Raises:
    - ValueError: if max_speakers is negative
    """
    # A negative value would silently drop speakers from the END of the
    # sorted list via slice semantics, which is never what a caller wants.
    if max_speakers is not None and max_speakers < 0:
        raise ValueError('max_speakers must be non-negative or None')

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get all wav files from dataset (sorted so rows are in a stable order)
    dataset_path = Path(dataset_path)
    wav_files = sorted(dataset_path.glob('*.wav'))

    # Group file info by speaker ID
    speaker_data = {}
    for wav_file in wav_files:
        # Speaker number is the first 2 characters of the filename
        speaker_id = f'S{wav_file.name[:2]}'
        speaker_data.setdefault(speaker_id, []).append({
            # .stem strips only the final extension; str.replace('.wav', '')
            # would also remove any '.wav' occurring inside the name.
            'id': wav_file.stem,
            'file_path': str(wav_file),
        })

    # Create a CSV for each selected speaker
    created_files = {}
    for speaker_id in sorted(speaker_data)[:max_speakers]:
        df = pd.DataFrame(speaker_data[speaker_id])

        csv_filename = f'{speaker_id}.csv'
        csv_path = os.path.join(output_dir, csv_filename)
        df.to_csv(csv_path, index=False)

        created_files[speaker_id] = csv_path
        print(f'Created {csv_filename} with {len(df)} files')

    return created_files




In [4]:
# Example usage:
# NOTE: this is a machine-specific absolute path; point it at your own
# dataset folder before running this cell.
dataset_path = '/Users/adityakumar/Developer/Projects/mini_project/dataset'
# output_dir is not passed, so the CSVs go to the function's default 'info'
# directory (relative to the current working directory). `result` maps each
# speaker ID to the path of the CSV written for it.
result = build_speaker_csvs(dataset_path)


Created S03.csv with 49 files
Created S08.csv with 58 files
Created S09.csv with 43 files
Created S10.csv with 38 files
Created S11.csv with 55 files
Created S12.csv with 35 files
Created S13.csv with 61 files
Created S14.csv with 69 files
Created S15.csv with 56 files
Created S16.csv with 71 files
