In [1]:
import datasets

In [3]:
dataset = datasets.load_dataset('keithito/lj_speech', cache_dir='../data')

In [6]:
dataset['train']['audio']

[{'path': 'LJ001-0001.wav',
  'array': array([-7.32421875e-04, -7.62939453e-04, -6.40869141e-04, ...,
          7.32421875e-04,  2.13623047e-04,  6.10351562e-05]),
  'sampling_rate': 22050},
 {'path': 'LJ001-0002.wav',
  'array': array([-0.00027466,  0.        ,  0.        , ..., -0.00088501,
         -0.00097656, -0.00109863]),
  'sampling_rate': 22050},
 {'path': 'LJ001-0003.wav',
  'array': array([-0.00247192, -0.00372314,  0.00045776, ...,  0.00030518,
          0.00015259, -0.00036621]),
  'sampling_rate': 22050},
 {'path': 'LJ001-0004.wav',
  'array': array([ 0.00024414,  0.        , -0.00018311, ..., -0.00064087,
         -0.00064087, -0.00073242]),
  'sampling_rate': 22050},
 {'path': 'LJ001-0005.wav',
  'array': array([0.00036621, 0.00045776, 0.00036621, ..., 0.00045776, 0.00061035,
         0.00082397]),
  'sampling_rate': 22050},
 {'path': 'LJ001-0006.wav',
  'array': array([6.10351562e-05, 2.13623047e-04, 1.52587891e-04, ...,
         7.32421875e-04, 7.01904297e-04, 7.32421

In [12]:
import os
import csv
from datasets import load_dataset
import soundfile as sf
from tqdm import tqdm

def reformat_dataset(output_dir):
    # Load the dataset
    dataset = load_dataset('keithito/lj_speech', cache_dir='../data')

    # Create output directory structure
    os.makedirs(output_dir, exist_ok=True)
    wavs_dir = os.path.join(output_dir, "wavs")
    os.makedirs(wavs_dir, exist_ok=True)

    # Prepare metadata file
    metadata_path = os.path.join(output_dir, "metadata.txt")

    with open(metadata_path, 'w', newline='', encoding='utf-8') as metadata_file:
        writer = csv.writer(metadata_file, delimiter='|')

        # Iterate through the dataset
        for item in tqdm(dataset['train'], desc="Processing items"):
            # Generate a filename for the audio
            audio_filename = f"{item['id']}.wav"

            # Save the audio file
            audio_path = os.path.join(wavs_dir, audio_filename)
            sf.write(audio_path, item['audio']['array'], item['audio']['sampling_rate'])

            # Write metadata
            writer.writerow([
                audio_filename,
                item['text'],
                item['normalized_text']
            ])

    print(f"Dataset reformatted and saved to {output_dir}")

# Usage
output_directory = "../data/keithito_lj_speech"
reformat_dataset(output_directory)

Processing items: 100%|██████████| 13100/13100 [01:41<00:00, 129.14it/s]


Dataset reformatted and saved to ../data/keithito_lj_speech


In [13]:
import os
import random

def split_metadata(input_file, train_file, test_file, train_ratio=0.8):
    # Read all lines from the input file
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Shuffle the lines randomly
    random.shuffle(lines)

    # Calculate the split point
    split_point = int(len(lines) * train_ratio)

    # Write train data
    with open(train_file, 'w', encoding='utf-8') as f:
        f.writelines(lines[:split_point])

    # Write test data
    with open(test_file, 'w', encoding='utf-8') as f:
        f.writelines(lines[split_point:])

    print(f"Total lines: {len(lines)}")
    print(f"Train lines: {split_point}")
    print(f"Test lines: {len(lines) - split_point}")

# Usage
input_metadata = "../data/keithito_lj_speech/metadata.txt"
train_metadata = "../data/keithito_lj_speech/train_metadata.txt"
test_metadata = "../data/keithito_lj_speech/test_metadata.txt"

# Ensure the output directory exists
os.makedirs(os.path.dirname(train_metadata), exist_ok=True)

# Perform the split
split_metadata(input_metadata, train_metadata, test_metadata)

Total lines: 13100
Train lines: 10480
Test lines: 2620
