In [1]:
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
from IPython.display import display

#import torchaudio
import datasets
import pandas as pd
import os
import re
import numpy as np
import gc
import soundfile as sf
import IPython
import torch
from tqdm import tqdm
#import wandb

In [2]:
"""!pip uninstall -y torch
!pip install torch==2.4.1"""

'!pip uninstall -y torch\n!pip install torch==2.4.1'

In [3]:
from huggingface_hub import HfApi, HfFolder
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Read the Hugging Face access token from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

def login_hugging_face(token: str) -> None:
    """
    Log in to the Hugging Face Hub with the given access token.

    Parameters:
    - token (str): Hugging Face access token. When it is empty or None a
      warning is emitted and no login is attempted.

    Returns:
    - None
    """
    if not token:
        import warnings

        # Do not abort the notebook: public datasets can still be downloaded anonymously.
        warnings.warn("No Hugging Face token provided (is HF_TOKEN set?); skipping login.")
        return None

    # Imported locally so the helper only needs the huggingface_hub package itself.
    from huggingface_hub import login

    # Merely constructing an HfApi(token=...) object and discarding it has no
    # lasting effect; login() is the documented way to register the token.
    login(token=token)
    return None

# Run the login helper with the token read from the environment.
# The message below is printed unconditionally once the call returns.
login_hugging_face(HF_TOKEN)
print('We are logged in to Hugging Face now!')

We are logged in to Hugging Face now!


In [4]:
# Fetch every published split of the corpus
dataset = load_dataset("ArissBandoss/sentences-audio-texte-denoised-enhanced")

# Keep handles on the two original splits
train_dataset, test_dataset = dataset['train'], dataset['test']

# Merge both splits and expose the result under a single 'train' key
dataset = DatasetDict({'train': concatenate_datasets([train_dataset, test_dataset])})

# Show the resulting structure
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['fr', 'mos', 'audio', 'speaker_id', 'index', 'denoised_audio', 'enhanced_audio'],
        num_rows: 1657
    })
})


In [5]:
# Tag every row with the language code 'mos', then expose the transcript
# column (originally named 'mos') under the generic name 'text'.
dataset['train'] = (
    dataset['train']
    .add_column('lang', ['mos'] * len(dataset['train']))
    .rename_column("mos", "text")
)
dataset

DatasetDict({
    train: Dataset({
        features: ['fr', 'text', 'audio', 'speaker_id', 'index', 'denoised_audio', 'enhanced_audio', 'lang'],
        num_rows: 1657
    })
})

In [6]:
# Maximum text length, in characters, accepted per language code.
# iterable_to_dataset() skips rows whose 'text' is longer than the limit for its language.
CHAR_LIMIT = {
    "en": 250,
    "fr": 273,
    "es": 239,
    "it": 213,
    "mos": 300,
}

def iterable_to_dataset(iterable_dataset, lang, num_rows):
    """
    Converts an IterableDataset to a Dataset with at most a specified number of rows.

    Rows whose 'text' is longer than CHAR_LIMIT[lang] are skipped; every kept row
    is tagged with a 'lang' field set to ``lang``.

    Parameters:
    - iterable_dataset (IterableDataset): The input IterableDataset from Hugging Face datasets.
    - lang (str): Language code; must be a key of CHAR_LIMIT.
    - num_rows (int): The number of rows desired in the output Dataset.

    Returns:
    - Dataset: A Dataset object with up to ``num_rows`` rows (fewer when the
      input is exhausted first).

    Raises:
    - KeyError: If ``lang`` has no entry in CHAR_LIMIT.
    """
    # Resolve the limit once, so an unsupported language fails before any row is consumed
    max_chars = CHAR_LIMIT[lang]

    # Create an iterator from the iterable dataset
    iterator = iter(iterable_dataset)

    # Collected rows, in input order
    rows = []

    # The context manager closes the progress bar even if an error is raised
    with tqdm(total=num_rows, desc='Converting', unit='row') as progress_bar:
        while len(rows) < num_rows:
            try:
                item = next(iterator)
            except StopIteration:
                # End of iterator reached
                print("End of iterable dataset reached before requested number of rows.")
                break

            if len(item['text']) <= max_chars:
                # Copy instead of mutating the row object handed out by the source
                rows.append({**item, "lang": lang})
                progress_bar.update(1)

    # Convert the list of rows to a Dataset object
    return Dataset.from_pandas(pd.DataFrame(rows))

In [7]:
def create_audio_file(example, audio_column, output_dir, index):
    """
    Ensures the WAV file for one example exists on disk and reports where it lives.

    The file is named ``audio_<index>.wav`` inside ``output_dir``. An already
    existing file is left untouched; otherwise the 'array' and 'sampling_rate'
    entries of ``example[audio_column]`` are written out.

    Returns a dict with the single key 'audio_file_path'.
    """
    audio_filepath = os.path.join(output_dir, f"audio_{index}.wav")
    result = {"audio_file_path": audio_filepath}

    # Nothing to do when the file was written by an earlier run
    if os.path.isfile(audio_filepath):
        return result

    clip = example[audio_column]
    sf.write(audio_filepath, clip['array'], clip['sampling_rate'])
    return result



def batch_create_audio_files_and_update_dataset(dataset, audio_column, output_dir, num_proc=12):
    """
    Maps over the dataset, creates audio files and updates the dataset with the file paths.

    Parameters:
    - dataset: Object exposing a Hugging Face style ``.map()`` method.
    - audio_column: Name of the column holding the audio to export.
    - output_dir: Directory where the audio files are written; created when missing.
    - num_proc: Number of worker processes handed to ``.map()``. Defaults to 12,
      the value that used to be hard-coded; lower it on machines with fewer cores.

    Returns:
    - Whatever ``dataset.map()`` returns: the dataset extended with an
      'audio_file_path' column.

    NOTE(review): file names are derived from the index passed by ``.map()``.
    If ``dataset`` holds several splits, indices presumably restart for each
    split and files would collide — confirm before using it on multi-split input.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Use the .map() function to process the dataset and create audio files
    dataset_with_audio_paths = dataset.map(
        lambda example, idx: create_audio_file(example, audio_column, output_dir, idx),
        with_indices=True,  # Pass example indices to the map function
        num_proc=num_proc,
    )

    return dataset_with_audio_paths



def create_audio_files_and_update_dataset(dataset, audio_column, output_dir):
    """
    Create audio files from an audio column of a Hugging Face dataset and record their paths.

    This is the sequential counterpart of batch_create_audio_files_and_update_dataset
    and relies on the same create_audio_file helper, so both produce identical
    file names (``audio_<index>.wav``) and skip files that already exist.

    Parameters:
    - dataset: The input dataset that contains the audio column.
    - audio_column: The name of the column containing the audio data (datasets.Audio feature).
    - output_dir: The directory where audio files will be saved.

    Returns:
    - The dataset with a new 'audio_file_path' column holding the path of each
      saved audio file. The original audio column is left unchanged.
    """
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Collect the paths first; the column is attached in one step at the end
    audio_file_paths = []

    for index, example in tqdm(enumerate(dataset), total=len(dataset), desc="Creating audio files", unit="file"):
        # No explicit gc.collect() per file: the example goes out of scope on
        # the next iteration, and a forced full collection each time only slows the loop.
        written = create_audio_file(example, audio_column, output_dir, index)
        audio_file_paths.append(written["audio_file_path"])

    # Update the dataset with the new file paths
    return dataset.add_column("audio_file_path", audio_file_paths)


# Function to create the metadata file
def create_metadata_file(dataset, output_dir='MyTTSDataSet', filename='metadata.txt'):
    """
    Write a pipe-delimited metadata file describing every example in ``dataset``.

    Each line has the form
    ``<audio path without .wav>|<text>|<normalized text>|<speaker_id>|<lang>``;
    the normalized text is currently identical to the text.

    Parameters:
    - dataset: Iterable of mappings providing 'audio_file_path', 'text',
      'speaker_id' and 'lang'.
    - output_dir: Directory that receives the file; created when missing.
    - filename: Name of the metadata file inside ``output_dir``.

    Returns:
    - The path of the metadata file that was written.

    NOTE(review): a '|' inside the text is written as-is and would add an
    extra field to that line — confirm the corpus never contains it.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Define the path to the metadata file
    metadata_path = os.path.join(output_dir, filename)
    wav_suffix = ".wav"

    with open(metadata_path, 'w', encoding='utf-8') as f:
        for item in dataset:
            # Strip only a trailing ".wav"; a blanket str.replace would also
            # damage directory names that happen to contain ".wav".
            audio_path = item['audio_file_path']
            if audio_path.endswith(wav_suffix):
                audio_path = audio_path[:-len(wav_suffix)]

            # Map every Unicode whitespace character (non-breaking spaces, tabs,
            # newlines, carriage returns, ...) to a plain space so each record
            # stays on a single line.
            text = "".join(" " if ch.isspace() else ch for ch in item['text'])
            normalized_text = text
            speaker_id = item['speaker_id']
            lang = item['lang']

            # Write the formatted data to the metadata file
            f.write(f"{audio_path}|{text}|{normalized_text}|{speaker_id}|{lang}\n")

    return metadata_path

In [8]:
!pwd

/teamspace/studios/this_studio


In [9]:
# Export the 'denoised_audio' recordings as WAV files and attach each file's
# path to the dataset in a new 'audio_file_path' column.
dataset = batch_create_audio_files_and_update_dataset(
    dataset,
    audio_column="denoised_audio",
    output_dir="/teamspace/studios/this_studio/coqui-TTS/train_moore/dataset/audios/"
)

Map (num_proc=12):   0%|          | 0/1657 [00:00<?, ? examples/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['fr', 'text', 'audio', 'speaker_id', 'index', 'denoised_audio', 'enhanced_audio', 'lang', 'audio_file_path'],
        num_rows: 1657
    })
})

In [11]:
# Hold out 15% of the examples as a test split; the fixed seed makes the split repeatable.
dataset_dict = dataset["train"].train_test_split(test_size=0.15, seed=2024)
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['fr', 'text', 'audio', 'speaker_id', 'index', 'denoised_audio', 'enhanced_audio', 'lang', 'audio_file_path'],
        num_rows: 1408
    })
    test: Dataset({
        features: ['fr', 'text', 'audio', 'speaker_id', 'index', 'denoised_audio', 'enhanced_audio', 'lang', 'audio_file_path'],
        num_rows: 249
    })
})

In [12]:
# Folder that receives both metadata files.
dataset_path = "/teamspace/studios/this_studio/coqui-TTS/train_moore/dataset/"

# One metadata file per split: 'train' -> metadata.txt, 'test' -> metadata_val.txt.
create_metadata_file(dataset_dict['train'], output_dir=dataset_path, filename='metadata.txt')
create_metadata_file(dataset_dict['test'],  output_dir=dataset_path, filename='metadata_val.txt')

'/teamspace/studios/this_studio/coqui-TTS/train_moore/dataset/metadata_val.txt'

In [17]:
#! CUDA_VISIBLE_DEVICES="0" python /teamspace/studios/this_studio/coqui-TTS/train_moore/train_gpt_xtts.py

  return torch.load(f, map_location=map_location, **kwargs)
 > Loading checkpoint with 1552 additional tokens.
  self.mel_norms = torch.load(f)
  dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu"))
>> DVAE weights restored from: /teamspace/studios/this_studio/coqui-TTS/train_moore/run/training/CHECKPOINT_GPT_XTTS_v2.0_MOS_FT_2/dvae.pth
108it [00:00, 66322.82it/s]
 | > Found 108 files in /teamspace/studios/this_studio/coqui-TTS/train_moore/dataset
12it [00:00, 22036.62it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Num. of CPUs: 4
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > 