In [1]:
#import shutil
#! pip install transformers datasets tokenizers

In [2]:
"""!pip uninstall -y torch
!pip install torch==2.4.1"""

'!pip uninstall -y torch\n!pip install torch==2.4.1'

In [3]:
from huggingface_hub import HfApi, HfFolder
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers import normalizers
from typing import List
import re
import tokenizers
import json
import os
import random
import soundfile as sf
import numpy as np
from types import NoneType

In [None]:
from huggingface_hub import HfApi, HfFolder
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process
# environment so the token does not have to be hard-coded in the notebook.
load_dotenv()

# Read the Hugging Face access token; os.getenv returns None when HF_TOKEN is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

def login_hugging_face(token: str) -> None:
    """
    Log in to the Hugging Face Hub with the given access token.

    The token is registered through ``huggingface_hub.login`` so that later
    Hub calls made from this session are authenticated.

    Args:
        token: A Hugging Face user access token.

    Raises:
        ValueError: If ``token`` is ``None``, empty or only whitespace, which
            usually means ``HF_TOKEN`` was not set in the environment.
    """
    # Fail loudly instead of reporting a successful login that never happened.
    if not token or not token.strip():
        raise ValueError(
            "No Hugging Face token provided; set HF_TOKEN in the environment or in a .env file."
        )

    # Local import keeps this helper self-contained: the notebook's import
    # cells only bring in HfApi and HfFolder from huggingface_hub.
    from huggingface_hub import login

    # Merely constructing an HfApi client (what this function used to do)
    # authenticates nothing; login() registers the token for the session.
    login(token=token.strip())
    return None

# Call the login helper with the token read from the environment, then report completion.
login_hugging_face(HF_TOKEN)
print('We are logged in to Hugging Face now!')

In [None]:
# Load the "train" split of the dataset whose "mos" column is used below to train the tokenizer.
mos_ds = load_dataset("ArissBandoss/moore-data-webscraping-full-CSV", split="train")
# Bare expression: shows the dataset summary as the cell output.
mos_ds

In [None]:
class VoiceMooreTextPreprocessor:
    """Minimal text normaliser applied before tokenizer training and encoding.

    The only normalisation performed is lower-casing; missing values are
    mapped to the empty string.
    """

    def preprocess_batch(self, texts: List[str]) -> List[str]:
        """Apply :meth:`preprocess` to every item of ``texts``, preserving order."""
        return [self.preprocess(text) for text in texts]

    def preprocess(self, text: str) -> str:
        """Return ``text`` lower-cased.

        Args:
            text: The sentence to normalise. ``None`` (a missing row) is accepted.

        Returns:
            The lower-cased text, or ``""`` when ``text`` is ``None``.
        """
        # A missing value must not become the literal word "none" (which is
        # what str(None).lower() yields): it would be fed to the tokenizer
        # trainer as if it were real text. `is None` also avoids depending on
        # types.NoneType, which only exists on Python 3.10+.
        if text is None:
            return ""
        return text.lower()

In [None]:
# BPE tokenizer model configured with "[UNK]" as its unknown token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Input is split by the Whitespace pre-tokenizer before the BPE model is applied.
tokenizer.pre_tokenizer = Whitespace()

# Trainer for a vocabulary of 2000 entries; the special tokens are passed
# explicitly so they are included in the trained vocabulary.
trainer = BpeTrainer(
    vocab_size=2000,
    special_tokens=["[STOP]", "[UNK]", "[SPACE]", "[START]", "[mos]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

# Shared normaliser, used both for training and for the encoding examples below.
text_preprocessor = VoiceMooreTextPreprocessor()

In [None]:
def batch_iterator(batch_size=1000):
    """Yield the preprocessed "mos" column of ``mos_ds`` in consecutive slices.

    Args:
        batch_size: Maximum number of rows per yielded batch.

    Yields:
        The output of ``text_preprocessor.preprocess_batch`` for each slice of
        up to ``batch_size`` rows; the last slice may be shorter.
    """
    starts = range(0, len(mos_ds), batch_size)
    yield from (
        text_preprocessor.preprocess_batch(mos_ds[start:start + batch_size]["mos"])
        for start in starts
    )

In [None]:
# Train the BPE tokenizer on the preprocessed batches; `length` is given as the number of dataset rows.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(mos_ds))

In [None]:
# Show the current working directory and list its contents.
print(os.getcwd())
!ls

In [None]:
# NOTE(review): absolute, environment-specific path — this cell fails on any
# machine where this directory does not exist.
os.chdir("/teamspace/studios/this_studio/coqui-TTS/train_moore")
print(os.getcwd())
# Write the trained tokenizer to ./saved, relative to the directory set above.
# NOTE(review): presumably ./saved must already exist — confirm.
tokenizer.save("./saved/mos_vocab.json")

In [None]:
# Encode the first 10 sentences as a quick sanity check of the trained tokenizer.
# The rows are sliced first (mos_ds[:10]["mos"]) so only 10 rows are read,
# rather than loading the whole "mos" column and slicing it afterwards; this
# is the same access pattern batch_iterator uses.
outputs = tokenizer.encode_batch(text_preprocessor.preprocess_batch(mos_ds[:10]["mos"]))
outputs

In [None]:
# Inspect the token strings produced for the sixth encoded sentence (index 5).
outputs[5].tokens

In [None]:
# Encode a single sample sentence. It is stored under its own name so that
# `outputs` (the list of batch encodings indexed as outputs[5] in an earlier
# cell) is not rebound to a single Encoding, which would break re-running
# that cell.
sample_encoding = tokenizer.encode(text_preprocessor.preprocess("b sẽn deeg bi wã"))
sample_encoding.tokens

In [None]:
def integrate_vocabs(main_vocab_path, mos_vocab_path, output_dir):
    """
    Merge the Moore BPE vocabulary into the main tokenizer file.

    Only ``model.vocab`` and ``model.merges`` are combined; every other field
    of the main tokenizer file is written back unchanged.

    Args:
        main_vocab_path: Path to the main tokenizer JSON file.
        mos_vocab_path: Path to the Moore tokenizer JSON file.
        output_dir: Directory that receives ``combined_vocab.json``. It is
            created if it does not exist.

    Returns:
        The path of the written ``combined_vocab.json``.
    """
    with open(main_vocab_path, 'r', encoding='utf-8') as f:
        main_vocab = json.load(f)
    with open(mos_vocab_path, 'r', encoding='utf-8') as f:
        mos_vocab = json.load(f)

    main_token_ids = main_vocab['model']['vocab']
    mos_token_ids = mos_vocab['model']['vocab']

    # New tokens are appended after the highest existing id. They are visited
    # in the order of their Moore ids (not in set order) so that the resulting
    # ids are identical from one run to the next.
    next_id = max(main_token_ids.values(), default=-1) + 1
    for token in sorted(mos_token_ids, key=mos_token_ids.get):
        if token not in main_token_ids:
            main_token_ids[token] = next_id
            next_id += 1

    main_merges = main_vocab['model']['merges']
    mos_merges = mos_vocab['model']['merges']

    def as_pair(merge):
        # A merge is serialized either as the string "left right" or as the
        # two-item list ["left", "right"]; compare both through a tuple.
        return tuple(merge.split(' ')) if isinstance(merge, str) else tuple(merge)

    # Keep the serialization style of the main file (fall back to the Moore
    # file's style when the main file has no merges to take the style from).
    style_source = main_merges or mos_merges
    merges_are_strings = bool(style_source) and isinstance(style_source[0], str)

    # BPE applies merges by rank, so the Moore merges must be appended in
    # their original order; a set is used only for the membership test.
    known_pairs = {as_pair(merge) for merge in main_merges}
    for merge in mos_merges:
        pair = as_pair(merge)
        if pair in known_pairs:
            continue
        known_pairs.add(pair)
        main_merges.append(' '.join(pair) if merges_are_strings else list(pair))

    # Save the updated vocabulary
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_vocab_path = os.path.join(output_dir, 'combined_vocab.json')
    with open(output_vocab_path, 'w', encoding='utf-8') as f:
        json.dump(main_vocab, f, ensure_ascii=False, indent=2)

    print(f"Updated vocabulary saved to {output_vocab_path}")
    return output_vocab_path

In [None]:
# List the contents of the current working directory.
!ls

In [None]:
# Specify the paths to the main (XTTS default) and Moore vocab files
main_vocab_path = './saved/xtts_default_vocab.json'
mos_vocab_path = './saved/mos_vocab.json'
output_dir = './saved'

# Integrate the Moore vocab into the main vocab and save the updated vocab
updated_vocab_path = integrate_vocabs(main_vocab_path, mos_vocab_path, output_dir)

In [None]:
# Load the merged vocabulary file written by integrate_vocabs as a Tokenizer.
combined_tokenizer = Tokenizer.from_file("./saved/combined_vocab.json")

In [None]:
# Token ids the combined tokenizer assigns to a sample sentence.
combined_tokenizer.encode("b sẽn deeg bi wã").ids

In [None]:
# Load the audio/text dataset (no split given); its 'train' split is the
# source of the reference audio exported below.
mos_multi_ds = load_dataset("ArissBandoss/sentences-audio-texte-denoised-enhanced")
mos_multi_ds

In [None]:
def select_and_save_audio_samples(dataset, speaker_id, num_samples=10, audio_column='audio', seed=None):
    """
    Save a random selection of one speaker's audio clips as WAV files.

    The clips are written to ``./reference_audios/speaker_<speaker_id>/`` as
    ``0.wav``, ``1.wav``, ... in the order in which they were drawn.

    Args:
    dataset (Dataset): The Hugging Face dataset; it must have a ``speaker_id``
        column and the audio column named by ``audio_column``.
    speaker_id: The value compared (with ``==``) against the ``speaker_id`` column.
    num_samples (int): The number of clips to draw, without replacement.
    audio_column (str): The name of the column whose entries provide the
        waveform under ``'array'`` and its rate under ``'sampling_rate'``.
    seed (int, optional): Seed for a private random generator, making the
        selection reproducible. When ``None`` (default) the global ``random``
        state is used.

    Raises:
    ValueError: If the speaker has fewer than ``num_samples`` clips.
    """
    # Filter the dataset for the specified speaker
    speaker_data = dataset.filter(lambda ex: [x == speaker_id for x in ex['speaker_id']], batched=True, batch_size=100)

    # Check if there are enough samples for the requested number
    if len(speaker_data) < num_samples:
        raise ValueError("The number of samples requested exceeds the number available for this speaker.")

    # Draw row positions rather than rows: turning the whole filtered dataset
    # into a list would load every clip of the speaker just to keep a few.
    rng = random if seed is None else random.Random(seed)
    selected_rows = rng.sample(range(len(speaker_data)), num_samples)

    # Create the directory for the speaker if it does not exist
    speaker_dir = f'./reference_audios/speaker_{speaker_id}/'
    os.makedirs(speaker_dir, exist_ok=True)

    # Save the selected audio files
    for index, row in enumerate(selected_rows):
        # Only the drawn rows are fetched from the dataset.
        sample = speaker_data[row]
        audio_data = sample[audio_column]['array']
        sample_rate = sample[audio_column]['sampling_rate']
        destination_path = os.path.join(speaker_dir, f'{index}.wav')
        # Write the audio file
        sf.write(destination_path, audio_data, sample_rate)
        print(f"Saved: {destination_path}")

In [None]:
# Export the default number of clips (10) for speaker 17 from the 'train' split.
select_and_save_audio_samples(mos_multi_ds['train'], speaker_id=17)