In [1]:
%%capture

#!pip install resemble-enhance --upgrade
#!pip uninstall -y datasets transformers pandas tabulate
#!pip install --upgrade datasets transformers pandas tabulate

In [2]:
import os
from getpass import getpass

from huggingface_hub import HfApi, HfFolder
import torch
import torchaudio
from datasets import load_dataset, Audio, concatenate_datasets, Dataset, DatasetDict
from resemble_enhance.enhancer.inference import denoise, enhance
import pandas as pd

# Use the GPU when PyTorch reports one as available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

from huggingface_hub import HfApi, HfFolder
def login_hugging_face(token: str) -> None:
    """
    Log in to the Hugging Face portal with a given token.

    The token is stored by calling ``HfFolder.save_token``.

    Args:
        token: Hugging Face access token to save.

    Raises:
        ValueError: If ``token`` is empty or contains only whitespace.
    """
    # Fail early with a clear message instead of saving an unusable token.
    if not token or not token.strip():
        raise ValueError("A non-empty Hugging Face token is required.")
    # The previously created ``HfApi(token=token)`` instance was never used,
    # so only the token-saving step is kept.
    HfFolder().save_token(token)

# Never embed credentials in the notebook: read the token from the HF_TOKEN
# environment variable, and fall back to a non-echoing prompt when it is unset.
# NOTE(security): the token that used to be hard-coded in this cell has been
# exposed and must be revoked in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login_hugging_face(token)

print('We are logged in to Hugging Face now!')

[2024-11-05 21:37:55,901] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)
We are logged in to Hugging Face now!


## Load old dataset (already includes a `denoised_audio` column): ArissBandoss/moore-tts-full-dataset

In [3]:
# Repository and split of the existing (old) dataset.
dataset_name, split = "ArissBandoss/moore-tts-full-dataset", "train"

# The latest revision is loaded; a pin to
# revision="e9c0251804007c5f528e3d970813b19afe4f744b" was left disabled.
dataset = load_dataset(dataset_name, split=split)
dataset

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/18 [00:00<?, ?it/s]

Dataset({
    features: ['audio', 'text', 'source', 'speaker_id', 'denoised_audio'],
    num_rows: 22794
})

## Filter new dataset: ArissBandoss/moore-tts-new-yt-dataset

In [4]:
# Repository and split of the newly collected dataset.
new_dataset_name, split = "ArissBandoss/moore-tts-new-yt-dataset", "train"

new_dataset = load_dataset(new_dataset_name, split=split)
new_dataset

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/20 [00:00<?, ?it/s]

Dataset({
    features: ['audio_IDs', 'audio', 'text', 'audio_length', 'valid'],
    num_rows: 14281
})

In [8]:
# Load the .txt file with corrections and deletions.
# Every non-blank line is expected to look like "<audio_id>|<text>"; only
# entries whose ID starts with "LJ" are kept in `corrections`.
corrections = {}
with open("mosXTTS/moore_dataset_filtering.txt", "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            continue
        # Split on the first "|" only, so a transcription that itself contains
        # a pipe character no longer fails with an unpacking error.
        audio_id, separator, text = line.partition("|")
        if not separator:
            raise ValueError(
                f"Line {line_number} of the corrections file has no '|' separator: {line!r}"
            )
        audio_id, text = audio_id.strip(), text.strip()
        if audio_id.startswith("LJ"):
            corrections[audio_id] = text


# Filter and update the dataset based on the corrections dictionary
def update_transcription(example):
    """
    Apply the matching entry of ``corrections`` to a single dataset row.

    Rows whose 'audio_IDs' value has no entry are returned untouched. When the
    entry contains "delete" (case-insensitive), 'text' is set to ``None`` to
    mark the row for deletion; otherwise 'text' is replaced by the entry.
    """
    key = example['audio_IDs']
    if key not in corrections:
        return example

    replacement = corrections[key]
    # None marks the row for deletion; anything else is the new transcription.
    example['text'] = None if "delete" in replacement.lower() else replacement
    return example

# Apply the updates
updated_dataset = new_dataset.map(update_transcription)
print(updated_dataset)


def _row_is_kept(row):
    """Return True for rows not marked for deletion whose ID number is in a selected range."""
    if row['text'] is None:
        return False
    # The number is whatever follows the first two characters of 'audio_IDs'.
    number = int(row['audio_IDs'][2:])
    return (
        1 <= number <= 1501
        or 5001 <= number <= 5038
        or 7500 <= number <= 8100
    )


# Filter out the rows marked for deletion
updated_dataset = updated_dataset.filter(_row_is_kept)
print(updated_dataset)

Map:   0%|          | 0/14281 [00:00<?, ? examples/s]

Dataset({
    features: ['audio_IDs', 'audio', 'text', 'audio_length', 'valid'],
    num_rows: 14281
})


Filter:   0%|          | 0/14281 [00:00<?, ? examples/s]

Dataset({
    features: ['audio_IDs', 'audio', 'text', 'audio_length', 'valid'],
    num_rows: 1511
})


In [9]:
# Target columns: ['audio', 'text', 'source', 'speaker_id', 'denoised_audio']
# Drop only the helper columns that are still present, so re-running this cell
# does not fail after they have already been removed.
extra_columns = [
    name
    for name in ("audio_IDs", "audio_length", "valid")
    if name in updated_dataset.column_names
]
if extra_columns:
    updated_dataset = updated_dataset.remove_columns(extra_columns)

updated_dataset = updated_dataset.map(lambda x: {
    "source": "yt",
    "speaker_id": 97,
    # NOTE(review): this is a plain copy of 'audio'; no denoising is applied here.
    "denoised_audio": x["audio"]  # Copy 'audio' to 'denoised_audio'
})

print(updated_dataset)

Map:   0%|          | 0/1511 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'text', 'source', 'speaker_id', 'denoised_audio'],
    num_rows: 1511
})


In [10]:
# Cast both audio columns of the new and the old dataset to Audio features
# with a sampling rate of 16000.
for column in ("denoised_audio", "audio"):
    updated_dataset = updated_dataset.cast_column(column, Audio(sampling_rate=16000))

for column in ("denoised_audio", "audio"):
    dataset = dataset.cast_column(column, Audio(sampling_rate=16000))

In [11]:
# `dataset` and `updated_dataset` are both single-split `Dataset` objects
# (loaded with split="train"), so they are concatenated row-wise.
merged_train = concatenate_datasets([dataset, updated_dataset])
assert len(merged_train) == len(dataset) + len(updated_dataset), "rows lost while merging"

# Wrap the merged rows in a DatasetDict under the 'train' split.
merged_dataset = DatasetDict({'train': merged_train})
merged_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'text', 'source', 'speaker_id', 'denoised_audio'],
        num_rows: 24305
    })
})

In [13]:
# Publish the merged dataset to its Hugging Face repository.
target_repo_id = "ArissBandoss/moore-tts-asr-dataset"
merged_dataset.push_to_hub(target_repo_id)

Uploading the dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1158 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Map:   0%|          | 0/1157 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/513 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ArissBandoss/moore-tts-asr-dataset/commit/57d1a8da0842e6c53b565fa3a5faeb07149e9650', commit_message='Upload dataset', commit_description='', oid='57d1a8da0842e6c53b565fa3a5faeb07149e9650', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ArissBandoss/moore-tts-asr-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ArissBandoss/moore-tts-asr-dataset'), pr_revision=None, pr_num=None)