In [1]:
from datasets import (
    load_dataset,
    Dataset,
    DatasetDict,
    concatenate_datasets,
    Audio,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub repository id that the merged French + English ASR dataset is pushed to.
PUSH_DATA_TO = "BounharAbdelaziz/French-and-English-ASR"

# English

In [3]:
# Hub dataset id of the English source; the load cells below read this name.
DATA_PATH = "ylacombe/english_dialects"

In [4]:
# Train split of the "northern_female" configuration of the English source.
# NOTE(review): trust_remote_code=True presumably allows the dataset repo's own
# loading script to run — confirm it is still required for this dataset.
dataset_en_train_female = load_dataset(
    DATA_PATH,
    name="northern_female",
    split="train",
    trust_remote_code=True,
)

In [5]:
# Train split of the "northern_male" configuration of the English source.
# NOTE(review): trust_remote_code=True presumably allows the dataset repo's own
# loading script to run — confirm it is still required for this dataset.
dataset_en_train_male = load_dataset(
    DATA_PATH,
    name="northern_male",
    split="train",
    trust_remote_code=True,
)

In [6]:
# Display the female split summary (features and row count) as the cell output.
dataset_en_train_female

Dataset({
    features: ['line_id', 'audio', 'text', 'speaker_id'],
    num_rows: 750
})

In [7]:
# Display the male split summary (features and row count) as the cell output.
dataset_en_train_male

Dataset({
    features: ['line_id', 'audio', 'text', 'speaker_id'],
    num_rows: 2097
})

In [8]:
# Stack the female recordings followed by the male recordings into a single
# "train" split.
_en_parts = [dataset_en_train_female, dataset_en_train_male]
dataset_en = DatasetDict({"train": concatenate_datasets(_en_parts)})

In [9]:
# Drop the identifier columns so that only 'audio' and 'text' remain.
# Only columns that are still present are removed: once a first run has dropped
# them, asking remove_columns for them again would raise, so this guard keeps
# the cell safe to re-run.
_en_cols_to_drop = [
    col
    for col in ('line_id', 'speaker_id')
    if col in dataset_en["train"].column_names
]
if _en_cols_to_drop:
    dataset_en["train"] = dataset_en["train"].remove_columns(_en_cols_to_drop)

In [10]:
# Display the English DatasetDict summary as the cell output.
dataset_en

DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 2847
    })
})

# French

In [11]:
# Hub dataset id of the French source.
# NOTE(review): this rebinds DATA_PATH, which previously held the English
# dataset id; re-running the English load cells after this one would point
# them at the French dataset.
DATA_PATH = "AdrienB134/Emilia-dataset-french-with-gender"

In [12]:
# Load the French source.
# NOTE(review): the "test" split is loaded here and later stored under a
# "train" key — confirm this is intended.
dataset_fr = load_dataset(
    DATA_PATH,
    split="test",
    trust_remote_code=True,
)

In [13]:
# Display the French dataset summary (features and row count) as the cell output.
dataset_fr

Dataset({
    features: ['__key__', '__url__', 'json', 'audio', 'text', 'speaker_id', 'gender'],
    num_rows: 1088
})

## Keep relevant columns

In [14]:
# French columns that are not kept; removing them leaves 'audio' and 'text'.
columns_to_drop_fr = [
    '__key__',
    '__url__',
    'json',
    'speaker_id',
    'gender',
]

In [15]:
# Remove the unused metadata columns from the French dataset.
dataset_fr = dataset_fr.remove_columns(column_names=columns_to_drop_fr)

In [16]:
# Wrap the cleaned French data in a DatasetDict under a "train" key,
# mirroring the layout of dataset_en.
dataset_fr = DatasetDict({"train": dataset_fr})

## Add language

In [17]:
# Tag every row with the language of its source dataset.
# The column is only added when it is missing: add_column rejects a name that
# already exists, so without the guard re-running this cell would raise.
for _ds, _language in ((dataset_en, "english"), (dataset_fr, "french")):
    if 'language' not in _ds["train"].column_names:
        _ds["train"] = _ds["train"].add_column(
            'language', [_language] * len(_ds["train"])
        )

## Add dataset source

In [18]:
# Record, for every row, which upstream dataset it came from.
# The column is only added when it is missing: add_column rejects a name that
# already exists, so without the guard re-running this cell would raise.
for _ds, _source in (
    (dataset_en, "ylacombe/english_dialects/northern_male_female"),
    (dataset_fr, "AdrienB134/Emilia-dataset-french-with-gender"),
):
    if 'dataset_source' not in _ds["train"].column_names:
        _ds["train"] = _ds["train"].add_column(
            'dataset_source', [_source] * len(_ds["train"])
        )

## Cast to 16 kHz if needed

In [19]:
# Print the feature schema of both datasets (French first, then English) to
# check the audio column's sampling rate before casting.
for _ds in (dataset_fr, dataset_en):
    print(_ds["train"].features)

{'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'text': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'dataset_source': Value(dtype='string', id=None)}
{'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'text': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'dataset_source': Value(dtype='string', id=None)}


In [20]:
# Cast the audio column of both datasets to an Audio feature with a
# sampling rate of 16000. A fresh Audio object is built for each dataset.
for _ds in (dataset_en, dataset_fr):
    _ds["train"] = _ds["train"].cast_column("audio", Audio(sampling_rate=16000))

# Merge

In [21]:
# Concatenate the English rows followed by the French rows into one dataset.
_en_fr_parts = [dataset_en["train"], dataset_fr["train"]]
train_dataset = concatenate_datasets(_en_fr_parts)

In [22]:
# Display the merged dataset summary (features and row count) as the cell output.
train_dataset

Dataset({
    features: ['audio', 'text', 'language', 'dataset_source'],
    num_rows: 3935
})

In [23]:
# Wrap the merged dataset in a DatasetDict under a "train" key.
train_dataset = DatasetDict({"train": train_dataset})

## Rename column to transcription

In [24]:
# Rename the 'text' column to 'transcription'.
# The rename is skipped when 'text' is no longer present, so re-running this
# cell after a successful rename does not fail on the missing column.
if 'text' in train_dataset["train"].column_names:
    train_dataset["train"] = train_dataset["train"].rename_column(
        'text', 'transcription'
    )

In [25]:
# Display the DatasetDict summary to confirm the renamed column.
train_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 3935
    })
})

In [26]:
# Upload the merged French + English dataset to the Hub repository named by
# PUSH_DATA_TO; the call's return value is shown as the cell output.
train_dataset.push_to_hub(
    PUSH_DATA_TO,
    commit_message="French and English ASR data",
)

Map: 100%|██████████| 984/984 [00:01<00:00, 591.45 examples/s]it/s]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:02<00:00,  4.81ba/s]
Map: 100%|██████████| 984/984 [00:04<00:00, 200.27 examples/s]1, 23.99s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:01<00:00,  5.18ba/s]
Map: 100%|██████████| 984/984 [00:04<00:00, 209.39 examples/s]9, 19.64s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:01<00:00,  5.48ba/s]
Map: 100%|██████████| 983/983 [00:00<00:00, 3184.86 examples/s], 17.93s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 96.63ba/s]
Uploading the dataset shards: 100%|██████████| 4/4 [00:59<00:00, 14.91s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/French-and-English-ASR/commit/ca019d345a43afa99305fb1422fd35b3fa52f14d', commit_message='French and English ASR data', commit_description='', oid='ca019d345a43afa99305fb1422fd35b3fa52f14d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/French-and-English-ASR', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/French-and-English-ASR'), pr_revision=None, pr_num=None)

# Mix with Moroccan data

In [32]:
# Hub dataset id of the Moroccan Darija / Amazigh ASR data that is mixed in below.
DATA_MA = "BounharAbdelaziz/Morocco-Darija-and-Amazigh-ASR"
# Hub repository id that the combined (Moroccan + English + French) dataset is pushed to.
PUSH_MIXED_DATA_TO = "BounharAbdelaziz/Mixed-Morocco-Darija-Amazigh-English-and-French-ASR"

In [28]:
# Load every split of the Moroccan dataset; the "train" and "validation"
# splits are used in the cells that follow.
data_ma = load_dataset(path=DATA_MA)

Generating train split: 100%|██████████| 11831/11831 [00:08<00:00, 1346.38 examples/s]
Generating validation split: 100%|██████████| 2110/2110 [00:04<00:00, 525.79 examples/s] 


In [29]:
# Append the French + English training rows after the Moroccan training rows.
_mixed_parts = [data_ma['train'], train_dataset['train']]
train_dataset_mixed = concatenate_datasets(_mixed_parts)

In [30]:
# Assemble the final DatasetDict: the mixed training rows plus the Moroccan
# validation split, which is reused unchanged.
mixed_dataset = DatasetDict()
mixed_dataset["train"] = train_dataset_mixed
mixed_dataset["validation"] = data_ma["validation"]

In [31]:
# Display the summary of the mixed DatasetDict (train and validation splits).
mixed_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 15766
    })
    validation: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 2110
    })
})

In [33]:
# Upload the mixed dataset to the Hub repository named by PUSH_MIXED_DATA_TO;
# the call's return value is shown as the cell output.
mixed_dataset.push_to_hub(
    PUSH_MIXED_DATA_TO,
    commit_message="mixed all current ASR data for the moroccan, amazigh, english and french languages.",
)

Map: 100%|██████████| 1752/1752 [00:01<00:00, 1610.02 examples/s]s]
Creating parquet from Arrow format: 100%|██████████| 18/18 [00:00<00:00, 20.57ba/s]
Map: 100%|██████████| 1752/1752 [00:01<00:00, 1291.06 examples/s]17.00s/it]
Creating parquet from Arrow format: 100%|██████████| 18/18 [00:00<00:00, 19.65ba/s]
Map: 100%|██████████| 1752/1752 [00:02<00:00, 603.79 examples/s] 17.74s/it]
Creating parquet from Arrow format: 100%|██████████| 18/18 [00:02<00:00,  8.37ba/s]
Map: 100%|██████████| 1752/1752 [00:00<00:00, 2741.34 examples/s]19.52s/it]
Creating parquet from Arrow format: 100%|██████████| 18/18 [00:00<00:00, 41.48ba/s]
Map: 100%|██████████| 1752/1752 [00:00<00:00, 6811.36 examples/s]14.51s/it]
Creating parquet from Arrow format: 100%|██████████| 18/18 [00:00<00:00, 152.03ba/s]
Map: 100%|██████████| 1752/1752 [00:00<00:00, 9488.41 examples/s] 0.55s/it]
Creating parquet from Arrow format: 100%|██████████| 18/18 [00:00<00:00, 282.04ba/s]
Map: 100%|██████████| 1752/1752 [00:00<00:00, 

CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Mixed-Morocco-Darija-Amazigh-English-and-French-ASR/commit/21ed6b067692a768c2c7828a2a86cfb92b7c330c', commit_message='mixed all current ASR data for the moroccan, amazigh, english and french languages.', commit_description='', oid='21ed6b067692a768c2c7828a2a86cfb92b7c330c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Mixed-Morocco-Darija-Amazigh-English-and-French-ASR', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Mixed-Morocco-Darija-Amazigh-English-and-French-ASR'), pr_revision=None, pr_num=None)