In [1]:
from datasets import (
    load_dataset,
    Dataset,
    DatasetDict,
    concatenate_datasets,
    Audio,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub repository IDs of the four source datasets merged in this notebook.
DATA_PATH_1 = "BounharAbdelaziz/Moroccan-Darija-STT-Dataset"  # loaded below as `dataset_adrien`
DATA_PATH_2 = "BounharAbdelaziz/Casablanca_cleaned"  # loaded below as `dataset_casa`
DATA_PATH_3 = "BounharAbdelaziz/Dvoice_cleaned"  # loaded below as `dataset_dvoice`
DATA_PATH_4 = "BounharAbdelaziz/Amazigh_ASR"  # loaded below as `dataset_amazigh`

In [20]:
# Hub repository that the merged Darija + Amazigh dataset is pushed to.
# NOTE(review): this name is rebound further down for the Darija-only push.
PUSH_DATA_TO = "BounharAbdelaziz/Morocco-Darija-and-Amazigh-ASR"

# Moroccan Arabic and Amazigh Data

In [None]:
# Load the dataset stored under DATA_PATH_1; its "train" and "validation"
# splits are both used further down.
dataset_adrien = load_dataset(path=DATA_PATH_1)

In [4]:
# Load the dataset stored under DATA_PATH_2; only its "train" split is used below.
dataset_casa = load_dataset(path=DATA_PATH_2)

In [5]:
# Load the dataset stored under DATA_PATH_3; only its "train" split is used below.
dataset_dvoice = load_dataset(path=DATA_PATH_3)

In [6]:
# Load the dataset stored under DATA_PATH_4; only its "train" split is used below.
dataset_amazigh = load_dataset(path=DATA_PATH_4)

## Add language

In [7]:
# Tag every row with the language of the corpus it belongs to.
# Each entry is (dataset dict, split name, language label).
_language_tags = [
    (dataset_adrien, "train", "moroccan_darija"),
    (dataset_adrien, "validation", "moroccan_darija"),
    (dataset_casa, "train", "moroccan_darija"),
    (dataset_dvoice, "train", "moroccan_darija"),
    (dataset_amazigh, "train", "amazigh"),
]
for _dataset_dict, _split, _language in _language_tags:
    _rows = _dataset_dict[_split]
    # add_column needs one value per row, hence the repeated label.
    _dataset_dict[_split] = _rows.add_column('language', [_language] * len(_rows))

In [8]:
# Display the first validation example to check that the new 'language' field is present.
dataset_adrien["validation"][0]

{'audio': {'path': 'a1pVO40Tzkw_segment_86.mp3',
  'array': array([-0.0167015 ,  0.00307631,  0.0022051 , ..., -0.00553913,
         -0.00444421, -0.0047589 ]),
  'sampling_rate': 16000},
 'transcription': 'شي حاجة فمشات قبل ما تجي عندي انا يعني مشات عند مجموعة ديال الرقاة فالإنسان بالسبب الإنسان يتخذ بالسبب من اجل العلاج فمني جات فعلا صرعت كاينة الحالة اللي كتعالج على الحصة وحدة في الحصة ديال التشخيص كتجي كتبغي تشخص الحالة ديالها تعرف شنو عندها فهاديك الحصة كييسر ليها الله تبارك وتعالى في العلاج. كاينين ثلاثة الحصات. كاينين سبعة الحصات على حسب النوع',
 'language': 'moroccan_darija'}

## Add source

In [9]:
# Record, for every row, the upstream corpus it originates from.
# Each entry is (dataset dict, split name, source identifier).
_source_tags = [
    (dataset_adrien, "train", "adiren7/darija_speech_to_text"),
    (dataset_adrien, "validation", "adiren7/darija_speech_to_text"),
    (dataset_casa, "train", "UBC-NLP/Casablanca"),
    (dataset_dvoice, "train", "dvoice"),
    (dataset_amazigh, "train", "TifinLab/amazigh_moroccan_asr"),
]
for _dataset_dict, _split, _source in _source_tags:
    _rows = _dataset_dict[_split]
    # add_column needs one value per row, hence the repeated identifier.
    _dataset_dict[_split] = _rows.add_column('dataset_source', [_source] * len(_rows))

## Extract only audio, text, language and source

In [11]:
# The three corpora drop the same set of extra columns. Each name gets its own
# list copy so that editing one later does not affect the others.
_extra_columns = [
    'transcription_darija_ltn',
    'intent',
    'sentiment',
    'gender_label',
    'lead_time',
    'sample_rate',
    'split',
    'duration',
]
columns_to_drop_casa = list(_extra_columns)
columns_to_drop_dvoice = list(_extra_columns)
columns_to_drop_amazigh = list(_extra_columns)

In [12]:
# Drop the extra columns from each train split, pairing every dataset with
# its own drop list.
_drop_plan = (
    (dataset_casa, columns_to_drop_casa),
    (dataset_dvoice, columns_to_drop_dvoice),
    (dataset_amazigh, columns_to_drop_amazigh),
)
for _dataset_dict, _columns in _drop_plan:
    _dataset_dict["train"] = _dataset_dict["train"].remove_columns(_columns)

## Rename column to transcription

In [13]:
# Rename the text column of the Casablanca and DVoice train splits to
# 'transcription', the column name shown for dataset_adrien above.
for _dataset_dict in (dataset_casa, dataset_dvoice):
    _dataset_dict["train"] = _dataset_dict["train"].rename_column(
        'transcription_darija_ar', 'transcription'
    )

## Cast to 16 kHz if needed

In [14]:
# Print the schema of every train split (same order as before) so the audio
# sampling rates can be compared side by side.
_train_splits = (
    dataset_adrien["train"],
    dataset_dvoice["train"],
    dataset_casa["train"],
    dataset_amazigh["train"],
)
for _train_split in _train_splits:
    print(_train_split.features)

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'transcription': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'dataset_source': Value(dtype='string', id=None)}
{'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'transcription': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'dataset_source': Value(dtype='string', id=None)}
{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'transcription': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'dataset_source': Value(dtype='string', id=None)}
{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'transcription': Value(dtype='string', id=None), 'language': Value(dtype='string', id=None), 'dataset_source': Value(dtype='string', id=None)}


In [15]:
# The DVoice audio feature reports sampling_rate=None in the schema printout,
# so cast it to 16 kHz like the other corpora.
_audio_16khz = Audio(sampling_rate=16000)
dataset_dvoice["train"] = dataset_dvoice["train"].cast_column("audio", _audio_16khz)

In [16]:
# Stack the four train splits, in this order, into a single train set.
_train_parts = [
    dataset_adrien["train"],
    dataset_dvoice["train"],
    dataset_casa["train"],
    dataset_amazigh["train"],
]
train_dataset = concatenate_datasets(_train_parts)

In [17]:
# Display the merged train set's columns and row count.
train_dataset

Dataset({
    features: ['audio', 'transcription', 'language', 'dataset_source'],
    num_rows: 11831
})

In [18]:
# Assemble the final splits: the merged train set, plus the validation split
# of dataset_adrien (the only validation split handled in this notebook).
dataset = DatasetDict()
dataset["train"] = train_dataset
dataset["validation"] = dataset_adrien["validation"]

In [21]:
# Upload both splits to the Hub repository named by PUSH_DATA_TO.
dataset.push_to_hub(
    repo_id=PUSH_DATA_TO,
    commit_message="Grouped all moroccan arabic and amazigh STT data",
)

Map: 100%|██████████| 2367/2367 [00:02<00:00, 841.35 examples/s]/s]
Creating parquet from Arrow format: 100%|██████████| 24/24 [00:01<00:00, 18.23ba/s]
Map: 100%|██████████| 2366/2366 [00:05<00:00, 412.91 examples/s] 19.96s/it]
Creating parquet from Arrow format: 100%|██████████| 24/24 [00:01<00:00, 13.08ba/s]
Map: 100%|██████████| 2366/2366 [00:06<00:00, 388.73 examples/s] 22.06s/it]
Creating parquet from Arrow format: 100%|██████████| 24/24 [00:01<00:00, 14.42ba/s]
Map: 100%|██████████| 2366/2366 [00:00<00:00, 4081.43 examples/s]20.62s/it]
Creating parquet from Arrow format: 100%|██████████| 24/24 [00:00<00:00, 136.82ba/s]
Map: 100%|██████████| 2366/2366 [00:00<00:00, 6654.09 examples/s]14.01s/it]
Creating parquet from Arrow format: 100%|██████████| 24/24 [00:00<00:00, 242.17ba/s]
Uploading the dataset shards: 100%|██████████| 5/5 [01:09<00:00, 13.99s/it]
Map: 100%|██████████| 1055/1055 [00:00<00:00, 1553.25 examples/s]s]
Creating parquet from Arrow format: 100%|██████████| 11/11 [00

CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Morocco-Darija-and-Amazigh-ASR/commit/87c3fcff54546d23afe682729a374b6c5b15e001', commit_message='Grouped all moroccan arabic and amazigh STT data', commit_description='', oid='87c3fcff54546d23afe682729a374b6c5b15e001', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Morocco-Darija-and-Amazigh-ASR', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Morocco-Darija-and-Amazigh-ASR'), pr_revision=None, pr_num=None)

# Keep only Moroccan Darija first

In [8]:
# Source: the merged Darija + Amazigh dataset pushed earlier in this notebook.
DATA_PATH = "BounharAbdelaziz/Morocco-Darija-and-Amazigh-ASR"
# Destination for the Darija-only dataset (rebinds the earlier PUSH_DATA_TO).
PUSH_DATA_TO = "BounharAbdelaziz/Morocco-Darija-ASR"

In [4]:
# Load the merged dataset (train + validation splits) stored under DATA_PATH.
dataset = load_dataset(path=DATA_PATH)

Generating train split: 100%|██████████| 11831/11831 [00:08<00:00, 1356.82 examples/s]
Generating validation split: 100%|██████████| 2110/2110 [00:03<00:00, 550.25 examples/s] 


In [5]:
# Display the split sizes right after loading.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 11831
    })
    validation: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 2110
    })
})

In [6]:
# Keep only the Moroccan Darija rows of the train split.
# `input_columns` hands the predicate just the `language` value instead of the
# whole row, so the filter does not have to materialise the audio column for
# every example simply to compare one string.
dataset['train'] = dataset['train'].filter(
    lambda language: language == 'moroccan_darija',
    input_columns=['language'],
)

Filter: 100%|██████████| 11831/11831 [01:24<00:00, 140.42 examples/s]


In [10]:
# Display the split sizes after keeping only the Moroccan Darija train rows.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 7591
    })
    validation: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 2110
    })
})

## Make the validation split smaller

In [13]:
# Current size of the validation split, and how many rows it should keep.
n_samples_in_val = len(dataset['validation'])
n_samples_to_keep_in_val = 300
# Rows to move into train. Clamped at 0 so that a validation split that already
# has n_samples_to_keep_in_val rows or fewer moves nothing, instead of producing
# a negative count and invalid index ranges in the split cell.
n_samples_to_put_in_train = max(0, n_samples_in_val - n_samples_to_keep_in_val)

# Seed for the shuffle that decides which validation rows are moved.
seed = 1998

In [14]:
# Sample n_samples_to_put_in_train rows from the validation split
sampled_validation = dataset['validation'].shuffle(seed=seed).select(range(n_samples_to_put_in_train))

# Concatenate the sampled rows with the train split
dataset['train'] = concatenate_datasets([dataset['train'], sampled_validation])

# Remove the sampled rows from the validation split
dataset['validation'] = dataset['validation'].shuffle(seed=seed).select(range(n_samples_to_put_in_train, n_samples_in_val))

In [15]:
# Display the split sizes after moving validation rows into train.
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 9401
    })
    validation: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 300
    })
})

In [16]:
# Upload the Darija-only splits to the repository named by PUSH_DATA_TO,
# as a private repository.
dataset.push_to_hub(
    repo_id=PUSH_DATA_TO,
    commit_message="Grouped all moroccan arabic STT data only, made val smaller.",
    private=True,
)

Map: 100%|██████████| 1881/1881 [00:01<00:00, 1842.03 examples/s]s]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:00<00:00, 19.99ba/s]
Map: 100%|██████████| 1880/1880 [00:01<00:00, 1699.17 examples/s]22.24s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:01<00:00, 18.67ba/s]
Map: 100%|██████████| 1880/1880 [00:01<00:00, 1182.43 examples/s]17.85s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:02<00:00,  7.37ba/s]
Map: 100%|██████████| 1880/1880 [00:00<00:00, 5409.34 examples/s]22.43s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:00<00:00, 87.94ba/s]
Map: 100%|██████████| 1880/1880 [00:02<00:00, 705.41 examples/s] 16.24s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:02<00:00,  7.14ba/s]
Uploading the dataset shards: 100%|██████████| 5/5 [01:35<00:00, 19.15s/it]
Map: 100%|██████████| 300/300 [00:00<00:00, 1102.46 examples/s]t/s]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<

CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Morocco-Darija-ASR/commit/10042237e5fda64a0b4d8eff4e39d746619910b8', commit_message='Grouped all moroccan arabic STT data only, made val smaller.', commit_description='', oid='10042237e5fda64a0b4d8eff4e39d746619910b8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Morocco-Darija-ASR', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Morocco-Darija-ASR'), pr_revision=None, pr_num=None)