In [1]:
from datasets import (
    load_dataset,
    DatasetDict,
)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
EVAL_DATA_HUB = "UBC-NLP/Casablanca"
# dataset = load_dataset(EVAL_DATA_HUB, "Morocco", split='test')
dataset = load_dataset(EVAL_DATA_HUB, "Morocco")

In [6]:
dataset[0]

{'audio': {'path': '01 - Al Sir Laqdim - Ep 2 - السر القديم الحلقة_1152.3346875000002_1166.8978124999999_14500_1.wav',
  'array': array([0.00048828, 0.00064087, 0.00061035, ..., 0.00039673, 0.00091553,
         0.00100708]),
  'sampling_rate': 44100},
 'seg_id': '14500_1',
 'transcription': 'فراسك أماما كون مهدي بقا ساكن معانا فالدار كون شديت الباك شحال هادي من نهار مشا عند الجيلالي مابقيتش كانعرف نقرا حيت هو لي كان كايشرحليا كلشي',
 'gender': 'F',
 'duration': 7.207262946}

In [10]:
dataset

Dataset({
    features: ['audio', 'seg_id', 'transcription', 'gender', 'duration'],
    num_rows: 1045
})

In [12]:
dataset['validation'][0]

{'audio': {'path': '01 - Al Sir Laqdim - Ep 2 - السر القديم الحلقة_1054.2403125_1061.4290624999999_14498_1.wav',
  'array': array([-0.00105286, -0.00083923, -0.00073242, ...,  0.00296021,
          0.00283813,  0.00265503]),
  'sampling_rate': 44100},
 'seg_id': '14498_1',
 'transcription': 'واش من نيتك أمريم كانقوليك أو موان بقاي تعلمي مي سعاد قوليها راني غادا عند مهدي يعاوني في الخدمة',
 'gender': 'M',
 'duration': 5.623854258}

In [2]:
def compute_duration(example):
    """
    Compute the duration of an audio file in seconds.
    
    Parameters:
        example (dict): A dictionary containing the 'audio' column 
                        with raw waveform data and sample rate.
    
    Returns:
        dict: Updated example with 'duration' field in seconds.
    """
    waveform = example['audio']['array']
    sample_rate = example['audio']['sampling_rate']
    duration = len(waveform) / sample_rate
    example['duration'] = duration
    return example

def filter_short_audio(dataset, max_duration=4):
    """
    Filter dataset to keep only audio files with duration less than max_duration.
    
    Parameters:
        dataset (Dataset): A Hugging Face Dataset containing an 'duration' column.
        max_duration (float): Maximum allowed duration in seconds.
    
    Returns:
        Dataset: Filtered dataset with elements having duration < max_duration.
    """
    # Add duration column
    dataset = dataset.map(compute_duration)
    
    # Filter based on duration
    filtered_dataset = dataset.filter(lambda example: example['duration'] < max_duration)
    return filtered_dataset

# Play with dataset

In [3]:
eval_ds = load_dataset("BounharAbdelaziz/Morocco-Darija-ASR", split="validation")

In [4]:
eval_ds

Dataset({
    features: ['audio', 'transcription', 'language', 'dataset_source'],
    num_rows: 300
})

In [5]:
# Add duration column
dataset = eval_ds.map(compute_duration)

In [6]:
dataset

Dataset({
    features: ['audio', 'transcription', 'language', 'dataset_source', 'duration'],
    num_rows: 300
})

In [7]:
filtered_dataset = filter_short_audio(dataset)

In [8]:
new_dataset = DatasetDict({"validation": filtered_dataset})

In [9]:
new_dataset.push_to_hub("atlasia/Morocco-Youtube-Commons-Eval", commit_message="Kept audios < 4s as these have better transcriptions.")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/atlasia/Morocco-Youtube-Commons-Eval/commit/f84c4aad54927805d139b52117babdfc15818184', commit_message='Kept audios < 4s as these have better transcriptions.', commit_description='', oid='f84c4aad54927805d139b52117babdfc15818184', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/atlasia/Morocco-Youtube-Commons-Eval', endpoint='https://huggingface.co', repo_type='dataset', repo_id='atlasia/Morocco-Youtube-Commons-Eval'), pr_revision=None, pr_num=None)

# Fix my training set, remove interesection

In [1]:
from datasets import (
    load_dataset,
    DatasetDict,
    Dataset,
    concatenate_datasets,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
current_ds = load_dataset("BounharAbdelaziz/Morocco-Darija-ASR")

In [3]:
eval_set = load_dataset("atlasia/Moroccan-Darija-Youtube-Commons-Eval")

In [4]:
current_ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 9401
    })
    validation: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 300
    })
})

In [5]:
eval_set

DatasetDict({
    validation: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source', 'duration'],
        num_rows: 105
    })
})

In [6]:
new_ds = concatenate_datasets([current_ds['train'], current_ds['validation']])

In [7]:
new_ds = DatasetDict({"train": new_ds, "validation": eval_set['validation']})

In [8]:
new_ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 9701
    })
    validation: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source', 'duration'],
        num_rows: 105
    })
})

In [9]:
new_ds['validation'] = new_ds['validation'].remove_columns("duration")

### Now we remove duplicates and also items in train and val

In [15]:
def deduplicate_datasets(dataset_dict):
    """
    Efficiently remove duplicates within training set and elements that appear in validation set.
    
    Args:
        dataset_dict (DatasetDict): Input dataset containing 'train' and 'validation' splits
        
    Returns:
        DatasetDict: Cleaned dataset with no duplicates in train and no overlap with validation
    """
    # First remove duplicates within training set
    seen_transcriptions = set()
    
    def is_unique(example):
        if example['transcription'] in seen_transcriptions:
            return False
        seen_transcriptions.add(example['transcription'])
        return True
    
    # Remove duplicates from training set
    deduped_train = dataset_dict['train'].filter(
        is_unique,
        num_proc=1  # Must be 1 for stateful filtering
    )
    
    # Then remove validation overlaps
    val_transcriptions = set(dataset_dict['validation']['transcription'])
    
    clean_train = deduped_train.filter(
        lambda x: x['transcription'] not in val_transcriptions,
        num_proc= 16
    )
    
    return DatasetDict({
        'train': clean_train,
        'validation': dataset_dict['validation']
    })

In [16]:
cleaned_dataset = deduplicate_datasets(new_ds)

Filter: 100%|██████████| 9701/9701 [03:01<00:00, 53.52 examples/s]
Filter (num_proc=16): 100%|██████████| 7574/7574 [00:17<00:00, 430.32 examples/s]


In [17]:
# Print statistics
print(f"Original training samples: {len(new_ds['train'])}")
print(f"Cleaned training samples: {len(cleaned_dataset['train'])}")
print(f"Removed samples: {len(new_ds['train']) - len(cleaned_dataset['train'])}")

Original training samples: 9701
Cleaned training samples: 7469
Removed samples: 2232


In [18]:
cleaned_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 7469
    })
    validation: Dataset({
        features: ['audio', 'transcription', 'language', 'dataset_source'],
        num_rows: 105
    })
})

In [21]:
cleaned_dataset.push_to_hub("BounharAbdelaziz/Morocco-Darija-ASR-v1.2", commit_message="Deduplicated training set.", private=True)

Map: 100%|██████████| 1494/1494 [00:00<00:00, 1533.20 examples/s]s]
Creating parquet from Arrow format: 100%|██████████| 15/15 [00:01<00:00, 11.76ba/s]
Map: 100%|██████████| 1494/1494 [00:00<00:00, 1519.32 examples/s]13.90s/it]
Creating parquet from Arrow format: 100%|██████████| 15/15 [00:01<00:00, 10.79ba/s]
Map: 100%|██████████| 1494/1494 [00:01<00:00, 1277.31 examples/s]13.99s/it]
Creating parquet from Arrow format: 100%|██████████| 15/15 [00:01<00:00,  9.08ba/s]
Map: 100%|██████████| 1494/1494 [00:01<00:00, 761.23 examples/s] 13.89s/it]
Creating parquet from Arrow format: 100%|██████████| 15/15 [00:02<00:00,  5.61ba/s]
Map: 100%|██████████| 1493/1493 [00:00<00:00, 1658.73 examples/s]15.16s/it]
Creating parquet from Arrow format: 100%|██████████| 15/15 [00:01<00:00, 12.38ba/s]
Uploading the dataset shards: 100%|██████████| 5/5 [01:09<00:00, 13.89s/it]
Map: 100%|██████████| 105/105 [00:00<00:00, 12702.31 examples/s]/s]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<

CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Morocco-Darija-ASR-v1.2/commit/dab32da41ef60fe7b1df28cfbc3724a55dbc9c57', commit_message='Deduplicated training set.', commit_description='', oid='dab32da41ef60fe7b1df28cfbc3724a55dbc9c57', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Morocco-Darija-ASR-v1.2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Morocco-Darija-ASR-v1.2'), pr_revision=None, pr_num=None)