In [1]:
import os
os.environ["TORCH_USE_CUDA_DSA"] = "1" # Enable CUDA DSA
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BASE_DATA_PATH = "BounharAbdelaziz/Terjman-v2-English-Darija-Dataset-580K"
NEW_DODA_DATA_PATH = "atlasia/DODa-audio-dataset-V3"

# Base

In [3]:
base = load_dataset(BASE_DATA_PATH, split='train')

In [4]:
base = base.to_pandas()
base = base[base['dataset_source'] != 'doda']
base['darija_Latn'] = ''
base.columns = ['english', 'darija_Arab', 'dataset_source', 'id', 'role', 'darija_Latn']
base = base[['english', 'darija_Arab', 'darija_Latn', 'dataset_source', 'id', 'role']]

In [5]:
base.head()

Unnamed: 0,english,darija_Arab,darija_Latn,dataset_source,id,role
0,More than half of the Makhzen's expenditures w...,التقريب أكثر من نص مصاريف المخزن كانت كتمشي لب...,,medmac01/moroccan_history_qa,,
1,What were the consequences of the Makhzen's ex...,ماهي عواقب مصاريف المخزن على الاقتصاد المغربي؟,,medmac01/moroccan_history_qa,,
2,The Makhzen's expenditures led to a deteriorat...,نفقات المخزن أدت إلى تدهور الاقتصاد المغربي، ح...,,medmac01/moroccan_history_qa,,
3,"In the 1890s, the French administration and mi...",في تسعينيات القرن التاسع عشر، طالبت الإدارة وا...,,medmac01/moroccan_history_qa,,
4,What were the main reasons behind the French a...,ما كانو الأسباب الرئيسية اللي دفعو فرنسا باش ت...,,medmac01/moroccan_history_qa,,


# Doda

In [6]:
doda = load_dataset(NEW_DODA_DATA_PATH, split='train')

In [7]:
doda = doda.remove_columns(['audio', 'darija_Arab_old'])
doda = doda.to_pandas()
doda.columns = ['darija_Latn', 'darija_Arab', 'english']
doda = doda[['english', 'darija_Arab', 'darija_Latn']]
doda['dataset_source'] = NEW_DODA_DATA_PATH
doda['id'] = ''
doda['role'] = ''

In [8]:
doda.head()

Unnamed: 0,english,darija_Arab,darija_Latn,dataset_source,id,role
0,"They're hiding something, I'm sure!",هوما مخبيين شي حاجة انا متيقن,"homa mkhbbyin chi haja, ana mti99en!",atlasia/DODa-audio-dataset-V3,,
1,It's obvious they're trying to keep their cool.,باينة هوما كيحاولو يبقاو مبردين,bayna homa tay7awlo ib9aw mbrrdin.,atlasia/DODa-audio-dataset-V3,,
2,the hotels don't seem very comfortable.,لوطيلات مبيناش فيهم مريحين بزاف,loTilat mabaynach fihom mori7in bzzaf.,atlasia/DODa-audio-dataset-V3,,
3,he is probably about to be laid off by head of...,غالبا غيجريو عليه من الخدمة,ghaliban ghayjrriw 3lih mn lkhdma!,atlasia/DODa-audio-dataset-V3,,
4,of course he's depressive!,طبعا راه مكتئب,Tab3an rah mkta2eb!,atlasia/DODa-audio-dataset-V3,,


# Concat

In [9]:
import pandas as pd

In [49]:
concatenated = pd.concat([base, doda])

In [50]:
concatenated.head()

Unnamed: 0,english,darija_Arab,darija_Latn,dataset_source,id,role
0,More than half of the Makhzen's expenditures w...,التقريب أكثر من نص مصاريف المخزن كانت كتمشي لب...,,medmac01/moroccan_history_qa,,
1,What were the consequences of the Makhzen's ex...,ماهي عواقب مصاريف المخزن على الاقتصاد المغربي؟,,medmac01/moroccan_history_qa,,
2,The Makhzen's expenditures led to a deteriorat...,نفقات المخزن أدت إلى تدهور الاقتصاد المغربي، ح...,,medmac01/moroccan_history_qa,,
3,"In the 1890s, the French administration and mi...",في تسعينيات القرن التاسع عشر، طالبت الإدارة وا...,,medmac01/moroccan_history_qa,,
4,What were the main reasons behind the French a...,ما كانو الأسباب الرئيسية اللي دفعو فرنسا باش ت...,,medmac01/moroccan_history_qa,,


In [53]:
concatenated.isna().sum()

english                0
darija_Arab            0
darija_Latn            0
dataset_source         0
id                213686
role              213686
dtype: int64

In [56]:
concatenated = concatenated.dropna(subset=['english', 'darija_Arab'])
concatenated = concatenated.drop_duplicates(subset=['english', 'darija_Arab'])

# TerjamaBench

In [12]:
bench = load_dataset("atlasia/TerjamaBench", split='test').to_pandas()

In [13]:
bench.head()

Unnamed: 0,topic,subtopic,Arabizi,English,Darija,annotator_dialect
0,dialect_variation,marrakech,lays3d lmasa,good evening,الله يسعد الماسا,Marrakech
1,dialect_variation,marrakech,lays3d saba7,good morning,الله يسعد الصباح,Marrakech
2,dialect_variation,marrakech,bit nmchi ndrb chi 9siyes flmdina,I’m heading to the old medina to eat something,بيت نمشي نضرب شي قسيس فالمدينة,Marrakech
3,dialect_variation,marrakech,aji lhad jih,come here,أجي لهاد جيه,Marrakech
4,dialect_variation,marrakech,achawa had ti9i9t lioma,It’s scorching hot today!,أشاوا هاد تيقيقت ليوما,Marrakech


In [14]:
bench.columns = ['topic', 'subtopic', 'darija_Latn', 'english', 'darija_Arab', 'annotator_dialect']

Index(['topic', 'subtopic', 'Arabizi', 'English', 'Darija',
       'annotator_dialect'],
      dtype='object')

In [16]:
bench.head()

Unnamed: 0,topic,subtopic,darija_Latn,english,darija_Arab,annotator_dialect
0,dialect_variation,marrakech,lays3d lmasa,good evening,الله يسعد الماسا,Marrakech
1,dialect_variation,marrakech,lays3d saba7,good morning,الله يسعد الصباح,Marrakech
2,dialect_variation,marrakech,bit nmchi ndrb chi 9siyes flmdina,I’m heading to the old medina to eat something,بيت نمشي نضرب شي قسيس فالمدينة,Marrakech
3,dialect_variation,marrakech,aji lhad jih,come here,أجي لهاد جيه,Marrakech
4,dialect_variation,marrakech,achawa had ti9i9t lioma,It’s scorching hot today!,أشاوا هاد تيقيقت ليوما,Marrakech


# Create train and test splits

In [17]:
from datasets import Dataset, DatasetDict

In [57]:
dataset = DatasetDict({
    "train": Dataset.from_pandas(concatenated),
    "test": Dataset.from_pandas(bench),
})

In [58]:
dataset

DatasetDict({
    train: Dataset({
        features: ['english', 'darija_Arab', 'darija_Latn', 'dataset_source', 'id', 'role', '__index_level_0__'],
        num_rows: 577974
    })
    test: Dataset({
        features: ['topic', 'subtopic', 'darija_Latn', 'english', 'darija_Arab', 'annotator_dialect'],
        num_rows: 850
    })
})

In [62]:
dataset['train'] = dataset['train'].remove_columns(['__index_level_0__'])

## Token count

In [19]:
from transformers import AutoTokenizer

In [20]:
TOKENIZER_PATH = "facebook/nllb-200-3.3B"

In [39]:
def count_tokens(text, tokenizer):
    tokens = tokenizer(text)
    num_tokens = [len(tokens) for tokens in tokens['input_ids']]
    return {'darija_tokens': num_tokens}

In [21]:
# we use NLLB-3.3B tokenizer
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [23]:
count_tokens("hello world", tokenizer)

4

In [59]:
dataset['train'] = dataset['train'].map(lambda row: count_tokens(row['darija_Arab'], tokenizer), batched=True)

Map: 100%|██████████| 577974/577974 [00:36<00:00, 15948.77 examples/s]


In [60]:
dataset['test'] = dataset['test'].map(lambda row: count_tokens(row['darija_Arab'], tokenizer), batched=True)

Map: 100%|██████████| 850/850 [00:00<00:00, 54744.16 examples/s]


In [63]:
dataset

DatasetDict({
    train: Dataset({
        features: ['english', 'darija_Arab', 'darija_Latn', 'dataset_source', 'id', 'role', 'darija_tokens'],
        num_rows: 577974
    })
    test: Dataset({
        features: ['topic', 'subtopic', 'darija_Latn', 'english', 'darija_Arab', 'annotator_dialect', 'darija_tokens'],
        num_rows: 850
    })
})

In [64]:
total_train_tokens = sum(dataset['train']['darija_tokens'])
total_test_tokens = sum(dataset['test']['darija_tokens'])

In [67]:
print(f'Terjman-v2 total train tokens: {total_train_tokens}')

Terjman-v2 total train tokens: 77911974


In [75]:
HUB_DATA_PATH = "BounharAbdelaziz/Terjman-v2-English-Darija-Dataset-580K"

In [69]:
dataset

DatasetDict({
    train: Dataset({
        features: ['english', 'darija_Arab', 'darija_Latn', 'dataset_source', 'id', 'role', 'darija_tokens'],
        num_rows: 577974
    })
    test: Dataset({
        features: ['topic', 'subtopic', 'darija_Latn', 'english', 'darija_Arab', 'annotator_dialect', 'darija_tokens'],
        num_rows: 850
    })
})

## Align the two datasets

In [76]:
# Get all column names from both train and test
all_columns = set(dataset["train"].column_names) | set(dataset["test"].column_names)

# Function to add missing columns with default values
def align_columns(ds, all_columns):
    for col in all_columns:
        if col not in ds.column_names:
            ds = ds.add_column(col, [""] * len(ds))  # Fill with empty strings (one can use None)
    return ds

# Apply to both train and test
dataset["train"] = align_columns(dataset["train"], all_columns)
dataset["test"] = align_columns(dataset["test"], all_columns)

# Now push to hub
dataset.push_to_hub(HUB_DATA_PATH, commit_message="Added TerjamaBench, removed NaNs, counted tokens, aligned features", private=True)

Creating parquet from Arrow format: 100%|██████████| 289/289 [00:00<00:00, 338.06ba/s]
Creating parquet from Arrow format: 100%|██████████| 289/289 [00:01<00:00, 174.74ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [00:11<00:00,  5.88s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 737.27ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.55it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Terjman-v2-English-Darija-Dataset-580K/commit/69785d61869579790360729aa205a2454fc34fdc', commit_message='Added TerjamaBench, removed NaNs, counted tokens, aligned features', commit_description='', oid='69785d61869579790360729aa205a2454fc34fdc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Terjman-v2-English-Darija-Dataset-580K', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Terjman-v2-English-Darija-Dataset-580K'), pr_revision=None, pr_num=None)

In [77]:
dataset

DatasetDict({
    train: Dataset({
        features: ['english', 'darija_Arab', 'darija_Latn', 'dataset_source', 'id', 'role', 'darija_tokens', 'subtopic', 'topic', 'annotator_dialect'],
        num_rows: 577974
    })
    test: Dataset({
        features: ['topic', 'subtopic', 'darija_Latn', 'english', 'darija_Arab', 'annotator_dialect', 'darija_tokens', 'dataset_source', 'role', 'id'],
        num_rows: 850
    })
})