In [1]:
from datasets import load_dataset
import string
import pandas as pd
from tqdm import tqdm
from sacrebleu.metrics import BLEU, CHRF, TER
from transformers import pipeline
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the sacreBLEU metric objects once. evaluate() reads these
# module-level names when it scores each sentence.
# effective_order=True is the setting sacreBLEU recommends for sentence-level
# BLEU: n-gram orders without any match are left out of the score.
metric_bleu = BLEU(effective_order=True)
# chrF and TER are created with the library's default arguments.
metric_chrf = CHRF()
metric_ter = TER()

In [3]:
def process_txt(text, punctuation=string.punctuation):
    """Delete punctuation characters from ``text`` and trim outer whitespace.

    Args:
        text: The string to normalise before scoring.
        punctuation: Every character in this string is removed from ``text``.
            Defaults to ``string.punctuation``, which holds ASCII punctuation
            only. Arabic marks such as "،" or "؟" and typographic quotes are
            therefore kept unless they are passed in explicitly.

    Returns:
        ``text`` without the listed characters, with leading and trailing
        whitespace stripped. Whitespace inside the string is left as is.
    """
    # Three-argument maketrans: the third argument lists characters to delete.
    deletion_table = str.maketrans('', '', punctuation)
    return text.translate(deletion_table).strip()

def evaluate(df, MODELS) -> pd.DataFrame:
    """Translate the benchmark with each model and score every sentence.

    For each entry of ``MODELS`` a ``"translation"`` pipeline is built on
    ``"cuda"`` and called once per row on the ``English`` text. The output and
    the ``Darija`` reference are both passed through ``process_txt`` and then
    scored with the module-level ``metric_bleu``, ``metric_chrf`` and
    ``metric_ter`` objects.

    Four columns are written per model, suffixed with the part of the model
    path after the last ``/``: ``translated_<name>`` (raw pipeline output),
    ``BLEU_<name>``, ``CHRF_<name>`` and ``TER_<name>``.

    The summary printed for each model is the mean of the sentence-level
    scores of that model, not a corpus-level score.

    Args:
        df: DataFrame with ``English`` and ``Darija`` columns. It is modified
            in place: the result columns are added to this very object.
        MODELS: Iterable of model ids/paths. Entries containing ``"Ultra"``
            get explicit ``src_lang="eng_Latn"`` / ``tgt_lang="ary_Arab"``.

    Returns:
        The same ``df`` object that was passed in, with the added columns.
    """
    for model_path in MODELS:
        # Resolve the column suffix once per model. Inlining this expression
        # in a single-quoted f-string (quote reuse) only parses on
        # Python >= 3.12.
        model_name = model_path.split('/')[-1]

        pipe_kwargs = {"model": model_path, "max_length": 512, "device": "cuda"}
        if 'Ultra' in model_path:
            pipe_kwargs.update(src_lang="eng_Latn", tgt_lang="ary_Arab")
        pipe = pipeline("translation", **pipe_kwargs)

        # Collect per-row results in lists and attach them as whole columns
        # afterwards, instead of growing the frame one cell at a time.
        translations = []
        bleu_scores = []
        chrf_scores = []
        ter_scores = []

        rows = zip(df['English'], df['Darija'])
        for english_text, reference in tqdm(rows, total=len(df)):
            translation = pipe(english_text)[0]["translation_text"]

            refs = [process_txt(reference)]
            hyp = process_txt(translation)

            translations.append(translation)
            bleu_scores.append(metric_bleu.sentence_score(references=refs, hypothesis=hyp).score)
            chrf_scores.append(metric_chrf.sentence_score(references=refs, hypothesis=hyp).score)
            ter_scores.append(metric_ter.sentence_score(references=refs, hypothesis=hyp).score)

        df[f'translated_{model_name}'] = translations
        # Explicit float dtype keeps the means well-defined for an empty frame.
        df[f'BLEU_{model_name}'] = pd.Series(bleu_scores, index=df.index, dtype=float)
        df[f'CHRF_{model_name}'] = pd.Series(chrf_scores, index=df.index, dtype=float)
        df[f'TER_{model_name}'] = pd.Series(ter_scores, index=df.index, dtype=float)

        avg_bleu_score = df[f'BLEU_{model_name}'].mean()
        avg_chrf_score = df[f'CHRF_{model_name}'].mean()
        avg_ter_score = df[f'TER_{model_name}'].mean()

        print(f'[INFO] For model: {model_path}')
        print(f'bleu_score: {avg_bleu_score}')
        print(f'chrf_score: {avg_chrf_score}')
        print(f'ter_score: {avg_ter_score}')
        print('-'*10)

        # Drop the reference so this model is not kept alive while the next
        # one is being loaded.
        del pipe

    return df

In [4]:
# Load the `test` split of the TerjamaBench dataset and convert it to a pandas
# DataFrame, so that per-model results can be added to it as extra columns.
bench = load_dataset("atlasia/TerjamaBench", split='test').to_pandas()

In [5]:
# Preview the benchmark. evaluate() reads `English` as the source text and
# `Darija` as the reference translation.
bench.head()

Unnamed: 0,topic,subtopic,Arabizi,English,Darija,annotator_dialect
0,dialect_variation,marrakech,lays3d lmasa,good evening,الله يسعد الماسا,Marrakech
1,dialect_variation,marrakech,lays3d saba7,good morning,الله يسعد الصباح,Marrakech
2,dialect_variation,marrakech,bit nmchi ndrb chi 9siyes flmdina,I’m heading to the old medina to eat something,بيت نمشي نضرب شي قسيس فالمدينة,Marrakech
3,dialect_variation,marrakech,aji lhad jih,come here,أجي لهاد جيه,Marrakech
4,dialect_variation,marrakech,achawa had ti9i9t lioma,It’s scorching hot today!,أشاوا هاد تيقيقت ليوما,Marrakech


In [6]:
# Model ids to benchmark. evaluate() names the result columns after the part
# following the last "/", and passes explicit src_lang/tgt_lang only for ids
# that contain "Ultra".
MODELS = [
    "BounharAbdelaziz/Terjman-Ultra-v2.2",
    "BounharAbdelaziz/Terjman-Large-v2.2",
    "BounharAbdelaziz/Terjman-Nano-v2.2",
]

In [7]:
# Run every model over the benchmark; each one is loaded onto the GPU in turn.
# evaluate() adds its columns to `bench` itself and returns that same object,
# so `results_df` and `bench` refer to one DataFrame after this cell.
results_df = evaluate(bench, MODELS)

Device set to use cuda
  1%|          | 10/850 [00:01<02:25,  5.75it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 850/850 [02:34<00:00,  5.50it/s]


[INFO] For model: BounharAbdelaziz/Terjman-Ultra-v2.2
bleu_score: 25.17934653659352
chrf_score: 45.163477645933845
ter_score: 77.19670121141593
----------


Device set to use cuda
100%|██████████| 850/850 [02:47<00:00,  5.07it/s]


[INFO] For model: BounharAbdelaziz/Terjman-Large-v2.2
bleu_score: 20.577393481575847
chrf_score: 41.9021347455449
ter_score: 81.81540525237182
----------


Device set to use cuda
100%|██████████| 850/850 [01:07<00:00, 12.65it/s]

[INFO] For model: BounharAbdelaziz/Terjman-Nano-v2.2
bleu_score: 18.77464106466772
chrf_score: 38.41039387371171
ter_score: 100.73932218279197
----------





In [8]:
# Inspect the per-sentence translations and scores added for each model.
results_df.head()

Unnamed: 0,topic,subtopic,Arabizi,English,Darija,annotator_dialect,translated_Terjman-Ultra-v2.2,BLEU_Terjman-Ultra-v2.2,CHRF_Terjman-Ultra-v2.2,TER_Terjman-Ultra-v2.2,translated_Terjman-Large-v2.2,BLEU_Terjman-Large-v2.2,CHRF_Terjman-Large-v2.2,TER_Terjman-Large-v2.2,translated_Terjman-Nano-v2.2,BLEU_Terjman-Nano-v2.2,CHRF_Terjman-Nano-v2.2,TER_Terjman-Nano-v2.2
0,dialect_variation,marrakech,lays3d lmasa,good evening,الله يسعد الماسا,Marrakech,مساء الخير,0.0,10.470546,100.0,مساء الخير,0.0,10.470546,100.0,مساء الخير,0.0,10.470546,100.0
1,dialect_variation,marrakech,lays3d saba7,good morning,الله يسعد الصباح,Marrakech,صباح الخير,0.0,19.23267,100.0,صباح الخير,0.0,19.23267,100.0,صباح الخير,0.0,19.23267,100.0
2,dialect_variation,marrakech,bit nmchi ndrb chi 9siyes flmdina,I’m heading to the old medina to eat something,بيت نمشي نضرب شي قسيس فالمدينة,Marrakech,غاديا للمدينة القديمة باش ناكل شي حاجة,6.567275,21.830982,116.666667,انا غادي للمدينة القديمة باش ناكل شي حاجة,5.522398,22.110967,133.333333,أنا غادي للمدينة القديمة باش ناكل شي حاجة,5.522398,22.110967,133.333333
3,dialect_variation,marrakech,aji lhad jih,come here,أجي لهاد جيه,Marrakech,اجي لهنا,0.0,21.938286,100.0,أجي لهنا,30.326533,33.66846,66.666667,تعال هنا,0.0,7.303708,100.0
4,dialect_variation,marrakech,achawa had ti9i9t lioma,It’s scorching hot today!,أشاوا هاد تيقيقت ليوما,Marrakech,سخون بزاف اليوم,0.0,12.767666,100.0,راه سخون بزاف اليوم!,0.0,15.09862,100.0,الجو حار بزاف اليوم!,0.0,14.140764,100.0


In [3]:
# Load the multilingual Darija translation dataset with all of its splits.
# NOTE(review): the execution counter restarts at this cell (In [3]), so this
# section looks like it came from a separate kernel session. Confirm the
# notebook still runs top to bottom after Restart & Run All.
dataset = load_dataset("BounharAbdelaziz/Darija-Translation-Dataset-22K-all-13-lang")

Downloading data: 100%|██████████| 257M/257M [00:05<00:00, 47.3MB/s] 
Downloading data: 100%|██████████| 4.44M/4.44M [00:00<00:00, 9.67MB/s]
Generating train split: 100%|██████████| 22134/22134 [00:01<00:00, 12849.31 examples/s]


In [4]:
# Show the splits, column names and row count before trimming columns.
dataset

DatasetDict({
    train: Dataset({
        features: ['english', 'ary_Arab', 'ary_Latn', 'ary_tokens', 'dataset_source', 'arabic', 'french', 'german', 'spanish', 'russian', 'chinese_traditional', 'japanese', 'korean', 'greek', 'italian', 'turkish', 'wolof', 'hindi'],
        num_rows: 22134
    })
})

In [5]:
# Keep only the working columns. Compared with the loaded dataset this drops
# `wolof` and lists the `ary_tokens` / `dataset_source` metadata columns last.
dataset = dataset.select_columns(['english', 'ary_Arab', 'ary_Latn', 'arabic', 'french', 'german', 'spanish', 'russian', 'chinese_traditional', 'japanese', 'korean', 'greek', 'italian', 'turkish', 'hindi', 'ary_tokens', 'dataset_source', ])

In [6]:
# Confirm the selection: check the remaining columns and the row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['english', 'ary_Arab', 'ary_Latn', 'arabic', 'french', 'german', 'spanish', 'russian', 'chinese_traditional', 'japanese', 'korean', 'greek', 'italian', 'turkish', 'hindi', 'ary_tokens', 'dataset_source'],
        num_rows: 22134
    })
})

In [7]:
# Publish the column-trimmed dataset to a private Hub repository.
# NOTE: this uploads every time the cell is executed, and it runs before the
# empty-`ary_Latn` filter further down, so what is pushed is the unfiltered
# `dataset`, not `ds`.
dataset.push_to_hub("BounharAbdelaziz/Morocco-Darija-Translation-Dataset-22K-13-lang", private=True, commit_message="Kept working columns")

Creating parquet from Arrow format: 100%|██████████| 12/12 [00:02<00:00,  5.06ba/s]
Creating parquet from Arrow format: 100%|██████████| 12/12 [00:00<00:00, 299.86ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [00:16<00:00,  8.36s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/BounharAbdelaziz/Morocco-Darija-Translation-Dataset-22K-13-lang/commit/11e4e00e86848845177352788e7d14700f918072', commit_message='Kept working columns', commit_description='', oid='11e4e00e86848845177352788e7d14700f918072', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/BounharAbdelaziz/Morocco-Darija-Translation-Dataset-22K-13-lang', endpoint='https://huggingface.co', repo_type='dataset', repo_id='BounharAbdelaziz/Morocco-Darija-Translation-Dataset-22K-13-lang'), pr_revision=None, pr_num=None)

In [8]:
# Keep only the examples whose `ary_Latn` field is not an empty string.
# NOTE(review): the test is `!= ''`, so a missing value stored as None would
# still pass this filter. Confirm the column never contains None.
ds = dataset.filter(lambda x: x['ary_Latn'] != '')

Filter: 100%|██████████| 22134/22134 [00:01<00:00, 15464.49 examples/s]


In [13]:
# Spot-check one filtered training example across all of its columns.
ds['train'][100]

{'english': 'Do you think I should take a vacation?',
 'ary_Arab': 'كيبان ليا خاصني نشد عطلة',
 'ary_Latn': 'tayban lik khess nchedd 3oTla?',
 'arabic': 'هل تعتقد أنني يجب أن تأخذ إجازة؟',
 'french': 'Pensez-vous que je devrais prendre des vacances?',
 'german': 'Glaubst du, ich sollte Urlaub machen?',
 'spanish': '¿Crees que debería tomarme unas vacaciones?',
 'russian': 'Как вы думаете, я должен взять отпуск?',
 'chinese_traditional': '您认为我应该度假吗？',
 'japanese': '休暇をとるべきだと思いますか？',
 'korean': '내가 휴가를 가져야한다고 생각하십니까?',
 'greek': 'Πιστεύετε ότι πρέπει να κάνω διακοπές;',
 'italian': 'Pensi che dovrei fare una vacanza?',
 'turkish': 'Sence tatile çıkmalıyım?',
 'hindi': 'क्या आपको लगता है कि मुझे छुट्टी लेनी चाहिए?',
 'ary_tokens': 12,
 'dataset_source': 'atlasia/DODa-audio-dataset-V3'}