In [62]:
import os
import random

import nltk
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import sentence_bleu
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# Mount Google Drive at /content/drive (Colab-only API); the dataset path and
# the output folder used below both live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the input spreadsheet on the mounted Drive; read by pd.read_excel below.
# NOTE: the translation-loop cell later rebinds `file_path` to its CSV output
# path, so re-run this cell before re-running the read_excel cell.
file_path = '/content/drive/My Drive/dataset/ArzEn-MultiGenre_cleaned_data.xlsx'

In [4]:
!pip install transformers



In [5]:
import pandas as pd
# Read the spreadsheet at `file_path` into a DataFrame; later cells access its
# 'egyption_Text' and 'english_Text' columns.
data = pd.read_excel(file_path)

In [None]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,egyption_Text,english_Text,category,sub_category
0,0,الأمير الصغير,The little prince,Novels,the-little-prince
1,1,أنطوان دي سانت إكزوبيري,Antoine De Saint-Exuper,Novels,the-little-prince
2,2,الفصل الأول,Chapter 1,Novels,the-little-prince
3,3,في مره، لما كان عندي ست سنين، شفت صوره روعه، ف...,Once when I was six years old I saw a magnific...,Novels,the-little-prince
4,4,وكانت عباره عن تعبان من نوع البوا بيبلع فهد.,It was a picture of a boa constrictor in the a...,Novels,the-little-prince


In [6]:
# Hugging Face Hub checkpoint identifiers, named once so the tokenizer and the
# model of each stage are guaranteed to come from the same checkpoint.
DIALECT_CHECKPOINT = "PRAli22/arat5-arabic-dialects-translation"
AR_TO_EN_CHECKPOINT = "Helsinki-NLP/opus-mt-ar-en"

# AraT5 tokenizer and model (Arabic dialect translation)
dialect_tokenizer = AutoTokenizer.from_pretrained(DIALECT_CHECKPOINT)
dialect_model = AutoModelForSeq2SeqLM.from_pretrained(DIALECT_CHECKPOINT)

# Helsinki tokenizer and model (Arabic to English translation)
ar_to_en_tokenizer = AutoTokenizer.from_pretrained(AR_TO_EN_CHECKPOINT)
ar_to_en_model = AutoModelForSeq2SeqLM.from_pretrained(AR_TO_EN_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/21.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/875 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
def translate_dialect_to_english(dialect_text):
    """Translate dialect text to English through a two-stage pipeline.

    Stage 1 runs ``dialect_text`` through ``dialect_tokenizer`` and
    ``dialect_model`` and decodes the first generated sequence (dialect to
    standard Arabic). Stage 2 runs that intermediate text through
    ``ar_to_en_tokenizer`` and ``ar_to_en_model`` (standard Arabic to English).

    Args:
        dialect_text: Text handed to ``dialect_tokenizer``.

    Returns:
        The first sequence generated by ``ar_to_en_model``, decoded with
        special tokens skipped.

    NOTE(review): a later cell defines another function under this same name;
    whichever of the two cells ran last is the definition in effect.
    """
    # Stage 1: dialect -> standard Arabic.
    dialect_ids = dialect_tokenizer(dialect_text, return_tensors="pt").input_ids
    msa_sequence = dialect_model.generate(dialect_ids)[0]
    msa_text = dialect_tokenizer.decode(msa_sequence, skip_special_tokens=True)

    # Stage 2: standard Arabic -> English.
    english_sequence = ar_to_en_model.generate(
        ar_to_en_tokenizer(msa_text, return_tensors="pt").input_ids
    )[0]
    return ar_to_en_tokenizer.decode(english_sequence, skip_special_tokens=True)

In [59]:
def translate_to_english_helsinki(ar_text):
    # tokenize input (dialect text)
    input_ids = dialect_tokenizer(ar_text, return_tensors="pt").input_ids
    # translate dialect to English using Helsinki model
    input_ids = ar_to_en_tokenizer(ar_text, return_tensors="pt").input_ids
    outputs = ar_to_en_model.generate(input_ids)
    english_translation = ar_to_en_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return english_translation

In [8]:
def semantic_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the score
    reflects the tokens they share.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity of the two TF-IDF rows, or ``0.0`` when the
        vectorizer cannot extract a single token from either text.

    Raises:
        ValueError: Any vectorizer error other than the empty-vocabulary case.
    """
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError as exc:
        # scikit-learn signals "no tokens at all" (empty strings, punctuation
        # only, ...) with this message; treat it as no overlap, not a crash.
        if "empty vocabulary" not in str(exc):
            raise
        return 0.0
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

In [60]:
def translate_dialect_to_standard_arabic(dialect_text):
    """Convert dialect text to standard Arabic with the dialect model.

    Args:
        dialect_text: Text handed to ``dialect_tokenizer``.

    Returns:
        The first sequence generated by ``dialect_model``, decoded with
        special tokens skipped.
    """
    encoded = dialect_tokenizer(dialect_text, return_tensors="pt")
    # generate() returns a batch; only the first sequence is decoded.
    first_sequence = dialect_model.generate(encoded.input_ids)[0]
    return dialect_tokenizer.decode(first_sequence, skip_special_tokens=True)

In [57]:
# Drive folder that receives the translated-text CSV. The trailing slash
# matters: the file name is appended to this string by plain concatenation.
ar_translated_dir = '/content/drive/My Drive/ar_translated_texts/'

In [None]:
chunk_size = 700
total_rows = len(data)
num_chunks = total_rows // chunk_size + (1 if total_rows % chunk_size != 0 else 0)
translated_data = []

# Create the output folder up front so a missing directory fails immediately
# instead of after every row has already been translated.
os.makedirs(ar_translated_dir, exist_ok=True)

# iterate over the chunks; rows are accumulated across all chunks and written
# to a single CSV once the loop has finished
for chunk_index in range(num_chunks):
    start_index = chunk_index * chunk_size
    end_index = min((chunk_index + 1) * chunk_size, total_rows)
    chunk_data = data.iloc[start_index:end_index]
    for index, row in chunk_data.iterrows():
        dialect_text = str(row['egyption_Text'])
        # str() mirrors the handling of the Egyptian column: a non-string cell
        # (empty/NaN or numeric) has no .split() and would abort the whole run.
        reference_tokens = [str(row['english_Text']).split()]

        # path A: dialect -> standard Arabic (AraT5) -> English (Helsinki)
        standard_arabic_text = translate_dialect_to_standard_arabic(dialect_text)
        helsinki_translation = translate_to_english_helsinki(standard_arabic_text)
        bleu_helsinki = sentence_bleu(reference_tokens, helsinki_translation.split())

        # path B: dialect -> English directly with the Helsinki model
        helsinki_direct_translation = translate_to_english_helsinki(dialect_text)
        helsinki_bleu_egyptian = sentence_bleu(reference_tokens, helsinki_direct_translation.split())

        # append the data to the list
        translated_data.append({
            'egyption_Text': dialect_text,
            'standard_arabic_Text': standard_arabic_text,
            'english_Text': row['english_Text'],
            'bleu_score_helsinki': bleu_helsinki,
            'bleu_score_helsinki_egyptian': helsinki_bleu_egyptian
        })

# DataFrame from the collected data
translated_df = pd.DataFrame(translated_data)
# NOTE(review): this rebinds `file_path`, which earlier pointed at the input
# spreadsheet; kept as-is in case later cells read it, but a distinct name
# would keep the read_excel cell re-runnable.
file_path = ar_translated_dir + 'translated_texts_all_chunks.csv'
translated_df.to_csv(file_path, index=False, encoding='utf-8')

In [9]:
# function to translate dialect to English using AraT5
def translate_dialect_to_english_arat5(dialect_text):
    """Translate dialect text to English via standard Arabic.

    Composes the two single-stage helpers defined in this notebook instead of
    repeating their tokenize/generate/decode code: the dialect model first
    produces standard Arabic, which the Helsinki model then turns into English.

    Args:
        dialect_text: Dialect text to translate.

    Returns:
        The English translation returned by ``translate_to_english_helsinki``.
    """
    standard_arabic_text = translate_dialect_to_standard_arabic(dialect_text)
    return translate_to_english_helsinki(standard_arabic_text)

In [None]:
def translate_dialect_to_english(dialect_text):
    """Translate dialect text to English, with a similarity-gated fallback.

    The text goes through the dialect model and then the Arabic-to-English
    model. The English result is scored against the original ``dialect_text``
    with ``semantic_similarity``; a score under
    ``semantic_similarity_threshold`` hands the input to
    ``translate_dialect_to_english_arat5`` instead.

    Args:
        dialect_text: Dialect text to translate.

    Returns:
        The pipeline's English translation, or the result of
        ``translate_dialect_to_english_arat5`` when the score is below the
        threshold.

    NOTE(review): the score compares English output with Arabic input;
    presumably the two share few tokens, so the fallback may fire on most
    inputs -- verify.
    NOTE(review): the fallback appears to run the same two models in the same
    order as this function -- confirm it can return anything different.
    """
    # Stage 1: dialect -> standard Arabic.
    msa_batch = dialect_model.generate(
        dialect_tokenizer(dialect_text, return_tensors="pt").input_ids
    )
    msa_text = dialect_tokenizer.decode(msa_batch[0], skip_special_tokens=True)

    # Stage 2: standard Arabic -> English.
    english_batch = ar_to_en_model.generate(
        ar_to_en_tokenizer(msa_text, return_tensors="pt").input_ids
    )
    candidate = ar_to_en_tokenizer.decode(english_batch[0], skip_special_tokens=True)

    # Guard clause: a low score sends the input to the fallback translator.
    if semantic_similarity(candidate, dialect_text) < semantic_similarity_threshold:
        return translate_dialect_to_english_arat5(dialect_text)
    return candidate

In [18]:
# Scores below this value make the translation functions fall back to their
# alternative translator.
semantic_similarity_threshold = 0.75


# calculate semantic similarity
def semantic_similarity(text1, text2):
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just these two texts, so the score
    reflects the tokens they share.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity of the two TF-IDF rows, or ``0.0`` when the
        vectorizer cannot extract a single token from either text.

    Raises:
        ValueError: Any vectorizer error other than the empty-vocabulary case.
    """
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError as exc:
        # scikit-learn signals "no tokens at all" (empty strings, punctuation
        # only, ...) with this message; treat it as no overlap, not a crash.
        if "empty vocabulary" not in str(exc):
            raise
        return 0.0
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

In [43]:
def translate_dialect_to_english_unseen(dialect_text):
    """Translate dialect text with Helsinki, falling back when the score is low.

    The direct Helsinki translation is scored against ``dialect_text`` with
    ``semantic_similarity``. A score below ``semantic_similarity_threshold``
    is treated as a poor translation and the input is re-translated with
    ``translate_dialect_to_english``; otherwise the Helsinki output is kept.
    The comparison previously used ``>``, which contradicted both its own
    comment and the same check in ``translate_dialect_to_english``.

    Args:
        dialect_text: Dialect text to translate.

    Returns:
        The Helsinki translation, or the ``translate_dialect_to_english``
        result when the similarity score is below the threshold.

    NOTE(review): the score compares English output with Arabic input;
    presumably the two share few tokens, so the fallback may fire on most
    inputs -- verify.
    NOTE(review): a later cell defines another function under this same name.
    """
    helsinki_translation = translate_to_english_helsinki(dialect_text)
    # calculate semantic similarity with the translated English text
    similarity_score = semantic_similarity(helsinki_translation, dialect_text)
    # fall back only when the similarity is below the threshold
    if similarity_score < semantic_similarity_threshold:
        print("arat5 used")
        return translate_dialect_to_english(dialect_text)
    return helsinki_translation

In [46]:
# Probability of routing an input through the AraT5 pipeline.
arat5_weight = 0.15
# Not read by the function below, which sends every non-AraT5 draw to Helsinki.
helsinki_weight = 0.85
def translate_dialect_to_english_unseen(dialect_text):
    """Translate dialect text with a randomly chosen translator.

    One uniform draw from [0, 1] is compared with ``arat5_weight``: a smaller
    draw selects ``translate_dialect_to_english``, anything else selects
    ``translate_to_english_helsinki``. The choice is printed before translating.

    Args:
        dialect_text: Dialect text to translate.

    Returns:
        The translation returned by whichever translator was selected.
    """
    use_arat5 = random.uniform(0, 1) < arat5_weight
    print("AraT5 used" if use_arat5 else "Helsinki used")
    # The conditional expression looks up only the selected translator's name.
    translator = translate_dialect_to_english if use_arat5 else translate_to_english_helsinki
    return translator(dialect_text)