In [1]:
!pip install wandb evaluate huggingface_hub datasets bert_score evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate, bert_score
Successfully installed bert_score-0.3.13 evaluate-0.4.2


In [4]:
import numpy as np
import pandas as pd
import re
import torch
import os
from torch.utils.data import DataLoader, random_split, Dataset
import requests
import ray
from datasets import load_dataset

# USE RAY TUNE. https://docs.ray.io/en/latest/train/examples/intel_gaudi/bert.html
# USE E5SCORE AS A LOSS
# deepl translations

In [5]:
import wandb
from huggingface_hub import HfApi, HfFolder
from transformers import set_seed
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
WANDB_KEY = user_secrets.get_secret("WANDB_KEY")

HfFolder.save_token(HF_TOKEN)
wandb.login(key=WANDB_KEY)

seed = 1
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True # Should be True, but False to save memory
torch.backends.cudnn.benchmark = False # Should be False, but True to save memory
set_seed(seed)
np.random.seed(seed)

[34m[1mwandb[0m: Currently logged in as: [33mabdulmohsena[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Quran Dataset

In [6]:
df = pd.read_csv("hf://datasets/ImruQays/Quran-Classical-Arabic-English-Parallel-texts/Quran-translations.csv")

In [7]:
data = df.copy()

data = data.drop(columns=[data.columns[0]])

In [8]:
# Connect Contextual verses
connection_signs = ['-', '—', ':', ';', ',']

for index, row in data.iterrows():
    
    total_connected_sentences = sum(row[column].endswith(connection_sign) 
                                    for column in data.columns for connection_sign in connection_signs)

    # If there is at least 3 translations that say the Ayah is connected with the next, then connect them.
    if total_connected_sentences >= 3:
        for column in data.columns:
            data.at[index + 1, column] = f"{data.at[index, column]} {data.at[index + 1, column]}"
            data.at[index, column] = np.nan

data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

display(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4598 entries, 0 to 4597
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   arabic-imlaei   4598 non-null   object
 1   en-ahmedali     4598 non-null   object
 2   en-ahmedraza    4598 non-null   object
 3   en-arberry      4598 non-null   object
 4   en-asad         4598 non-null   object
 5   en-daryabadi    4598 non-null   object
 6   en-hilali       4598 non-null   object
 7   en-itani        4598 non-null   object
 8   en-maududi      4598 non-null   object
 9   en-mubarakpuri  4598 non-null   object
 10  en-pickthall    4598 non-null   object
 11  en-qarai        4598 non-null   object
 12  en-qaribullah   4598 non-null   object
 13  en-sahih        4598 non-null   object
 14  en-sarwar       4598 non-null   object
 15  en-shakir       4598 non-null   object
 16  en-wahiduddi    4598 non-null   object
 17  en-yusufali     4598 non-null   object
dtypes: objec

None

In [None]:
def prepare_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove text between parentheses/brakets (Explanatory text which is not originally written in arabic)
    text = re.sub(r'\([^)]*\)|\[[^]]*\]', '', text)
    
    # Remove dashes and commas
    text = re.sub(r'[-,:;’‘\"\']+', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    arabic_diacritics = re.compile("""
        ّ    | # Shadda
        َ    | # Fatha
        ً    | # Tanwin Fath
        ُ    | # Damma
        ٌ    | # Tanwin Damm
        ِ    | # Kasra
        ٍ    | # Tanwin Kasr
        ْ    | # Sukun
        ـ    # Tatweel (kashida)
    """, re.VERBOSE)
    
    text = re.sub(arabic_diacritics, '', text)
    
    return text

data = data.map(prepare_text)

In [11]:
# Remove first Ayat (Usually "Bismillah" is not translated)
data = data[~data.iloc[:, 0].str.contains('بسم الله', na=False)]
data.reset_index(inplace=True, drop=True)

In [13]:
# Inspecting the translation quality
n = 123

print(data.iloc[n, 0], "\n")
print("\n".join(row for row in data.iloc[n, 1:].tolist()))

قولوا آمنا بالله وما أنزل إلينا وما أنزل إلىٰ إبراهيم وإسماعيل وإسحاق ويعقوب والأسباط وما أوتي موسىٰ وعيسىٰ وما أوتي النبيون من ربهم لا نفرق بين أحد منهم ونحن له مسلمون 

say we believe in god and what has been sent down to us and what had been revealed to abraham and ishmael and isaac and jacob and their progeny and that which was given to moses and christ and to all other prophets by the lord. we make no distinction among them and we submit to him.
say “we believe in allah and what is sent down to us and what was sent down to ibrahim and ismael and ishaq and yaqub and to their offspring and what was bestowed upon moosa and eisa and what was bestowed upon other prophets – from their lord we do not make any distinction in belief between any of them and to allah we have submitted ourselves.”
say you we believe in god and in that which has been sent down on us and sent down on abraham ishmael isaac and jacob and the tribes and that which was given to moses and jesus and the prophets of t

## Preparation

In [14]:
data = data.copy()

y = data.iloc[:, 0]
X = data.iloc[:, 1:]

In [15]:
pairs_list = []

for n_row in range(X.shape[0]):
    for n_col in range(X.shape[1]):
        
        pairs_list.append([X.iloc[n_row, n_col], y[n_row]])
        
pairs_list = np.array(pairs_list)

In [19]:
pairs_dict = {
    "source" : pairs_list[:, 0],
    "target" : pairs_list[:, 1]
} 

In [41]:
from datasets import Dataset

quran_ds = Dataset.from_dict(pairs_dict)
quran_ds = quran_ds.shuffle(seed=seed)

In [42]:
quran_ds[1]

{'source': 'say obey allah and obey the messenger but if you turn away he is only responsible for the duty placed on him and you for that placed on you. if you obey him you shall be on the right guidance. the messengers duty is only to convey in a clear way .',
 'target': 'قل أطيعوا الله وأطيعوا الرسول فإن تولوا فإنما عليه ما حمل وعليكم ما حملتم وإن تطيعوه تهتدوا وما على الرسول إلا البلاغ المبين'}

## Shamela Books

In [37]:
# Shamela URL is a drive csv file. Arabic Text is scraped first from https://shamela.ws/ and then batch translated \
# in google translate (Files). Then prepared as a dataset of sources & targets.

shamela_url = user_secrets.get_secret("SHAMELA_DS")

shamela_df = pd.read_csv(shamela_url)

In [38]:
# Remove index col
shamela_df = shamela_df.iloc[:, 1:]

# Prepare text
shamela_df = shamela_df.map(prepare_text)

In [39]:
shamela_ds = Dataset.from_dict({
    "source" : shamela_df.iloc[:, 1],
    "target" : shamela_df.iloc[:, 0]
})

shamela_ds = shamela_ds.shuffle(seed=seed)

In [40]:
shamela_ds[1]

{'source': ' he was not satisfied with me bringing this to talk until he had me as a martyr and he was not satisfied with my martyrdom until he made me swear an oath.',
 'target': 'فلم يرض بإحضاري هذا لكلام حتى استشهدني ولم يرض باستشهادي حتى استحلفني.'}

## Combining Datasets

In [67]:
used_datasets = [quran_ds.to_pandas(), shamela_ds.to_pandas()]

dataset = Dataset.from_pandas(pd.concat(used_datasets))
dataset = dataset.shuffle(seed=seed)

# Remove very short sentences
dataset = dataset.filter(lambda x: len(x['source']) > 10)

Filter:   0%|          | 0/83369 [00:00<?, ? examples/s]

In [68]:
dataset[0]

{'source': 'as for those who disbelieved and gave the lie to our signs they shall be the inmates of the fire and will abide in it. that is a woeful resort!',
 'target': 'والذين كفروا وكذبوا بآياتنا أولٰئك أصحاب النار خالدين فيها وبئس المصير',
 '__index_level_0__': 15436}

In [74]:
print("Percentage of each sub_dataset to the whole dataset:")

# TODO: Increase the amount of shamela books.
for i, ds in enumerate(used_datasets):
    print(f"{i}: {len(ds) / sum([len(d) for d in used_datasets]):.2%}")

Percentage of each sub_dataset to the whole dataset:
0: 91.43%
1: 8.57%


## Modeling

In [75]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
from transformers import DataCollatorForSeq2Seq

In [23]:
# # Configure any model from HF HUB
# assert input("YOU WILL REMOVE THE HUB MODEL FOR THIS, TYPE 'OK' TO PROCEED: ").upper() == 'OK'
# model_name = "facebook/mbart-large-50-many-to-many-mmt"
# model_name = "facebook/m2m100_1.2B"
# #model_name= "Helsinki-NLP/opus-mt-en-ar"
# model_name= "facebook/nllb-200-distilled-600M"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# generation_config = GenerationConfig(
#     max_length=200,
#     forced_bos_token_id=256011, # Arabic
#     num_beams = 4,
#     early_stopping=True,
#     do_sample=True,
#     top_k=50,
    
#     # Testing Config
# #     num_return_sequences=4, # Number of sentences to generate
# #     return_dict_in_generate=True, # Returns the complete generation data from within the model.
# #     output_scores=True, # Score of each token.
# )

# tokenizer.src_lang="eng_Latn"
# tokenizer.tgt_lang="arb_Arab"

# model.push_to_hub("Abdulmohsena/Faseeh")
# tokenizer.push_to_hub("Abdulmohsena/Faseeh")

In [76]:
# Instantiating The Model
model_name = "Abdulmohsena/Faseeh"

tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="arb_Arab")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
generation_config = GenerationConfig.from_pretrained(model_name)

# https://huggingface.co/docs/transformers/en/main_classes/text_generation

tokenizer_config.json:   0%|          | 0.00/40.1k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

In [79]:
# Sanity Check
dummy = "And the Saudi Arabian Foreign Minister assured the visitors of the importance to seek the security."

encoded_ar = tokenizer(dummy, return_tensors="pt")
generated_tokens = model.generate(**encoded_ar, generation_config=generation_config)

tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

'وآمن وزير خارجية السعودية الزوار بأهمية طلب الأمن.'

In [80]:
def preprocess_function(examples):
    model_inputs = tokenizer(examples['source'], text_target=examples['target'], max_length=128, truncation=True, padding=True)
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.25)

Map:   0%|          | 0/82903 [00:00<?, ? examples/s]

In [82]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

# from transformers import DataCollatorForLanguageModeling

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=True,             # Whether to use Masked Language Modeling (MLM)
#     mlm_probability=0.15  # Probability of masking tokens for MLM
# )

In [83]:
import numpy as np
import evaluate
import transformers

metric = evaluate.load("bertscore")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def compute_metrics(eval_preds): 
    
    preds, labels = eval_preds
    
    # Replace unknown labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode tokens into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess text for cleaniness
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Get Average bertscore F-1
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, lang="ar")
    result = {"bertscore-f1": np.mean(result['f1'])}

    # Get avg gen length
    prediction_lengths = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lengths)

    result = {k: round(v, 4) for k, v in result.items()} # Round to 4 figures

    return result

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [85]:
from datetime import datetime
wandb.init(project="Faseeh", name=f"Run @ {datetime.now()}")

In [89]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

torch.cuda.empty_cache()

# https://huggingface.co/docs/transformers/v4.42.0/performance
training_args = Seq2SeqTrainingArguments(
    output_dir="Faseeh",
    save_total_limit=1,
    
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # torch_compile=True,
    
    logging_strategy="steps",
    logging_steps=500, 
    
    eval_strategy='epoch',
    
    weight_decay=0.01,
    warmup_steps=1_000,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    
    
    num_train_epochs=4,
    
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    report_to='wandb'
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [90]:
trainer.train()
wandb.finish()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Bertscore-f1,Gen Len
0,0.0152,0.058165,0.9845,37.3201


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/bertscore-f1,▁
eval/gen_len,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▃▄▅▆▇██
train/global_step,▁▂▃▄▅▆▇██
train/grad_norm,█▁▃▅▃▃▃
train/learning_rate,▄█▇▆▄▂▁

0,1
eval/bertscore-f1,0.9845
eval/gen_len,37.3201
eval/loss,0.05817
eval/runtime,9230.0527
eval/samples_per_second,2.245
eval/steps_per_second,0.561
total_flos,1.6842736967614464e+16
train/epoch,0.99994
train/global_step,3886.0
train/grad_norm,0.10105
