In [None]:
!pip install wandb evaluate huggingface_hub datasets bert_score evaluate

In [2]:
import numpy as np
import pandas as pd
import torch
import os
from torch.utils.data import DataLoader, random_split, Dataset
from datasets import load_dataset

# USE RAY TUNE. https://docs.ray.io/en/latest/train/examples/intel_gaudi/bert.html
# USE E5SCORE AS A LOSS
# deepl translations


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\anaconda\envs\Faseeh\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\anaconda\envs\Faseeh\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "d:\anaconda\envs\Faseeh\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "d:\anaconda\envs\Faseeh\Lib\site-packages

In [8]:
import wandb
from huggingface_hub import HfApi, HfFolder
from transformers import set_seed

try: # If it is on Kaggle
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()

    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    WANDB_KEY = user_secrets.get_secret("WANDB_KEY")

except ModuleNotFoundError: # If it is local
    HF_TOKEN = os.environ["HF_TOKEN"]
    WANDB_KEY = os.environ["WANDB_KEY"]
    

HfFolder.save_token(HF_TOKEN)
wandb.login(key=WANDB_KEY)

# Reproducibility
seed = 1
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_seed(seed)
np.random.seed(seed)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mabdulmohsena[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user\.netrc


## Modeling

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
from transformers import DataCollatorForSeq2Seq

In [4]:
# # Configure any model from HF HUB
# assert input("YOU WILL REMOVE THE HUB MODEL FOR THIS, TYPE 'OK' TO PROCEED: ").upper() == 'OK'
# model_name = "facebook/mbart-large-50-many-to-many-mmt"
# model_name = "facebook/m2m100_1.2B"
# #model_name= "Helsinki-NLP/opus-mt-en-ar"
# model_name= "facebook/nllb-200-distilled-600M"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# generation_config = GenerationConfig(
#     max_length=200,
#     forced_bos_token_id=256011, # Arabic
#     num_beams = 4,
#     early_stopping=True,
#     do_sample=True,
#     top_k=50,
    
#     # Testing Config
# #     num_return_sequences=4, # Number of sentences to generate
# #     return_dict_in_generate=True, # Returns the complete generation data from within the model.
# #     output_scores=True, # Score of each token.
# )

# tokenizer.src_lang="eng_Latn"
# tokenizer.tgt_lang="arb_Arab"

# model.push_to_hub("Abdulmohsena/Faseeh")
# tokenizer.push_to_hub("Abdulmohsena/Faseeh")

In [5]:
# Instantiating The Model
model_name = "Abdulmohsena/Faseeh"

tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="arb_Arab")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
generation_config = GenerationConfig.from_pretrained(model_name)

# https://huggingface.co/docs/transformers/en/main_classes/text_generation

In [12]:
# Sanity Check
dummy = "And the Saudi Arabian Foreign Minister assured the visitors that security is always a top priority."

encoded_ar = tokenizer(dummy, return_tensors="pt")
generated_tokens = model.generate(**encoded_ar, generation_config=generation_config)

tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

'وآمن وزير خارجية السعودية الزوار أن الأمن دائم الأولوية القصوى.'

In [12]:
dataset = load_dataset("Abdulmohsena/Classic-Arabic-English-Language-Pairs")

dataset = dataset['train']

In [14]:
preprocess_function = lambda examples: tokenizer(
        examples['source'], text_target=examples['target'], max_length=128, truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.25)

In [15]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

# from transformers import DataCollatorForLanguageModeling # NOT GOOD FOR SEMANTIC TRANSLATION

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=True,             # Whether to use Masked Language Modeling (MLM)
#     mlm_probability=0.15  # Probability of masking tokens for MLM
# )

In [16]:
import numpy as np
import evaluate
import transformers

metric = evaluate.load("bertscore")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def compute_metrics(eval_preds): 
    
    preds, labels = eval_preds
    
    # Replace unknown labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode tokens into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess text for cleaniness
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Get Average bertscore F-1
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, lang="ar")
    result = {"bertscore-f1": np.mean(result['f1'])}

    # Get avg gen length
    prediction_lengths = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lengths)

    result = {k: round(v, 4) for k, v in result.items()} # Round to 4 figures

    return result

Downloading builder script: 100%|██████████| 7.95k/7.95k [00:00<?, ?B/s]


## Training

In [85]:
from datetime import datetime
wandb.init(project="Faseeh", name=f"Run @ {datetime.now()}")

In [24]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

torch.cuda.empty_cache()

# https://huggingface.co/docs/transformers/v4.42.0/performance
training_args = Seq2SeqTrainingArguments(
    output_dir=f"Faseeh_{metric.name}",
    save_total_limit=1,
    
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # torch_compile=True,
    
    logging_strategy="steps",
    logging_steps=500, 
    
    eval_strategy='epoch',
    
    weight_decay=0.01,
    warmup_steps=1_000,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    
    
    num_train_epochs=4,
    
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    report_to='wandb'
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [90]:
trainer.train()
wandb.finish()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Bertscore-f1,Gen Len
0,0.0152,0.058165,0.9845,37.3201


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}


VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/bertscore-f1,▁
eval/gen_len,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▂▃▄▅▆▇██
train/global_step,▁▂▃▄▅▆▇██
train/grad_norm,█▁▃▅▃▃▃
train/learning_rate,▄█▇▆▄▂▁

0,1
eval/bertscore-f1,0.9845
eval/gen_len,37.3201
eval/loss,0.05817
eval/runtime,9230.0527
eval/samples_per_second,2.245
eval/steps_per_second,0.561
total_flos,1.6842736967614464e+16
train/epoch,0.99994
train/global_step,3886.0
train/grad_norm,0.10105
