In [1]:
!pip install -U wandb evaluate huggingface_hub datasets bert_score evaluate transformers numpy==1.26

Collecting wandb
  Downloading wandb-0.17.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.26
  Downloading numpy-1.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading numpy-1.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading w

In [2]:
import numpy as np
import pandas as pd
import torch
import os
from torch.utils.data import DataLoader, random_split, Dataset
from datasets import load_dataset

# USE RAY TUNE. https://docs.ray.io/en/latest/train/examples/intel_gaudi/bert.html
# deepl translations

In [3]:
import wandb
from huggingface_hub import HfApi, HfFolder
from transformers import set_seed

try: # If it is on Kaggle
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()

    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    WANDB_KEY = user_secrets.get_secret("WANDB_KEY")

except ModuleNotFoundError: # If it is local
    HF_TOKEN = os.environ["HF_TOKEN"]
    WANDB_KEY = os.environ["WANDB_KEY"]
    

HfFolder.save_token(HF_TOKEN)
wandb.login(key=WANDB_KEY)

# Reproducibility
seed = 1
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_seed(seed)
np.random.seed(seed)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Modeling

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
from transformers import DataCollatorForSeq2Seq

In [5]:
# # Configure any model from HF HUB
# assert input("YOU WILL REMOVE THE HUB MODEL FOR THIS, TYPE 'OK' TO PROCEED: ").upper() == 'OK'
# model_name = "facebook/mbart-large-50-many-to-many-mmt"
# model_name = "facebook/m2m100_1.2B"
# #model_name= "Helsinki-NLP/opus-mt-en-ar"
# model_name= "facebook/nllb-200-distilled-600M"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# generation_config = GenerationConfig(
#     temperature=0.5,
#     max_length=200,
#     forced_bos_token_id=256011, # Arabic
#     num_beams = 4,
#     early_stopping=True,
#     do_sample=True,
#     top_k=50,
    
#     renormalize_logits=True,
    
#     # Testing Config
#       repetition_penalty=0.5,
# #     num_return_sequences=4, # Number of sentences to generate
# #     return_dict_in_generate=True, # Returns the complete generation data from within the model.
# #     output_scores=True, # Score of each token.
# )

# tokenizer.src_lang="eng_Latn"
# tokenizer.tgt_lang="arb_Arab"

# model.push_to_hub("Abdulmohsena/Faseeh")
# tokenizer.push_to_hub("Abdulmohsena/Faseeh")
# generation_config.push_to_hub("Abdulmohsena/Faseeh")

In [6]:
# Instantiating The Model
model_name = "Abdulmohsena/Faseeh"

tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="arb_Arab")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
generation_config = GenerationConfig.from_pretrained(model_name)

# https://huggingface.co/docs/transformers/en/main_classes/text_generation

tokenizer_config.json:   0%|          | 0.00/40.1k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

In [7]:
# Sanity Check
dummy = "And the Saudi Arabian Foreign Minister assured the visitors that security is always a top priority."

encoded_ar = tokenizer(dummy, return_tensors="pt")
generated_tokens = model.generate(**encoded_ar, generation_config=generation_config)

tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

'وآمن وزير خارجية السعودية الزوار أن الأمن دائما أولى.'

In [8]:
dataset = load_dataset("Abdulmohsena/Classic-Arabic-English-Language-Pairs")

dataset = dataset['train']

Downloading readme:   0%|          | 0.00/774 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/91084 [00:00<?, ? examples/s]

In [9]:
preprocess_function = lambda examples: tokenizer(
        examples['source'], text_target=examples['target'], max_length=128, truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.25)

Map:   0%|          | 0/91084 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

# from transformers import DataCollatorForLanguageModeling # NOT GOOD FOR SEMANTIC TRANSLATION

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=True,             # Whether to use Masked Language Modeling (MLM)
#     mlm_probability=0.15  # Probability of masking tokens for MLM
# )

In [11]:
import numpy as np
from evaluate import load
import transformers
from functools import partial

# All metrics to be tested
def bertscore(predictions, references):
    metric = load("bertscore")
    result = metric.compute(predictions=predictions, references=references, lang="ar")
    result = {"bertscore-f1": np.mean(result['f1'])}
    
    return result

def comet(predictions, references):
    metric = load("comet")
    result = metric.compute(predictions=predictions, references=references)
    print(result['scores'])
    result = {"comet-score": np.mean(result['scores'])}
    
    return result


metric = load("bertscore")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels

def compute_metrics(eval_preds): 
    
    preds, labels = eval_preds
    
    # Replace unknown labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    # Decode tokens into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess text for cleaniness
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
    # Get metric scores
    result = bertscore(predictions=decoded_preds, references=decoded_labels)

    # Get avg gen length
    prediction_lengths = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lengths)

    result = {k: round(v, 4) for k, v in result.items()} # Round to 4 figures

    return result

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [12]:
tokenizer(["test"])

{'input_ids': [[256047, 7356, 2]], 'attention_mask': [[1, 1, 1]]}

In [13]:
compute_metrics((tokenizer(["test"])['input_ids'], tokenizer(["تجربة"])['input_ids']))

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

{'bertscore-f1': 0.7917, 'gen_len': 1.0}

## Training

In [14]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
train_batch_size = 4
torch.cuda.empty_cache()

# https://huggingface.co/docs/transformers/v4.42.0/performance
training_args = Seq2SeqTrainingArguments(
    metric_for_best_model="bertscore-f1",
    output_dir=f"Faseeh",
    save_total_limit=1,
    
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=train_batch_size,
    gradient_accumulation_steps=16 // train_batch_size,
    gradient_checkpointing=True,
    # torch_compile=True,
    
    logging_strategy="steps",
    logging_steps=500, 
    
    eval_strategy='epoch',
    
    weight_decay=0.01,
    warmup_steps=1_000,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    
    
    num_train_epochs=2,
    
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    report_to='wandb'
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [15]:
from datetime import datetime

wandb.init(project="Faseeh", name=f"Run @ {datetime.now()}")
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mabdulmohsena[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.17.8
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240904_200533-bi47pzt3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mRun @ 2024-09-04 20:05:33.086993[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/abdulmohsena/Faseeh[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/abdulmohsena/Faseeh/runs/bi47pzt3[0m
  return fn(*args, **kwargs)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Bertscore-f1,Gen Len
0,0.0982,0.111674,0.9746,37.942
1,0.1063,0.109446,0.9749,37.8993


Non-default generation parameters: {'max_length': 200}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.a