In [1]:
!pip install transformers torch wandb evaluate huggingface_hub datasets  evaluate numpy peft accelerate sacrebleu

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-m

In [2]:
import numpy as np
import pandas as pd
import torch
import os
from torch.utils.data import DataLoader, random_split, Dataset
from datasets import load_dataset

# USE RAY TUNE. https://docs.ray.io/en/latest/train/examples/intel_gaudi/bert.html
# deepl, chatgpt translations

## Setup

In [3]:
import wandb
from huggingface_hub import HfApi, HfFolder
import transformers

try: # If it is on Kaggle
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()

    HF_TOKEN = user_secrets.get_secret("HF_TOKEN")
    WANDB_KEY = user_secrets.get_secret("WANDB_KEY")

except ModuleNotFoundError: # If it is local
    HF_TOKEN = os.environ["HF_TOKEN"]
    WANDB_KEY = os.environ["WANDB_KEY"]
    

HfFolder.save_token(HF_TOKEN)
wandb.login(key=WANDB_KEY)


# Reproducibility

seed = 1
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
transformers.set_seed(seed)
np.random.seed(seed)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabdulmohsena[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
2025-06-15 22:29:05.305652: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750026545.533256      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750026545.603966      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Modeling

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
from transformers import DataCollatorForSeq2Seq

In [5]:
# # # # Configure any model from HF HUB
# assert input("YOU WILL REMOVE THE HUB MODEL FOR THIS, TYPE 'OK' TO PROCEED: ").upper() == 'OK'
# model_name = "facebook/mbart-large-50-many-to-many-mmt"
# model_name = "facebook/m2m100_1.2B"
# #model_name= "Helsinki-NLP/opus-mt-en-ar"
# model_name= "facebook/nllb-200-distilled-600M"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# generation_config = GenerationConfig(
#     temperature=0.5,
#     do_sample=True,
#     max_length=256,
#     forced_bos_token_id = 256011, # Arabic

#     pad_token_id=tokenizer.pad_token_id,
#     bos_token_id= 256011,
#     decoder_start_token_id= 2,
#     eos_token_id= tokenizer.eos_token_id,
    
# #     num_beams = 4,
# #     early_stopping=True,
# #     top_k=50,
    
# #     renormalize_logits=True,
    
# #     # Testing Config
# #       repetition_penalty=0.5,
# #     num_return_sequences=4, # Number of sentences to generate
# #     return_dict_in_generate=True, # Returns the complete generation data from within the model.
# #     output_scores=True, # Score of each token.
# )

# tokenizer.src_lang="eng_Latn"
# tokenizer.tgt_lang="arb_Arab"

# model.push_to_hub("Abdulmohsena/Faseeh_LoRA")
# tokenizer.push_to_hub("Abdulmohsena/Faseeh_LoRA")
# generation_config.push_to_hub("Abdulmohsena/Faseeh_LoRA")

In [6]:
# Instantiating The Model
model_name = "Abdulmohsena/faseeh_alter"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="arb_Arab")
# generation_config = GenerationConfig.from_pretrained(model_name)

generation_config = GenerationConfig(
    temperature=0.5,
    do_sample=True,
    max_length=256,
    forced_bos_token_id = 256011, # Arabic

    pad_token_id=tokenizer.pad_token_id,
    bos_token_id= 256011,
    decoder_start_token_id= 2,
    eos_token_id= tokenizer.eos_token_id,
    
#     num_beams = 4,
#     early_stopping=True,
#     top_k=50,
    
#     renormalize_logits=True,
    
#     # Testing Config
#       repetition_penalty=0.5,
#     num_return_sequences=4, # Number of sentences to generate
#     return_dict_in_generate=True, # Returns the complete generation data from within the model.
#     output_scores=True, # Score of each token.
)

tokenizer.src_lang="eng_Latn"
tokenizer.tgt_lang="arb_Arab"

# https://huggingface.co/docs/transformers/en/main_classes/text_generation

config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.1k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/32.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

In [7]:
# # Compressing
# from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training, LoraRuntimeConfig
# from torch.profiler import profile, record_function, ProfilerActivity

# ## Quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=bnb_config)

# model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model) # prepares the whole model for kbit training

# for param in model.parameters():
#     param.requires_grad = False  # freeze the model - train adapters later
#     if param.ndim == 1:
#         # cast the small parameters (e.g. layernorm) to fp32 for stability
#         param.data = param.data.to(torch.float32)
    
# ## Low Rank Adaptation
# lora_config = LoraConfig(
# #     init_lora_weights="olora",
#     use_dora=True,
#     runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=True),
#     task_type=TaskType.SEQ_2_SEQ_LM,
#     inference_mode=False, 
#     r=16, 
#     lora_alpha=16, 
#     lora_dropout=0.05,
#     target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"],
# )


# # model.enable_input_require_grads()
# model = get_peft_model(model, lora_config)


# # # Only train decoder weights, not encoder
# for param in model.get_base_model().model.encoder.parameters():
#     param.requires_grad = False

# model.print_trainable_parameters()

# # Pruning, not valid because we need a sparse util
# # for name, module in model.named_modules():
# #     if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)):
# #         prune.l1_unstructured(module, name='weight', amount=0.4)
# #         prune.remove(module, 'weight')

# # # https://huggingface.co/docs/optimum/en/concept_guides/quantization
# # # https://huggingface.co/docs/peft/en/index
# # # https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py

In [8]:
# Sanity Check
dummy = "And the Egyptian Foreign Minister assured the visitors that security is always a top priority."

model = model.to('cuda')
encoded_ar = tokenizer(dummy, return_tensors="pt").to('cuda')
generated_tokens = model.generate(**encoded_ar, generation_config=generation_config)

tokenizer.decode(generated_tokens[0], skip_special_tokens=True)


'وأكد وزير الخارجية المصري للزوار أن الأمن أولى بالأمور دائما.'

In [9]:
from datasets import concatenate_datasets

dataset = load_dataset("Abdulmohsena/Classic-Arabic-English-Language-Pairs")

dataset = concatenate_datasets([
    dataset['quran'],
    dataset['hadith'],
    dataset['books']
])

dataset = dataset.shuffle(seed=42)

README.md:   0%|          | 0.00/952 [00:00<?, ?B/s]

quran-00000-of-00001.parquet:   0%|          | 0.00/818k [00:00<?, ?B/s]

hadith-00000-of-00001.parquet:   0%|          | 0.00/644k [00:00<?, ?B/s]

books-00000-of-00001.parquet:   0%|          | 0.00/3.00M [00:00<?, ?B/s]

Generating quran split:   0%|          | 0/9474 [00:00<?, ? examples/s]

Generating hadith split:   0%|          | 0/4107 [00:00<?, ? examples/s]

Generating books split:   0%|          | 0/13331 [00:00<?, ? examples/s]

In [10]:
dataset = dataset.filter(lambda x: (len(x['en']) < 256) & (len(x['ar']) < 256))

Filter:   0%|          | 0/26912 [00:00<?, ? examples/s]

In [11]:
preprocess_function = lambda examples: tokenizer(
        examples['en'], text_target=examples['ar'], max_length=256, truncation=True, padding=True, return_tensors='pt')

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.20)

Map:   0%|          | 0/23127 [00:00<?, ? examples/s]

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True, return_tensors='pt')

## Training

### Reward model

In [13]:
from evaluate import load
sacrebleu = load("sacrebleu")

def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]

        return preds, labels

def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

        bleu_score = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)['score']
        #fluency_score = fluency.compute(texts=decoded_preds)['classical_score']
        
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

        result = {
                    "bleu": bleu_score,
                    #"fluency": fluency_score,
                    "gen_len": np.mean(prediction_lens)
                }
        
        result = {k: round(v, 4) for k, v in result.items()}
        return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## TEST

In [14]:
for param in model.get_encoder().parameters():
    param.requires_grad = False

In [15]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datetime import datetime

training_args = Seq2SeqTrainingArguments(
        output_dir="faseeh_alter",
        eval_strategy="epoch",
        weight_decay=0.01,
        warmup_steps=1_000,
        learning_rate=5e-5,
        lr_scheduler_type="cosine",
        per_device_train_batch_size=6,
        per_device_eval_batch_size=6,
        save_total_limit=2,
        num_train_epochs=8,
        predict_with_generate=True,
        seed = 42,
        report_to="wandb",
        run_name=f'{datetime.now()}',
        logging_strategy = 'steps',
        logging_steps = 300,
        # load_best_model_at_end = True,



        # fp16=True,
        push_to_hub=True,
    )

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'] ,
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [16]:
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250615_223029-jzlotuy9[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m2025-06-15 22:30:29.233821[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/abdulmohsena/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/abdulmohsena/huggingface/runs/jzlotuy9[0m


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.1335,2.389623,18.408,17.385
2,1.891,2.175707,18.8365,17.3986
3,1.7138,1.984601,19.2468,17.3984
4,1.5502,1.864434,19.7279,17.3794
5,1.4501,1.780723,20.092,17.3941
6,1.3854,1.736311,20.2568,17.3632
7,1.3586,1.720416,20.3118,17.3684
8,1.3365,1.717748,20.3282,17.3794


[34m[1mwandb[0m: uploading wandb-summary.json; uploading config.yaml
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:               eval/bleu ▁▃▄▆▇███
[34m[1mwandb[0m:            eval/gen_len ▅██▄▇▁▂▄
[34m[1mwandb[0m:               eval/loss █▆▄▃▂▁▁▁
[34m[1mwandb[0m:            eval/runtime ▆▃▂▂▁██▇
[34m[1mwandb[0m: eval/samples_per_second ▃▆▇▇█▁▁▂
[34m[1mwandb[0m:   eval/steps_per_second ▃▆▇▇█▁▁▂
[34m[1mwandb[0m:             train/epoch ▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
[34m[1mwandb[0m:       train/global_step ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█████
[34m[1mwandb[0m:         train/grad_norm ▄▃▃▃▃▃▃▂▃▂▂▄▂█▁▃▅▄▄▄▇▅▇▆▆▃▄▅▆▃▃▅▂▃▆▂▃▅▆▇
[34m[1mwandb[0m:     train/learning_rate ▃▅███████▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[34m[1mwandb[0m:              train/loss ██▇▇▇▆▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mw

In [17]:
# import torch.profiler as profiler
# with profiler.profile(activities=[profiler.ProfilerActivity.CPU, profiler.ProfilerActivity.CUDA]) as prof:
    

# print(prof.key_averages().table(sort_by="cuda_time_total"))