In [1]:
import os
os.environ["TORCH_USE_CUDA_DSA"] = "1" # Enable CUDA DSA
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForCausalLM
import torch
from utils import get_save_dir_path
from huggingface_hub import HfApi

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_NAME = "Gemma-2-2B"
# Set data parameters
TRAIN_ON_ALL_DATA = True
TRAIN_ON_ARABIZI_ONLY = False
TRAIN_ON_ARABIC_LETTERS_ONLY = True


In [3]:
# GPU device map
device_map={'':torch.cuda.current_device()}
print('Current device:', torch.cuda.current_device())

Current device: 0


In [4]:
# Model paths
MODEL_PATHS = {
                "Gemma-2-2B":           "google/gemma-2-2b",
                "Llama-7B" :            "sambanovasystems/SambaLingo-Arabic-Base",
                "Noon-7B":              "Naseej/noon-7b",
                "aragpt2-large" :       "aubmindlab/aragpt2-large",
                "gpt2-small-arabic" :   "akhooli/gpt2-small-arabic",
            }

# Finetuned model name
HUB = "BounharAbdelaziz/Al-Atlas-LLM"

In [5]:
# Set path of base model, where to save the fine-tuned model, and dataset to use for fine-tuning
MODEL_PATH = MODEL_PATHS[MODEL_NAME]
# Where to save the weights
SAVE_MODEL_PATH = './BounharAbdelaziz/Al-Atlas-LLM-r-256-alpha-16/checkpoint-17375'

# Load the model

In [6]:
from peft import LoraConfig, PeftModel

In [7]:
# Reload model and merge LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.10it/s]


In [8]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [9]:
model = PeftModel.from_pretrained(base_model, SAVE_MODEL_PATH)
model = model.merge_and_unload()

In [10]:
# Save model and tokenizer
tokenizer.save_pretrained(SAVE_MODEL_PATH)
model.save_pretrained(SAVE_MODEL_PATH)

# Push the model to the Hugging Face Hub

In [17]:
# The Hugging Face API to push model to the hub
HF_API = HfApi()

In [18]:
# Push model to Hugging Face Hub
HF_API.upload_folder(
    repo_id=HUB,
    folder_path=SAVE_MODEL_PATH,
    repo_type="model",
    create_pr=False
)

adapter_model.safetensors:   0%|          | 0.00/204M [00:00<?, ?B/s]

[A[A



[A[A[A[A


[A[A[A
adapter_model.safetensors:   1%|          | 1.41M/204M [00:00<00:14, 13.9MB/s]


rng_state.pth: 100%|██████████| 14.2k/14.2k [00:00<00:00, 98.8kB/s]


adapter_model.safetensors:   1%|▏         | 2.80M/204M [00:00<00:29, 6.81MB/s]

[A[A


[A[A[A



[A[A[A[A

scheduler.pt: 100%|██████████| 1.06k/1.06k [00:00<00:00, 8.89kB/s]4, 13.2MB/s]



[A[A[A

adapter_model.safetensors:   7%|▋         | 13.7M/204M [00:00<00:06, 29.3MB/s]


[A[A[A



training_args.bin: 100%|██████████| 5.56k/5.56k [00:00<00:00, 50.0kB/s]
adapter_model.safetensors:   8%|▊         | 17.3M/204M [00:00<00:09, 19.7MB/s]


[A[A[A

adapter_model.safetensors:  11%|█         | 22.6M/204M [00:01<00:07, 23.6MB/s]

[A[A

adapter_model.safetensors:  14%|█▍        | 29.1M/204M [00:01<00:07, 23.0MB/s]

[A[A


[A[A[A

adapter_model.safetensors:  16%|█▌        | 32.0M/204M [00:01<00:13, 13.1MB/s]


[A[A

CommitInfo(commit_url='https://huggingface.co/BounharAbdelaziz/Al-Atlas-LLM/commit/f7f71db64f3de7397fe8f5b5e8e9cd92d43454e7', commit_message='Upload folder using huggingface_hub', commit_description='', oid='f7f71db64f3de7397fe8f5b5e8e9cd92d43454e7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/BounharAbdelaziz/Al-Atlas-LLM', endpoint='https://huggingface.co', repo_type='model', repo_id='BounharAbdelaziz/Al-Atlas-LLM'), pr_revision=None, pr_num=None)

# Load from Hub and test

In [9]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("BounharAbdelaziz/Al-Atlas-LLM-Small-v2")
model = AutoModelForCausalLM.from_pretrained("BounharAbdelaziz/Al-Atlas-LLM-Small-v2")

In [6]:
# Run text generation pipeline with our next model
prompt = "الدولة المغربية"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=256, device='cuda:0')
result = pipe(f"{prompt}")
print(result[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


الدولة المغربية دارت بيان على هاد البيان. البيان قال باللي هاد البيان كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي كان كيقول باللي


In [12]:
# Run text generation pipeline with our next model
prompt = "محمد صلاح"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, device='cuda:0')
result = pipe(f"{prompt}")
print(result[0]['generated_text'])

محمد صلاح محمد صلاح محمد صلاح محمد صلاح محمد صلاح محمد صلاح محمد


In [8]:
# Run text generation pipeline with our next model
prompt = "برمجوها بزاف وخاص تبقى"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=128, device='cuda:0')
result = pipe(f"{prompt}")
print(result[0]['generated_text'])

برمجوها بزاف وخاص تبقى...................................................................................................


In [6]:
# Run text generation pipeline with our next model
prompt = "ﺃﻧﺎ كنستغرب ﺻﺮﺍﺣﺔ علاش"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=256)#, device='cuda:0')
result = pipe(f"{prompt}")
print(result[0]['generated_text'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


ﺃﻧﺎ كنستغرب ﺻﺮﺍﺣﺔ علاش ﻓﻲ ﺍﻟﻤﺤﻠﻴﻦ ﻃﻔﺎﺩﻱ ﺑﻠﻴﻦ ﻗﺎﻟﻤﻠﻮﻝ ﺍﻟﻤﻘﻮﺍﻵﻮ ﻣﺎﻣﺎ ﻗﺎﻟﻌﻮﺩﻱ ﺗﻠﻒﻚ ﻣﻦ ﺍﻟﻤﻠﻮﺍﺍﺕ تْ�ْﻮﺍﺣﺘﻲ ﻗﺎﻟ�


In [7]:
# Run text generation pipeline with our next model
prompt = "ﺃﻧﺎ ﺻﺮﺍﺣﺔ "
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=256, device='cuda:0')
result = pipe(f"{prompt}")
print(result[0]['generated_text'])

ﺃﻧﺎ ﺻﺮﺍﺣﺔ ﻛﻴﻒ ﺑﻠﺔ ﻋﻠﻴﻚ ﺑﻼﺃﺓ ﺩﺭﺍﻧﻲ ﻭﺍﻧﺎ ﻣﺴﺎﻓﺮ ﺩﻳﺎﻟﻮ ﻓﺎﻟﺒﻨﺖ ﻋﻠﻴﻚ ﺗﻔﺮﺕ ﻭﻫﻲ ﺍﻟﻨﺘﺎﺋﺞ ﻭﺍﻧﺎ ﻣﺴﺎﻓﺮ ﺃﻧﺎ ﻣﺴ�


In [24]:
prompt = "ﺻﺮﺍﺣﺔ"
print(f"[INFO] An example of output, the prompt is {prompt}...")
print("=" * 80)
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)#, device='cuda:0')
result = pipe(f"{prompt}")
print(result[0]['generated_text'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[INFO] An example of output, the prompt is ﺻﺮﺍﺣﺔ...
ﺻﺮﺍﺣﺔ,,,
-
