In [1]:
!pip install tiktoken 
!pip install transformers
!pip install blobfile
!pip install sentencepiece
!pip install sacremoses



In [2]:
!pip install pandas
!pip install protobuf




In [1]:
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

allam_model = AutoModelForCausalLM.from_pretrained("ALLaM-AI/ALLaM-7B-Instruct-preview").to(device)
tokenizer_allam = AutoTokenizer.from_pretrained("ALLaM-AI/ALLaM-7B-Instruct-preview")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

'\ntotal_params = sum(p.numel() for p in hunayn_model.parameters())\ntrainable_params = sum(p.numel() for p in hunayn_model.parameters() if p.requires_grad)\n\nprint(f"Total parameters: {total_params:,}")\nprint(f"Trainable parameters: {trainable_params:,}")\n'

In [4]:
hunayn_model = MarianMTModel.from_pretrained("Hunayn/Big_Hunayn_at_different_epochs/model_at_epoch7").to(device)
hunayn_tokenizer = MarianTokenizer.from_pretrained("Hunayn/Big_Hunayn_at_different_epochs/model_at_epoch7")

In [6]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, MarianMTModel, MarianTokenizer
from tqdm import tqdm
import re

# reading first 100 from artelingo
file_path = "artelingo_release.csv"
df_arabic = pd.read_csv(
    file_path, 
    skiprows=range(1, 386414), 
    nrows=50, 
    usecols=["utterance_spelled", "image_file"],
    encoding="utf-8"
)

#extracting the artelingo Arabic captions and making three new columns
arabic_column = "utterance_spelled"
df_arabic = df_arabic.rename(columns={"utterance_spelled": "artelingo_arabic"})
df_arabic["allam_arabic"] = ""
df_arabic["translated_english"] = ""
df_arabic["hunayn_arabic"] = ""

def clean_allam_response(text, system_prompt, user_input):
    # sometimes special token are outputted so I'm removing them here
    text = re.sub(r"\[INST\]|\[/INST\]|<<SYS>>|<</SYS>>", "", text).strip()

    # someitmes the system prompt itself is being outputted so I'm deleting it
    if system_prompt in text:
        text = text.split(system_prompt, 1)[-1].strip()

    # someimes the user prompt is being outputted too so I'm taking deleting it
    if user_input in text:
        text = text.split(user_input, 1)[-1].strip()

    return text

def enhance_arabic_allam(text):
    system_msg = "خذ الجملة المعطاة وأعد صياغتها باللغة العربية الفصحى بأسلوبٍ أكثر فصاحةً ورقيًّا، مع الحفاظ على نفس المعنى"
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": text}
    ]
    input_text = tokenizer_allam.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer_allam(input_text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = allam_model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.6,
            pad_token_id=tokenizer_allam.eos_token_id
        )
    return clean_allam_response(tokenizer_allam.decode(output[0], skip_special_tokens=True), system_msg, text)

def translate_arabic_to_english_allam(text):
    system_msg = "Your job is to strictly only translate the following Arabic sentence to English."
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": text}
    ]
    input_text = tokenizer_allam.apply_chat_template(messages, tokenize=False)
    inputs = tokenizer_allam(input_text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = allam_model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            top_k=50,
            top_p=0.95,
            temperature=0.6,
            pad_token_id=tokenizer_allam.eos_token_id
        )
    return clean_allam_response(tokenizer_allam.decode(output[0], skip_special_tokens=True), system_msg, text)

def translate_english_to_arabic_hunayn(text):
    input_ids = hunayn_tokenizer([text], return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
    with torch.no_grad():
        output_ids = hunayn_model.generate(input_ids, max_length=len(text.split()) + 10, num_beams=5)
    return hunayn_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# going through each row in the dataframe
for idx, row in tqdm(df_arabic.iterrows(), total=len(df_arabic)):
    modern = row["artelingo_arabic"]

    # enhance Arabic via ALLaM
    enhanced = enhance_arabic_allam(modern)
    df_arabic.at[idx, "allam_arabic"] = enhanced

    # translate Arabic to English via ALLaM
    english = translate_arabic_to_english_allam(modern)
    df_arabic.at[idx, "translated_english"] = english

    # translate English back to Enhanced Arabic via Hunayn
    back_arabic = translate_english_to_arabic_hunayn(english)
    df_arabic.at[idx, "hunayn_arabic"] = back_arabic

output_file = "arabic_enhancement_comparison.csv"
df_arabic.to_csv(output_file, encoding="utf-8-sig", index=False)
print(f"Saved results to {output_file}")

100%|██████████| 50/50 [01:39<00:00,  1.99s/it]

Saved results to arabic_enhancement_comparison.csv





Since ALLAM is the preferred the model over HUNAYN, only ALLAM enhanced will be outputted now:

In [2]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import re

file_path = "artelingo_release.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer_allam.pad_token = tokenizer_allam.eos_token
tokenizer_allam.padding_side = "left"

df_arabic = pd.read_csv(
    file_path, 
    skiprows=range(1, 386414), 
    nrows=386410, 
    usecols=["utterance_spelled", "image_file"],
    encoding="utf-8"
)

df_arabic = df_arabic.rename(columns={"utterance_spelled": "artelingo_arabic"})
df_arabic["allam_arabic"] = ""

def clean_allam_response(text, system_prompt, user_input):
    text = re.sub(r"\[INST\]|\[/INST\]|<<SYS>>|<</SYS>>", "", text).strip()
    if system_prompt in text:
        text = text.split(system_prompt, 1)[-1].strip()
    if user_input in text:
        text = text.split(user_input, 1)[-1].strip()
    return text

def enhance_batch_allam(batch_texts):
    system_msg = "خذ الجملة المعطاة وأعد صياغتها باللغة العربية الفصحى بأسلوبٍ أكثر فصاحةً ورقيًّا، مع الحفاظ على نفس المعنى"
    prompts = [
        tokenizer_allam.apply_chat_template([
            {"role": "system", "content": system_msg},
            {"role": "user", "content": text}
        ], tokenize=False) for text in batch_texts
    ]
    inputs = tokenizer_allam(prompts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = allam_model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.6,
            pad_token_id=tokenizer_allam.eos_token_id
        )
    decoded = tokenizer_allam.batch_decode(outputs, skip_special_tokens=True)
    return [clean_allam_response(decoded_output, system_msg, original_input) for decoded_output, original_input in zip(decoded, batch_texts)]

batch_size = 50
for start_idx in tqdm(range(0, len(df_arabic), batch_size), desc="Enhancing Batches", leave=False):
    end_idx = min(start_idx + batch_size, len(df_arabic))
    batch = df_arabic.iloc[start_idx:end_idx]
    modern_batch = batch["artelingo_arabic"].tolist()
    enhanced_batch = enhance_batch_allam(modern_batch)
    df_arabic.loc[start_idx:end_idx - 1, "allam_arabic"] = enhanced_batch

output_file = "allam_enhancements_only.csv"
df_arabic = df_arabic[["image_file", "allam_arabic"]]
df_arabic.to_csv(output_file, encoding="utf-8-sig", index=False)
print(f"✅ Saved results to {output_file}")

Enhancing Batches:   0%|          | 0/7729 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
                                                                            

✅ Saved results to allam_enhancements_only.csv


This took 14 hours