In [8]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
import wandb

def translate(model, tokenizer, generation_config, texts, device=None):
    """Translate ``texts`` with ``model`` and return the decoded strings.

    Args:
        model: Seq2seq model exposing ``eval()``, ``to()`` and ``generate()``.
        tokenizer: Callable tokenizer whose result has a ``to()`` method and
            can be unpacked with ``**``; must also provide ``batch_decode()``.
        generation_config: Forwarded unchanged to ``model.generate``.
        texts: A string or a list of strings to translate.
        device: Device to run on. When ``None`` (the default), ``'cuda'`` is
            used if ``torch.cuda.is_available()`` is true, otherwise ``'cpu'``.

    Returns:
        The result of ``tokenizer.batch_decode`` with special tokens skipped.
    """
    if device is None:
        # The device used to be hard-coded to 'cuda', which fails on machines
        # without a GPU; fall back to CPU in that case.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    inputs = tokenizer(texts, return_tensors="pt", padding=True).to(device)

    # Side effect kept from the original: the caller's model is switched to
    # eval mode and moved to ``device``.
    model.eval().to(device)
    with torch.inference_mode():
        output = model.generate(**inputs, generation_config=generation_config)
        preds = tokenizer.batch_decode(output, skip_special_tokens=True)

    return preds

def sanity_check(model, tokenizer, generation_config):
    """Translate a fixed list of English probe sentences in one batch.

    The sentences are handed to ``translate`` together with the given model,
    tokenizer and generation config.

    Returns:
        A list of ``[source, translation]`` two-element lists, in the same
        order as the probe sentences.
    """
    probe_sentences = [
        "And the Egyptian foreign minister ordered the citizens to stick together.",
        "Hello! It's been a while since we last spoke.",
        "We should stay together hands on hands.",
        "Could you please help me with this task?",
        "Thank you so much for your kindness and support.",
        "Can you pass me the salt, please?",
        "I would rather stay home and read a good book tonight.",
        "I’m sorry for the misunderstanding. It wasn’t my intention.",
        "The sky is so clear and beautiful today.",
        "If I were you, I would reconsider that decision.",
        "He thought for a moment, then replied, 'I believe this is the best choice.'",
        "He felt uncomfortable in their presence that he did not sleep from worry",
        "I want to buy a house in the east side of the town",
        "He said I can do a good job with that",
        "Ammar says hello",
    ]

    translations = translate(model, tokenizer, generation_config, probe_sentences)

    # Pair each source sentence with its translation; zip stops at the shorter
    # of the two sequences.
    pairs = []
    for source, translated in zip(probe_sentences, translations):
        pairs.append([source, translated])
    return pairs

import re
def post_process(text):
    """Return ``text`` with every run of three consecutive dots removed."""
    ellipsis_pattern = r"\.\.\."
    return re.sub(ellipsis_pattern, '', text)

In [9]:
# Decoding settings passed to every `translate` / `sanity_check` call.
generation_config = GenerationConfig(
    # --- Special token ids ---
    bos_token_id=0,
    pad_token_id=1,
    eos_token_id=2,
    decoder_start_token_id=2,
    # The first generated token is forced to this id.
    # NOTE(review): presumably the target-language code of the tokenizer
    # (tgt_lang="arb_Arab") -- confirm against the tokenizer vocabulary.
    forced_bos_token_id=256011,

    # --- Search and sampling ---
    # Beams and sampling are both enabled, so tokens are sampled and repeated
    # runs can produce different translations.
    num_beams=5,
    do_sample=True,
    top_p=0.90,                      # Nucleus sampling threshold
    temperature=0.8,                 # Below 1.0: sharpens the sampling distribution; does not remove randomness
    # Disabled alternatives, kept for reference:
    # early_stopping=True,
    # no_repeat_ngram_size=2,
    # top_k=10,

    # --- Length control ---
    max_length=200,                  # Upper bound on the generated sequence length
    length_penalty=0.9,              # Length exponent used when scoring beams
    # min_length=15,

    # --- Repetition control ---
    repetition_penalty=1.2,          # Values above 1.0 penalize repeated tokens
)

In [10]:
# Collects sanity-check results, keyed by a label for each checkpoint.
samples = dict()

In [11]:
# Checkpoints to compare, in evaluation order:
# (key in `samples`, hub id or local path, extra `from_pretrained` kwargs).
CHECKPOINTS = [
    ("model_1", "AbdulmohsenA/Faseeh", {"revision": 'f3e75973e367ca471f5eaefa77f3f7b53b410856'}),
    ("model_2", "AbdulmohsenA/Faseeh", {"revision": 'f76e3d54614dbb98a4991448faed8f84af73e003'}),
    ("model_3", "artifacts/DPO-v1", {}),
    ("model_4", "artifacts/DPO-v2", {}),
]

# A single tokenizer is shared by all checkpoints.
# NOTE(review): unlike the hub models, the tokenizer is loaded without a
# pinned revision -- confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("AbdulmohsenA/Faseeh", src_lang="eng_Latn", tgt_lang="arb_Arab")

# Replaces four copy-pasted load/evaluate stanzas with one loop.
for sample_key, checkpoint, load_kwargs in CHECKPOINTS:
    # `model` is rebound on every iteration; after the loop it refers to the
    # last checkpoint in CHECKPOINTS, which later cells rely on.
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, **load_kwargs)
    samples[sample_key] = sanity_check(model, tokenizer, generation_config)

config.json:   0%|          | 0.00/919 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/227 [00:00<?, ?B/s]

In [5]:
# model = AutoModelForSeq2SeqLM.from_pretrained("AbdulmohsenA/Faseeh", revision='f76e3d54614dbb98a4991448faed8f84af73e003')

In [12]:
# Print one [source, translation] pair per checkpoint for side-by-side
# comparison; n = -1 selects the last probe sentence.
n = -1
for pairs in samples.values():
    print(pairs[n])

['Ammar says hello', 'يسلم عمار']
['Ammar says hello', 'وقال عمار مرحبا']
['Ammar says hello', 'قال سلام عمار']
['Ammar says hello', 'يسلم عمار']


In [88]:
# Ad-hoc probe of a single sentence with the most recently loaded `model`.
translate(
    model,
    tokenizer,
    generation_config,
    ["I am quite suspicious"],
)

['ولعلي أن أقتل']