In [1]:
%pip install openai
%pip install anthropic
%pip install pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Maps each language code used as a key in this notebook to the
# human-readable language name that is inserted into the translation prompt.
lang_codes = dict(
    zgh="Standard Moroccan Tamazight",
    eng="English",
)

# Load FLORES devtest data

In [3]:
# Both sides of the parallel devtest split live in the same directory and
# differ only in the language/script suffix of the file name.
_floresp_devtest_prefix = "../Tamazight-NLP/Text/floresp-v2.0-rc.2/devtest/devtest."
zgh_path = _floresp_devtest_prefix + "zgh_Tfng"
eng_path = _floresp_devtest_prefix + "eng_Latn"

In [4]:
def load_floresp(zgh_path, eng_path):
    """Read the two parallel corpus files and return their lines.

    Args:
        zgh_path: Path to the Tamazight text file, one sentence per line.
        eng_path: Path to the English text file, one sentence per line.

    Returns:
        A ``(zgh_lines, eng_lines)`` tuple. Each element is a list with one
        string per line of the corresponding file, with leading and trailing
        whitespace (including the newline) stripped.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    def _read_stripped_lines(path):
        # Decode explicitly as UTF-8. Without an explicit encoding, open()
        # falls back to the platform's locale encoding, which cannot decode
        # the Tifinagh-script file on systems whose default is not UTF-8.
        with open(path, 'r', encoding='utf-8') as handle:
            return [line.strip() for line in handle]

    # Tuple elements are evaluated left to right, so the Tamazight file is
    # still read before the English one.
    return _read_stripped_lines(zgh_path), _read_stripped_lines(eng_path)

# Load both sides of the corpus and keep only the first 100 lines of each,
# so every translation run below processes 100 sentence pairs.
zgh_lines, eng_lines = (
    lines[:100] for lines in load_floresp(zgh_path, eng_path)
)

# Keyed by the same language codes as lang_codes.
data = dict(eng=eng_lines, zgh=zgh_lines)

In [5]:
# Preview only the first five sentences per language instead of dumping the
# whole corpus into the cell output.
{code: lines[:5] for code, lines in data.items()}

{'eng': ['"We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.',
  'Dr. Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.',
  'Like some other experts, he is skeptical about whether diabetes can be cured, noting that these findings have no relevance to people who already have Type 1 diabetes.',
  'On Monday, Sara Danius, permanent secretary of the Nobel Committee for Literature at the Swedish Academy, publicly announced during a radio program on Sveriges Radio in Sweden the committee, unable to reach Bob Dylan directly about winning the 2016 Nobel Prize in Literature, had abandoned its efforts to reach him.',
  'Danius said, "Right now we are doing nothing. I have called and sent emails to his closest collaborator and received very friendly replies. For now, that is certainly enoug

# Define the GPT and Claude models used for evaluation

In [6]:
from openai import OpenAI
from abc import ABC, abstractmethod
import socket

class LLMInterface(ABC):
    """
    Abstract base class shared by every Large Language Model (LLM) wrapper.

    A concrete wrapper overrides 'get_result', which receives a prompt and
    returns the model's reply as a string. The class attributes below hold the
    generation settings that the wrappers read when they build a request.
    """
    max_tokens = 500
    temperature = 0.7
    stop = "\n" # Stop sequence for the LLM (when to stop generating text)
    name = None # Display name of the model; None on the base class

    @abstractmethod
    def get_result(self, prompt) -> str:
        """Return the model's reply to `prompt`. Subclasses must override this."""
        raise NotImplementedError()

In [7]:
class GPT(LLMInterface):
    """
    GPT is a general class for chat models reached through the OpenAI client, which other specific GPT model classes inherit from.

    NOTE(review): the client is pointed at ``http://<this machine's hostname>.local:1234/v1``
    with the placeholder key "lm-studio", not at api.openai.com. Confirm that the model
    served at that address is the one named by the subclass before attributing results to it.
    """
    def __init__(self):
        # Initialize LLM specifics
        self.client = OpenAI(
            base_url=f"http://{socket.gethostname()}.local:1234/v1",
            api_key="lm-studio"
            )
        # Model identifier sent with every request; concrete subclasses set it.
        self.model = None

    def get_result(self, prompt):
        """
        Send `prompt` as a single user message and return the reply text.

        Returns the first choice's message content with surrounding whitespace
        stripped, or an empty string when the reply carries no text content.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=self.create_messages_from_prompt(prompt),
            max_tokens=self.max_tokens,
            temperature=self.temperature,
            stop=self.stop
            )
        # `content` is nullable in the Chat Completions response schema;
        # calling .strip() on None would raise AttributeError.
        content = response.choices[0].message.content
        return (content or "").strip()

    def create_messages_from_prompt(self, prompt):
        """Wrap `prompt` in the chat format: a list with one user message."""
        messages = [
            # {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": prompt
                }
            ]
        return messages

In [8]:
class GPT_4o(GPT):
    """
    Wrapper for the GPT-4o model. Inherits the client setup and request logic from GPT.
    """
    # Human-readable model name.
    name = "GPT-4o"

    def __init__(self):
        super().__init__()
        # Assigned after super().__init__() because GPT.__init__ sets
        # self.model to None.
        self.model = "gpt-4o"

In [9]:
import anthropic


class Claude(LLMInterface):
    """
    Claude is a general class for models from Anthropic, which other specific Claude model classes inherit from.
    """
    def __init__(self):
        # Initialize LLM specifics
        self.client = anthropic.Anthropic()
        # Model identifier sent with every request; concrete subclasses set it.
        self.model = None

    def get_result(self, prompt):
        """
        Send `prompt` as a single user message and return the reply text.

        Returns the text of the first content block, cut at the stop sequence
        and stripped of surrounding whitespace, or an empty string when the
        response contains no content block.
        """
        request = {
            "model": self.model,
            # "system": "You are a helpful assistant.",
            "messages": self.create_messages_from_prompt(prompt),
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
        }

        # The Anthropic Messages API rejects stop sequences that consist only
        # of whitespace (the inherited default is "\n"), so only a stop with
        # visible characters is sent to the server. A whitespace-only stop is
        # applied to the returned text below instead.
        stop = self.stop
        send_stop = bool(stop) and bool(stop.strip())
        if send_stop:
            request["stop_sequences"] = [stop]

        response = self.client.messages.create(**request)

        # An empty content list would make content[0] raise IndexError.
        if not response.content:
            return ""

        text = response.content[0].text
        if stop and not send_stop:
            # Client-side equivalent of the stop sequence: keep only what
            # precedes its first occurrence.
            text = text.split(stop, 1)[0]
        return text.strip()

    def create_messages_from_prompt(self, prompt):
        """Wrap `prompt` in the messages format: one user message holding one text block."""
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]
        return messages

In [10]:
class Sonnet_3_5(Claude):
    """
    Wrapper for the Claude 3.5 Sonnet model. Inherits the client setup and request logic from Claude.
    """
    # Human-readable model name.
    name = "Sonnet 3.5"

    def __init__(self):
        super().__init__()
        # Assigned after super().__init__() because Claude.__init__ sets
        # self.model to None.
        self.model = "claude-3-5-sonnet-20240620"

# Get the translations from the models

In [11]:
from tqdm import tqdm
import pandas as pd

# Prompt sent for every sentence. {source}/{target} are language names and
# {source_text} is the sentence to translate; the prompt ends right after the
# target-language label.
PROMPT_TEMPLATE = "Translate the following {source} text to {target}:\n{source}: {source_text}\n{target}: "

def get_translations(source_code, target_code, data, model, save_to_tsv=True):
    """
    Translate every sentence of data[source_code] with `model`.

    Args:
        source_code: Key of the source language in both `lang_codes` and `data`.
        target_code: Key of the target language in both `lang_codes` and `data`.
        data: Dict mapping language codes to equally long lists of sentences.
        model: Object exposing `get_result(prompt)` and a `model` attribute
            (the attribute is used in the output file name).
        save_to_tsv: When True, also write the results to
            "<model.model>_<source_code>_to_<target_code>.tsv" in the current
            working directory.

    Returns:
        When `save_to_tsv` is True, a DataFrame with the columns "Source",
        "Target" and "Translation". Otherwise a shallow copy of `data` with an
        extra "Translation" list; note that this is a plain dict, not a DataFrame.

    Raises:
        KeyError: If a language code is missing from `lang_codes` or `data`.
    """
    source_name = lang_codes[source_code]
    target_name = lang_codes[target_code]

    # Shallow copy, so adding the new key leaves the caller's dict untouched.
    translations = data.copy()
    translations['Translation'] = [
        model.get_result(
            PROMPT_TEMPLATE.format(source=source_name, target=target_name, source_text=sentence)
        )
        for sentence in tqdm(data[source_code])
    ]

    if not save_to_tsv:
        return translations

    # Rename the language-code columns to generic names and keep exactly the
    # three columns of interest, in a fixed order.
    translations = (
        pd.DataFrame(translations)
        .rename(columns={source_code: "Source", target_code: "Target"})
        [["Source", "Target", "Translation"]]
    )
    translations.to_csv(
        f"{model.model}_{source_code}_to_{target_code}.tsv", sep="\t", index=False
    )
    return translations

In [12]:
# Initialize the GPT-4o model
gpt_4o = GPT_4o()

# English -> Tamazight: one request per sentence; the result is also written
# to a TSV file because save_to_tsv keeps its default of True.
eng_to_zgh = get_translations(
    source_code="eng",
    target_code="zgh",
    data=data,
    model=gpt_4o,
)
eng_to_zgh

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [08:31<00:00,  5.12s/it]


Unnamed: 0,Source,Target,Translation
0,"""We now have 4-month-old mice that are non-dia...",“ⵏⵍⴰ ⴷⵖⵉ ⵉⵖⵔⴷⴰⵢⵏ ⵏ 4 ⵏ ⵉⵢⵢⵉⵔⵏ ⴷ ⵓⵔ ⵉⵍⵍⵉ ⵡⴰⵟⵟⴰⵏ...,"Ḥna ɣayn ṭṭalbawt n iẓḍaḍan tazzut imi 4 zizn,..."
1,"Dr. Ehud Ur, professor of medicine at Dalhousi...","ⴰⴷⵓⴽⵜⵓⵔ ⵉⵀⵓⵔ ⵓⵔ, ⴰⵙⵍⵎⴰⴷ ⵏ ⵜⴰⵙⵏⵓⵊⵢⴰ ⴳ ⵜⵙⴷⴰⵡⵉⵜ ⵏ...","Ḥasan Uṛ (Dr. Ehud Ur), amẓwar Akkub n tigawin..."
2,"Like some other experts, he is skeptical about...","ⴰⵎⵎ ⴽⵔ ⵏ ⵡⵉⵢⵢⴰⴹ ⵉⵎⵓⵣⴰⵢⵏ, ⵉⴳⴰ ⴰⵎⵛⴽⴰⴽ ⴳ ⵓⵢⵏⵏⴰ ⵉⵇ...",Aqbel n ilm d-s d-yat aḥdar amek sɛid ɣ-ttəmtt...
3,"On Monday, Sara Danius, permanent secretary of...","ⴳ ⵓⵢⵏⴰⵙ, ⵜⵏⵏⴰ ⵚⴰⵕⴰ ⴷⴰⵏⵢⵓⵙ, ⵜⴰⵏⴼⵍⵓⵙⵜ ⵜⴰⵎⵖⵍⴰⵍⵜ ⵏ...",ⵙⴰ ⵜⴰⵏⴱⴰ ⴼⴽⵔ ⵍⵎⵓⵇⵉⵟⴳ ⴹⵣⴸ 2016.
4,"Danius said, ""Right now we are doing nothing. ...","ⵜⵏⵏⴰ ⴷⴰⵏⵢⵓⵙ, “ⴷⵖⵉ ⵓⵔ ⴷⴰ ⵏⵙⵙⴽⴰⵔ ⴰⵎⵢⴰ. ⵖⵔⵉⵖ ⴰⵙ ⴰ...","Danius am ḍa, ""Akkʷ n iyyadn as tẓḍaḍt n iman...."
...,...,...,...
95,"According to Japan's nuclear agency, radioacti...","ⵉⵎⴽⵉⵏⵏⴰ ⵜⵏⵏⴰ ⵜⵙⵏⵓⵔⴰⵢⵜ ⵜⴰⵢⴰⴱⴰⵏⵉⵢⵜ ⵏ ⵓⵏⵓⴽⵍⵉⵢⵉⵔ, ...","ⵜⴰ ⵎⴹⴱⴳ ⴼⵙⴻⴷ ⵔⵉⵏⵏ ⵏ ⵓⵔⴳⴰ ⵏ ⵍⵝⵣⴰⴹ, ⵜⴰ ⵎⴺⵄⴱ ⵙⵕⵉⴼ..."
96,Authorities speculate that this indicates that...,ⵖⴰⵍⵏⵜ ⵜⵏⴱⴰⴹⵉⵏ ⵉⵙ ⴷⴰ ⵉⵎⵎⴰⵍ ⵓⵢⴰ ⵖⵉⵢⵏ ⵉⵙⵓⵎⴰⵢⵏ ⵍⵍⵉ...,Ḥḍar n uṛubaʕ yella d aɣ wwal n tibḥit inaɣen ...
97,Dr. Tony Moll discovered the Extremely Drug Re...,ⵢⵓⴼⴰ ⵓⵎⵙⴳⵏⴰⴼ ⵜⵓⵏⵉ ⵎⵓⵍ ⵜⴰⵔⵓⵜ ⵍⵍⵉ ⴱⴰⵀⵔⴰ ⵉⵜⵜⵣⴱⴰⵢⵏ...,"Iḍawn n Tuni Mol, yadɣu ɣemʷaṛ uṭbiṛiẓi tiddrt..."
98,"In an interview, he said the new variant was ""...","ⴳ ⵢⴰⵏ ⵓⵎⵙⴰⵡⴰⵍ, ⵉⵏⵏⴰ ⵉⵙ ⵜⴳⴰ ⵜⵎⵣⴰⵔⴰⵢⵜ ⵜⴰⵎⴰⵢⵏⵓⵜ ""...","Iḍurar nniḍen, ayyi akal tawriqt ad-ukkum itte..."


In [13]:
# Tamazight -> English, with the same sentences and the same gpt_4o wrapper.
zgh_to_eng = get_translations(
    source_code="zgh",
    target_code="eng",
    data=data,
    model=gpt_4o,
)
zgh_to_eng

100%|██████████| 100/100 [03:33<00:00,  2.14s/it]


Unnamed: 0,Source,Target,Translation
0,“ⵏⵍⴰ ⴷⵖⵉ ⵉⵖⵔⴷⴰⵢⵏ ⵏ 4 ⵏ ⵉⵢⵢⵉⵔⵏ ⴷ ⵓⵔ ⵉⵍⵍⵉ ⵡⴰⵟⵟⴰⵏ...,"""We now have 4-month-old mice that are non-dia...",I don't see any text to translate. Please prov...
1,"ⴰⴷⵓⴽⵜⵓⵔ ⵉⵀⵓⵔ ⵓⵔ, ⴰⵙⵍⵎⴰⴷ ⵏ ⵜⴰⵙⵏⵓⵊⵢⴰ ⴳ ⵜⵙⴷⴰⵡⵉⵜ ⵏ...","Dr. Ehud Ur, professor of medicine at Dalhousi...",The translation of the Standard Moroccan Tamaz...
2,"ⴰⵎⵎ ⴽⵔ ⵏ ⵡⵉⵢⵢⴰⴹ ⵉⵎⵓⵣⴰⵢⵏ, ⵉⴳⴰ ⴰⵎⵛⴽⴰⴽ ⴳ ⵓⵢⵏⵏⴰ ⵉⵇ...","Like some other experts, he is skeptical about...",I do not have the capability to translate the ...
3,"ⴳ ⵓⵢⵏⴰⵙ, ⵜⵏⵏⴰ ⵚⴰⵕⴰ ⴷⴰⵏⵢⵓⵙ, ⵜⴰⵏⴼⵍⵓⵙⵜ ⵜⴰⵎⵖⵍⴰⵍⵜ ⵏ...","On Monday, Sara Danius, permanent secretary of...",I do not have a translation for the given text...
4,"ⵜⵏⵏⴰ ⴷⴰⵏⵢⵓⵙ, “ⴷⵖⵉ ⵓⵔ ⴷⴰ ⵏⵙⵙⴽⴰⵔ ⴰⵎⵢⴰ. ⵖⵔⵉⵖ ⴰⵙ ⴰ...","Danius said, ""Right now we are doing nothing. ...",Here is the translation of the Standard Morocc...
...,...,...,...
95,"ⵉⵎⴽⵉⵏⵏⴰ ⵜⵏⵏⴰ ⵜⵙⵏⵓⵔⴰⵢⵜ ⵜⴰⵢⴰⴱⴰⵏⵉⵢⵜ ⵏ ⵓⵏⵓⴽⵍⵉⵢⵉⵔ, ...","According to Japan's nuclear agency, radioacti...",I'm happy to help you with the translation. Ho...
96,ⵖⴰⵍⵏⵜ ⵜⵏⴱⴰⴹⵉⵏ ⵉⵙ ⴷⴰ ⵉⵎⵎⴰⵍ ⵓⵢⴰ ⵖⵉⵢⵏ ⵉⵙⵓⵎⴰⵢⵏ ⵍⵍⵉ...,Authorities speculate that this indicates that...,I'm happy to help you translate the text. Howe...
97,ⵢⵓⴼⴰ ⵓⵎⵙⴳⵏⴰⴼ ⵜⵓⵏⵉ ⵎⵓⵍ ⵜⴰⵔⵓⵜ ⵍⵍⵉ ⴱⴰⵀⵔⴰ ⵉⵜⵜⵣⴱⴰⵢⵏ...,Dr. Tony Moll discovered the Extremely Drug Re...,"I'm happy to help translate the text. However,..."
98,"ⴳ ⵢⴰⵏ ⵓⵎⵙⴰⵡⴰⵍ, ⵉⵏⵏⴰ ⵉⵙ ⵜⴳⴰ ⵜⵎⵣⴰⵔⴰⵢⵜ ⵜⴰⵎⴰⵢⵏⵓⵜ ""...","In an interview, he said the new variant was ""...",I'm happy to help translate the text for you. ...
