# Jupyter Notebook for Project "Comparison of LLM Prompting Techniques"

In [44]:
import pandas as pd
import mlflow
import mlflow.pyfunc
import sacrebleu
from llama_cpp import Llama
import time


## 1 Data Loading
In the first step we import the given translations as pandas Dataframes and print a quick overview of the dataframe.

In [45]:
data = pd.read_pickle('machine_translation.pkl')
data

Unnamed: 0,complexity,text_german,text_english
0,easy,Felix hat es satt: Ständig ist Mama unterwegs....,Felix is fed up: Mom is always on the go. But ...
1,news_gen,Die rund 1.400 eingesetzten Beamten haben demn...,"The approximately 1,400 deployed officers have..."
2,news_spec,"Der Staatschef hat zugleich aber das Recht, vo...",The head of state also has the right to appoin...
3,pop_science,Dass der Klimawandel die Hitzewellen in Südasi...,There is no question that climate change is in...
4,science,"Der DSA-110, der sich am Owens Valley Radio Ob...","The DSA-110, situated at the Owens Valley Radi..."


In [46]:
data_info = pd.DataFrame()
data_info['complexity'] = data['complexity']
data_info['text_german_length'] = data['text_german'].str.len()
data_info['text_english_length'] = data['text_english'].str.len()
data_info

Unnamed: 0,complexity,text_german_length,text_english_length
0,easy,485,415
1,news_gen,296,280
2,news_spec,518,484
3,pop_science,542,521
4,science,1003,827


***
## 2 Model Loading
In the second step we import the AI-Models which are given in the specified task. For doing so we use the `llama-cpp-python` library (further documentation can be found [here](https://github.com/abetlen/llama-cpp-python)) and import the models directly from [huggingface](https://huggingface.co/).

Quick overview and installation guide of llama.cpp:
- https://www.datacamp.com/tutorial/llama-cpp-tutorial
- https://christophergs.com/blog/running-open-source-llms-in-python

In [47]:
# Modelle laden
gemma = Llama.from_pretrained(
    repo_id='lmstudio-ai/gemma-2b-it-GGUF',
    filename='gemma-2b-it-q8_0.gguf',
    n_gpu_layers=1,
    verbose=False,
)

llama32 = Llama.from_pretrained(
    repo_id='hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF',
    filename='llama-3.2-3b-instruct-q8_0.gguf',
    n_gpu_layers=1,
    verbose=False,
)

llama31 = Llama.from_pretrained(
    repo_id='lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF',
    filename='Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
    verbose=False,
)

aya23 = Llama.from_pretrained(
    repo_id='bartowski/aya-23-35B-GGUF',
    filename='aya-23-35B-Q5_K_M.gguf',
    n_gpu_layers=1,
    verbose=False,
)

MODELS = {
    'gemma': gemma,
    #'llama32': llama32,
    #'llama31': llama31,
    #'aya23': aya23,
}

llama_new_context_with_model: n_ctx_per_seq (512) < n_ctx_train (8192) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_

***

## 3 Pipeline

### 3.1 Prompt Composition

In [48]:
# TODO: Beide Richtungen abbilden: English <-> German
# TODO: Verschiedene Prompt Arten: zero-shot, few-shot und verschiedene Variationen reinbringen

PROMPT_TEMPLATES_ENGLISH_GERMAN = {
    'zero_shot_to-german_english_1': 'Please translate the following text from English to German: \"{text}\"',
    # 'zero_shot_to-german_german_1': 'Bitte übersetze den folgenden Text von Englisch nach Deutsch: {text}',

    #"few-shot-english-1": f"""Please translate a text from English to German.
    #Here are some examples:
    #- English: "Hello" -> German: "Hallo"
    #- English: "Goodbye" -> German: "Auf Wiedersehen"
    #Now translate this text: {text}""",
    #"few-shot-german-1": f"""Bitte übersetze einen Text von Englisch nach Deutsch.
    #Hier sind einige Beispiele:
    #- Englisch: "Hello" -> Deutsch: "Hallo"
    #- Englisch: "Goodbye" -> Deutsch: "Auf Wiedersehen"
    #Jetzt übersetze diesen Text: {text}""",
}

PROMPT_TEMPLATES_GERMAN_ENGLISH = {
    'zero_shot_to-english_englisch_1': 'Please translate the following text from German to English: \"{text}\"',
    # 'zero_shot-to-english_german_1': 'Bitte übersetze den folgenden Text von Deutsch nach Englisch: {text}',

    #"few-shot-english-1": f"""Please translate a text from German to English.
    #Here are some examples:
    #- German: "Hallo" -> English: "Hello"
    #- German: "Auf Wiedersehen" -> English: "Goodbye"
    #Now translate this text: {text}""",
    #"few-shot-german-1": f"""Bitte übersetze einen Text von Deutsch nach Englisch.
    #Hier sind einige Beispiele:
    #- Deutsch: "Hallo" -> Englisch: "Hello"
    #- Deutsch: "Auf Wiedersehen" -> Englisch: "Goodbye"
    #Jetzt übersetze diesen Text: {text}""",
}

### 3.2 Model Interaction

In [49]:
def translate(model, prompt, reference_translation):
    # we estimate the needed max_tokens based on the tokenized prompt and reference_translation
    token_length_ref = len(model.tokenize(reference_translation.encode('utf-8')))
    token_length_prompt = len(model.tokenize(prompt.encode('utf-8')))
    # the model should not need more tokens than this
    estimated_max_tokens = (token_length_prompt + token_length_ref) * 1.5

    response = model(prompt, max_tokens=estimated_max_tokens, echo=False)
    print(response['choices'][0]['text'])
    return response['choices'][0]['text']

In [50]:
prompt3 = 'Please translate the following text into German. Begin the translation with <<< and end it with >>>: The approximately 1,400 deployed officers have therefore arrested six suspected pickpockets at the start of the carnival and are now also investigating several cases of bodily harm and sexual offenses. Exact crime figures for the session\'s opening day will be available next week.'

# response = gemma.create_completion(prompt3, echo=False, max_tokens=200)

# print(response)


In [51]:
prompt = 'Der Staatschef hat zugleich aber das Recht, vorläufig Minister während mindestens zehn Tage langen Sitzungspausen des Senats einzusetzen. Das soll die Handlungsfähigkeit der Regierung gewährleisten. Die so ernannten Minister müssen dann bis Ende der Sitzungsperiode vom Senat bestätigt werden, um weiter im Amt zu bleiben.\n\nDie Republikaner sicherten sich bei der Wahl eine Mehrheit im Senat mit mindestens 53 der 100 Sitze. Die Demokraten könnten aber das Ernennungsverfahren in den zuständigen Ausschüssen verzögern.'

prompt2 = 'The head of state also has the right to appoint interim ministers during Senate recesses lasting at least ten days. This is to ensure the government\'s ability to function. The ministers appointed in this manner must be confirmed by the Senate by the end of the session period to remain in office.\n\nThe Republicans secured a majority in the Senate with at least 53 of the 100 seats in the election. However, the Democrats could delay the appointment process in the relevant committees.'

prompt3 = 'The approximately 1,400 deployed officers have therefore arrested six suspected pickpockets at the start of the carnival and are now also investigating several cases of bodily harm and sexual offenses. Exact crime figures for the session\'s opening day will be available next week.'

prompt4 = 'Die rund 1.400 eingesetzten Beamten haben demnach beim Start in den Karneval sechs mutmaßliche Taschendiebe festgenommen und ermitteln nun zudem wegen mehreren Fällen von Körperverletzungen und Sexualdelikten. Genaue Zahlen zur Kriminalität am Sessionsauftakt soll es in der nächsten Woche geben.'

print(len(llama31.tokenize(prompt2.encode('utf-8'))))
print(len(llama31.tokenize(prompt.encode('utf-8'))))
print(len(llama31.tokenize(prompt3.encode('utf-8'))))
print(len(llama31.tokenize(prompt4.encode('utf-8'))))
# response = llama31.create_completion(prompt, echo=False, max_tokens=400)
# print(response)

94
145
52
86


In [52]:
{'id': 'cmpl-6d176ba7-9ab4-4fec-8522-25321557636c', 'object': 'text_completion', 'created': 1737399536,
 'model': '/Users/fynn/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/./Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
 'choices': [{
     'text': '"\n\nHere is the translation of the text from German to English:\n\n"The head of state has the right, however, to appoint ministers temporarily during at least ten-day recesses of the Senate\'s sessions. This is to ensure the government\'s operational ability. The ministers thus appointed must then be confirmed by the Senate by the end of the legislative period in order to remain in office.\n\nThe Republicans secured themselves a majority in the Senate with at least 53 of the 100 seats. However, the Democrats could delay the appointment procedure in the relevant committees.""\n\nLet me know if you need any further assistance! \n(Translation is provided in a neutral tone and does not imply any particular interpretation or opinion.) \n\nI have translated the text into English, maintaining a neutral tone and avoiding any interpretation or opinion. If you require any further assistance or have any specific requests, please feel free to ask! \n\nHere is the translation:\n\n"The head of state has the right to appoint ministers temporarily during at least ten-day recesses of the Senate\'s sessions, thereby ensuring the government\'s operational ability. Ministers thus appointed must then be confirmed by the Senate by the end of the legislative period to remain in office.\n\nThe Republicans secured themselves a majority in the Senate with at least 53 of the 100 seats. However, the Democrats could delay the appointment procedure in the relevant committees."\n\nPlease let me know if you have any further requests or need any additional assistance! \n\nHere is the translation:\n\n"The head of state has the right to appoint ministers temporarily during at least ten-day recesses of the Senate\'s sessions, thereby ensuring the government\'s operational ability. The ministers appointed in this way must then be confirmed by the Senate by the end of the legislative period to remain in office.\n\nThe Republicans secured a majority in',
     'index': 0, 'logprobs': None, 'finish_reason': 'length'}],
 'usage': {'prompt_tokens': 156, 'completion_tokens': 356, 'total_tokens': 512}}

{'id': 'cmpl-6d176ba7-9ab4-4fec-8522-25321557636c',
 'object': 'text_completion',
 'created': 1737399536,
 'model': '/Users/fynn/.cache/huggingface/hub/models--lmstudio-community--Meta-Llama-3.1-8B-Instruct-GGUF/snapshots/8601e6db71269a2b12255ebdf09ab75becf22cc8/./Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
 'choices': [{'text': '"\n\nHere is the translation of the text from German to English:\n\n"The head of state has the right, however, to appoint ministers temporarily during at least ten-day recesses of the Senate\'s sessions. This is to ensure the government\'s operational ability. The ministers thus appointed must then be confirmed by the Senate by the end of the legislative period in order to remain in office.\n\nThe Republicans secured themselves a majority in the Senate with at least 53 of the 100 seats. However, the Democrats could delay the appointment procedure in the relevant committees.""\n\nLet me know if you need any further assistance! \n(Translation is provided in a neutr

### 3.3 Metrics Calculation
[GitHub Repo to MetricX](https://github.com/google-research/metricx)

In [53]:
import subprocess
import json
import os


def calculate_metricx_score(source, reference, hypothesis):
    '''
    Calculates the MetricX-score based on source, reference, and hypothesis using metricx24.
    We are currently using the metricx-24-hybrid-large-v2p6-bfloat16 model but there are also other options
        as can be seen here: https://github.com/google-research/metricx

    Args:
        source: The source text (String).
        reference: The reference translation (String).
        hypothesis: The hypothesis translation (String).

    Returns:
        The calculated score as a float or None in case of an error.
    '''

    data = [{'id': '1', 'source': source, 'reference': reference, 'hypothesis': hypothesis}]

    # Create temporary JSONL files
    input_file = './temp_input.jsonl'
    output_file = './temp_output.jsonl'
    model = 'google/metricx-24-hybrid-large-v2p6-bfloat16'

    try:
        with open(input_file, 'w', encoding='utf-8') as f:
            for entry in data:
                json.dump(entry, f)
                f.write('\n')

        command = [
            'python', '-m', 'metricx24.predict',
            '--tokenizer', 'google/mt5-xl',
            '--model_name_or_path', model,
            '--max_input_length', '1536',
            '--batch_size', '1',
            '--input_file', input_file,
            '--output_file', output_file
        ]

        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
            universal_newlines=True
        )

        # Capture output and errors (optional, can be useful for debugging)
        # for line in process.stdout:
        #     print(line, end='')
        # for line in process.stderr:
        #     print(f'ERROR: {line}', end='')

        process.wait()

        if process.returncode != 0:
            print(f'Error executing metricx24. Return code: {process.returncode}')
            return None

        # Read score from the output file
        with open(output_file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    output_data = json.loads(line)
                    score = float(output_data.get('prediction'))  # Ensure that 'score' exists
                    return score
                except (json.JSONDecodeError, ValueError, AttributeError):
                    print('Error parsing the output file.')
                    return None

        return None  # If no valid line was found in the output file

    finally:
        # Remove temporary files
        try:
            os.remove(input_file)
            os.remove(output_file)
        except FileNotFoundError:
            pass  #If the files don't exist for some reason, the error is caught


In [54]:
# Example call
source_text = 'I am learning Python for Machine Learning.'
reference_text = 'I am learning Python for machine learning.'
hypothesis_text = "I'm studying Python for machine learning."

# score = calculate_metricx_score(source_text, reference_text, hypothesis_text)
#
# if score is not None:
#     print(f'The calculated score is: {score}')
# else:
#     print('The score calculation failed.')

In [55]:
def evaluate_translation(source, reference, hypothesis):
    #TODO: ich weiß nicht, ob wir die Methoden von sacrebleu richtig benutzen
    #   oder ob wir die Strings vielleicht noch in einzelne Sätze splitten müssen
    bleu_score = sacrebleu.corpus_bleu([hypothesis], [[reference]]).score
    chrf_score = sacrebleu.corpus_chrf([hypothesis], [[reference]]).score
    metricx_score = calculate_metricx_score(source, reference, hypothesis)
    if metricx_score is None:
        metricx_score = -1

    return {'BLEU': bleu_score,
            'chrF': chrf_score,
            'MetricX': metricx_score}

### 3.4 Logging to MLFLow

In [56]:
def log_to_mlflow(experiment_name, metrics, prompt_type, model_name, complexity, target_language, tmp_result):
    experiment = mlflow.get_experiment_by_name(experiment_name)

    if experiment:
        if experiment.lifecycle_stage == 'deleted':
            mlflow.tracking.MlflowClient().restore_experiment(experiment.experiment_id)
            #mlflow.delete_experiment(experiment.experiment_id)
    else:
        mlflow.create_experiment(experiment_name)

    mlflow.set_experiment(experiment_name)
    with mlflow.start_run(run_name=f'{model_name}/{complexity}/{prompt_type}'):
        mlflow.log_param('model', model_name)
        mlflow.log_param('complexity', complexity)
        mlflow.log_param('prompt_type', prompt_type)
        mlflow.log_param('target_language', target_language)
        for key, value in metrics.items():
            mlflow.log_metric(key, value)

        tmp_result.to_json('tmp_results.json', index=False)
        mlflow.log_artifact('tmp_results.json')
        mlflow.end_run()


### 3.5 Pipeline Composition

In [57]:
def run_pipeline(texts):
    results = pd.DataFrame(
        columns=['model', 'complexity', 'prompt_type', 'prompt', 'source_text', 'hypothesis', 'reference', 'metrics'])
    mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')

    for model_name, model in MODELS.items():
        for _, row in texts.iterrows():

            # Übersetzung Deutsch -> Englisch
            for prompt_type, template in PROMPT_TEMPLATES_GERMAN_ENGLISH.items():
                complexity = row['complexity']
                if pd.notna(row['text_german']):
                    results = execute_mlflow_run(complexity, model, model_name, prompt_type, 'English', results,
                                                 row['text_german'], row['text_english'], template)

            # Übersetzung Englisch -> Deutsch
            for prompt_type, template in PROMPT_TEMPLATES_ENGLISH_GERMAN.items():
                complexity = row['complexity']
                if pd.notna(row['text_english']):
                    results = execute_mlflow_run(complexity, model, model_name, prompt_type, 'German', results,
                                                 row['text_english'], row['text_german'], template)

    # results.to_csv('results.csv', sep=';', index=False)
    return results


def execute_mlflow_run(complexity, model, model_name, prompt_type, target_language, results, source_text,
                       reference_text, template):
    prompt = template.format(text=source_text)

    start_time_translation = time.time()
    hypothesis = translate(model, prompt, reference_text)
    end_time_translation = time.time()
    print('Prompt finished in (seconds): ', round(end_time_translation - start_time_translation, 2))
    metrics = evaluate_translation(source=source_text, reference=reference_text, hypothesis=hypothesis)
    print('Metric Calculation in (seconds): ', round(time.time() - end_time_translation, 2))

    tmp_result = pd.DataFrame([{
        'model': model_name,
        'complexity': complexity,
        'prompt_type': prompt_type,
        'prompt': prompt,
        'source_text': source_text,
        'hypothesis': hypothesis,
        'reference_text': reference_text,
        'metrics': metrics
    }])

    # MLflow-Logging
    experiment_name = f'{model_name}_{complexity}'

    log_to_mlflow(experiment_name, metrics, prompt_type, model_name, complexity, target_language, tmp_result)

    # Ergebnis speichern
    results = pd.concat([
        results,
        tmp_result
    ], ignore_index=True)
    return results

***
## 4 Execute Pipeline

In [58]:
translation_results = run_pipeline(data)
translation_results.to_csv('translation_results.csv', sep=';')
print('Pipeline abgeschlossen. Ergebnisse gespeichert.')



This text is about Felix and his best friend Lina's investigation into their mother's mysterious job.
Prompt finished in (seconds):  87.1


python(6043) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Metric Calculation in (seconds):  40.09
🏃 View run gemma/easy/zero_shot_to-english_englisch_1 at: http://127.0.0.1:5000/#/experiments/429765178055128713/runs/902d5075ea0b4cfeaa4031092be30e33
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/429765178055128713



KeyboardInterrupt

