# Jupyter Notebook for Project "Comparison of LLM Prompting Techniques"

In [17]:
import pandas as pd
import numpy as np
import pickle
import mlflow
import mlflow.pyfunc
import sacrebleu
from sacrebleu import corpus_bleu
import hashlib
from llama_cpp import Llama
import time


## Data Loading
In the first step we import the given translations as pandas Dataframes and print a quick overview of the dataframe.

In [18]:
data = pd.read_pickle('machine_translation.pkl')
data

Unnamed: 0,complexity,text_german,text_english
0,easy,Felix hat es satt: Ständig ist Mama unterwegs....,Felix is fed up: Mom is always on the go. But ...
1,news_gen,Die rund 1.400 eingesetzten Beamten haben demn...,"The approximately 1,400 deployed officers have..."
2,news_spec,"Der Staatschef hat zugleich aber das Recht, vo...",The head of state also has the right to appoin...
3,pop_science,Dass der Klimawandel die Hitzewellen in Südasi...,There is no question that climate change is in...
4,science,"Der DSA-110, der sich am Owens Valley Radio Ob...","The DSA-110, situated at the Owens Valley Radi..."


In [19]:
data_info = pd.DataFrame()
data_info['complexity'] = data['complexity']
data_info['text_german_length'] = data['text_german'].str.len()
data_info['text_english_length'] = data['text_english'].str.len()
data_info

Unnamed: 0,complexity,text_german_length,text_english_length
0,easy,485,415
1,news_gen,296,280
2,news_spec,518,484
3,pop_science,542,521
4,science,1003,827


## Model Loading
In the second step we import the AI-Models which are given in the specified task. For doing so we use the `llama-cpp-python` library (further documentation can be found [here](https://github.com/abetlen/llama-cpp-python)) and import the models directly from [huggingface](https://huggingface.co/).

Quick overview and installation guide of llama.cpp:
- https://www.datacamp.com/tutorial/llama-cpp-tutorial
- https://christophergs.com/blog/running-open-source-llms-in-python

In [20]:
# Modelle laden
gemma = Llama.from_pretrained(
    repo_id='lmstudio-ai/gemma-2b-it-GGUF',
    filename='gemma-2b-it-q8_0.gguf',
    n_gpu_layers=1,
    verbose=False,
)

llama32 = Llama.from_pretrained(
    repo_id='hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF',
    filename='llama-3.2-3b-instruct-q8_0.gguf',
    n_gpu_layers=1,
    verbose=False,
)

llama31 = Llama.from_pretrained(
    repo_id='lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF',
    filename='Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
    verbose=False,
)

aya23 = Llama.from_pretrained(
    repo_id='bartowski/aya-23-35B-GGUF',
    filename='aya-23-35B-Q5_K_M.gguf',
    n_gpu_layers=1,
    verbose=False,
)

MODELS = {
    'gemma': gemma,
    #'llama32': llama32,
    #'llama31': llama31,
    #'aya23': aya23,
}

llama_new_context_with_model: n_ctx_per_seq (512) < n_ctx_train (8192) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_

## MetricX Setup

[GitHub Repo to MetricX](https://github.com/google-research/metricx)

In [21]:
import subprocess
import json
import os


def calculate_metricx_score(source, reference, hypothesis):
    '''
    Calculates the MetricX-score based on source, reference, and hypothesis using metricx24.
    We are currently using the metricx-24-hybrid-large-v2p6-bfloat16 model but there are also other options
        as can be seen here: https://github.com/google-research/metricx

    Args:
        source: The source text (String).
        reference: The reference translation (String).
        hypothesis: The hypothesis translation (String).

    Returns:
        The calculated score as a float or None in case of an error.
    '''

    data = [{'id': '1', 'source': source, 'reference': reference, 'hypothesis': hypothesis}]

    # Create temporary JSONL files
    input_file = './temp_input.jsonl'
    output_file = './temp_output.jsonl'
    model = 'google/metricx-24-hybrid-large-v2p6-bfloat16'

    try:
        with open(input_file, 'w', encoding='utf-8') as f:
            for entry in data:
                json.dump(entry, f)
                f.write('\n')

        command = [
            'python', '-m', 'metricx24.predict',
            '--tokenizer', 'google/mt5-xl',
            '--model_name_or_path', model,
            '--max_input_length', '1536',
            '--batch_size', '1',
            '--input_file', input_file,
            '--output_file', output_file
        ]

        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            bufsize=1,
            universal_newlines=True
        )

        # Capture output and errors (optional, can be useful for debugging)
        # for line in process.stdout:
        #     print(line, end='')
        # for line in process.stderr:
        #     print(f'ERROR: {line}', end='')

        process.wait()

        if process.returncode != 0:
            print(f'Error executing metricx24. Return code: {process.returncode}')
            return None

        # Read score from the output file
        with open(output_file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    output_data = json.loads(line)
                    score = float(output_data.get('prediction'))  # Ensure that 'score' exists
                    return score
                except (json.JSONDecodeError, ValueError, AttributeError):
                    print('Error parsing the output file.')
                    return None

        return None  # If no valid line was found in the output file

    finally:
        # Remove temporary files
        try:
            os.remove(input_file)
            os.remove(output_file)
        except FileNotFoundError:
            pass  #If the files don't exist for some reason, the error is caught


In [22]:
# Example call
source_text = 'I am learning Python for Machine Learning.'
reference_text = 'I am learning Python for machine learning.'
hypothesis_text = "I'm studying Python for machine learning."

# score = calculate_metricx_score(source_text, reference_text, hypothesis_text)
#
# if score is not None:
#     print(f'The calculated score is: {score}')
# else:
#     print('The score calculation failed.')

In [23]:
## zweite Möglichkeit metricx zu nutzen...
# import sys
# from metricx24 import predict
#
# # Simuliere Kommandozeilenargumente
# sys.argv = [
#     'predict.py',  # Name des Skripts (wird ignoriert)
#     '--tokenizer', 'google/mt5-xl',
#     '--model_name_or_path', 'google/metricx-24-hybrid-large-v2p6-bfloat16',
#     '--max_input_length', '1536',
#     '--batch_size', '1',
#     '--input_file', './input.jsonl',
#     '--output_file', './output.jsonl'
# ]
#
# # Rufe die main-Funktion auf
# predict.main()
#
# print('Vorhersage abgeschlossen. Ergebnisse sind in 'output.jsonl' gespeichert.')

***

## Pipeline

### Prompt Composition

In [24]:
# TODO: Beide Richtungen abbilden: English <-> German
# TODO: Verschiedene Prompt Arten: zero-shot, few-shot und verschiedene Variationen reinbringen

PROMPT_TEMPLATES_ENGLISH_GERMAN = {
    'zero_shot-english': 'Please translate the following text from English to German: {text}',
    #"zero_shot-german": f"Bitte übersetze den folgenden Text von Englisch nach Deutsch: {text}",
    #"few-shot-english-1": f"""Please translate a text from English to German.
    #Here are some examples:
    #- English: "Hello" -> German: "Hallo"
    #- English: "Goodbye" -> German: "Auf Wiedersehen"
    #Now translate this text: {text}""",
    #"few-shot-german-1": f"""Bitte übersetze einen Text von Englisch nach Deutsch.
    #Hier sind einige Beispiele:
    #- Englisch: "Hello" -> Deutsch: "Hallo"
    #- Englisch: "Goodbye" -> Deutsch: "Auf Wiedersehen"
    #Jetzt übersetze diesen Text: {text}""",
}

PROMPT_TEMPLATES_GERMAN_ENGLISH = {
    "zero_shot-english": "Please translate the following text from German to English: {text}",
    #"zero_shot-german": f"Bitte übersetze den folgenden Text von Deutsch nach Englisch: {text}",
    #"few-shot-english-1": f"""Please translate a text from German to English.
    #Here are some examples:
    #- German: "Hallo" -> English: "Hello"
    #- German: "Auf Wiedersehen" -> English: "Goodbye"
    #Now translate this text: {text}""",
    #"few-shot-german-1": f"""Bitte übersetze einen Text von Deutsch nach Englisch.
    #Hier sind einige Beispiele:
    #- Deutsch: "Hallo" -> Englisch: "Hello"
    #- Deutsch: "Auf Wiedersehen" -> Englisch: "Goodbye"
    #Jetzt übersetze diesen Text: {text}""",
}

### Model Interaction

In [25]:
def translate(model, prompt):
    response = model(prompt, max_tokens=512, echo=False, stop=['Q:'])
    return response['choices'][0]['text']

### Evaluation

In [26]:
def evaluate_translation(source, reference, hypothesis):
    #TODO: ich weiß nicht, ob wir die Methoden von sacrebleu richtig benutzen
    #   oder ob wir die Strings vielleicht noch in einzelne Sätze splitten müssen
    bleu_score = corpus_bleu([hypothesis], [[reference]]).score
    chrf_score = sacrebleu.corpus_chrf([hypothesis], [[reference]]).score
    metricx_score = calculate_metricx_score(source, reference, hypothesis)
    return {'BLEU': bleu_score,
            'chrF': chrf_score,
            'MetricX': metricx_score}

In [30]:
def log_to_mlflow(experiment_name, metrics, prompt_type, model_name, complexity, duration_translation_generation,
                  duration_metric_calculation, tmp_result):
    experiment = mlflow.get_experiment_by_name(experiment_name)

    if experiment:
        if experiment.lifecycle_stage == 'deleted':
            mlflow.tracking.MlflowClient().restore_experiment(experiment.experiment_id)
            #mlflow.delete_experiment(experiment.experiment_id)
    else:
        mlflow.create_experiment(experiment_name)

    mlflow.set_experiment(experiment_name)
    with mlflow.start_run(run_name=f'{model_name}/{complexity}/{prompt_type}'):
        mlflow.log_param('model', model_name)
        mlflow.log_param('complexity', complexity)
        mlflow.log_param('prompt_type', prompt_type)
        mlflow.log_param('duration_translation_generation', duration_translation_generation)
        mlflow.log_param('duration_metric_calculation', duration_metric_calculation)
        for key, value in metrics.items():
            mlflow.log_metric(key, value)

        tmp_result.to_json('tmp_results.json', index=False)
        mlflow.log_artifact('tmp_results.json')
        mlflow.end_run()


### Main Pipeline Method

In [28]:
def run_pipeline(texts):
    results = pd.DataFrame(
        columns=['model', 'complexity', 'prompt_type', 'prompt', 'source_text', 'hypothesis', 'reference', 'metrics'])
    mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')

    for model_name, model in MODELS.items():
        for _, row in texts.iterrows():

            # Übersetzung Deutsch -> Englisch
            for prompt_type, template in PROMPT_TEMPLATES_GERMAN_ENGLISH.items():
                complexity = row['complexity']
                if pd.notna(row['text_german']):
                    results = execute_mlflow_run(complexity, model, model_name, prompt_type, results,
                                                 row['text_german'], row['text_english'], template)

            # Übersetzung Englisch -> Deutsch
            for prompt_type, template in PROMPT_TEMPLATES_ENGLISH_GERMAN.items():
                complexity = row['complexity']
                if pd.notna(row['text_english']):
                    results = execute_mlflow_run(complexity, model, model_name, prompt_type, results,
                                                 row['text_english'], row['text_german'], template)

    # results.to_csv('results.csv', index=False)
    return results


def execute_mlflow_run(complexity, model, model_name, prompt_type, results, source_text, reference_text, template):
    prompt = template.format(text=source_text)

    start_time_translation = time.time()
    hypothesis = translate(model, prompt)
    end_time_translation = time.time()
    metrics = evaluate_translation(source=source_text, reference=reference_text, hypothesis=hypothesis)
    end_time_metrics = time.time()

    tmp_result = pd.DataFrame([{
                'model': model_name,
                'complexity': complexity,
                'prompt_type': prompt_type,
                'prompt': prompt,
                'source_text': source_text,
                'hypothesis': hypothesis,
                'reference_text': reference_text,
                'metrics': metrics
            }])

    # MLflow-Logging
    duration_translation_generation = round(end_time_translation - start_time_translation, 2)
    duration_metric_calculation = round(end_time_metrics - end_time_translation, 2)
    experiment_name = f'{model_name}_{complexity}'

    log_to_mlflow(experiment_name, metrics, prompt_type, model_name, complexity, duration_translation_generation,
                  duration_metric_calculation, tmp_result)

    # Ergebnis speichern
    results = pd.concat([
        results,
        tmp_result
    ], ignore_index=True)
    return results

### Execute Pipeline

In [31]:
translation_results = run_pipeline(data)
translation_results.to_csv('translation_results.csv')
print('Pipeline abgeschlossen. Ergebnisse gespeichert.')

python(2073) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


🏃 View run gemma/easy/zero_shot-english at: http://127.0.0.1:5000/#/experiments/429765178055128713/runs/df6f32065ef5461f816b9da1279b1e8a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/429765178055128713


python(2096) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


🏃 View run gemma/easy/zero_shot-english at: http://127.0.0.1:5000/#/experiments/429765178055128713/runs/d61e58d4191141568d93bcba893f105e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/429765178055128713


python(2125) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


🏃 View run gemma/news_gen/zero_shot-english at: http://127.0.0.1:5000/#/experiments/391944840061747289/runs/8228b469f219483496e556555f4a549e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/391944840061747289


python(2140) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


🏃 View run gemma/news_gen/zero_shot-english at: http://127.0.0.1:5000/#/experiments/391944840061747289/runs/f87754210575454994047f493de75082
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/391944840061747289


python(2179) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


🏃 View run gemma/news_spec/zero_shot-english at: http://127.0.0.1:5000/#/experiments/713590614913207437/runs/a5759109c8db475db064d886cd9e3c7b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/713590614913207437


KeyboardInterrupt: 

In [None]:
translation_results

In [None]:
mlflow.end_run()