# Jupyter Notebook for Project "Comparison of LLM Prompting Techniques"

In [None]:
import pandas as pd
import numpy as np
import pickle
import mlflow
import mlflow.pyfunc
import sacrebleu
from sacrebleu import corpus_bleu
import hashlib
from llama_cpp import Llama
from sympy import false

## Data Loading

In [None]:
# Load the translation samples (German/English text pairs with a complexity
# label) from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
data = pd.read_pickle("machine_translation.pkl")
# Bare expression so the notebook renders the loaded table as the cell output.
data

In [None]:
# Overview table: the complexity label of every sample next to the character
# length of its German and English text.
data_info = pd.DataFrame(
    {
        'complexity': data['complexity'],
        'text_german_length': data['text_german'].str.len(),
        'text_english_length': data['text_english'].str.len(),
    }
)
# Bare expression so the notebook renders the overview as the cell output.
data_info

## Model Loading

In [None]:
# Load the models.
# NOTE(review): all four checkpoints are loaded here although only "gemma" is
# currently enabled in MODELS below - confirm this is intended.
gemma = Llama.from_pretrained(
    repo_id="lmstudio-ai/gemma-2b-it-GGUF",
    filename="gemma-2b-it-q8_0.gguf",
    verbose=False,
    n_gpu_layers=1,
)

llama32 = Llama.from_pretrained(
    repo_id="hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
    filename="llama-3.2-3b-instruct-q8_0.gguf",
    verbose=False,
    n_gpu_layers=1,
)

# NOTE(review): the only model loaded without an explicit n_gpu_layers value -
# confirm whether that is deliberate.
llama31 = Llama.from_pretrained(
    repo_id="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
    verbose=False,
)

aya23 = Llama.from_pretrained(
    repo_id="bartowski/aya-23-35B-GGUF",
    filename="aya-23-35B-Q5_K_M.gguf",
    verbose=False,
    n_gpu_layers=1,
)

# Models that take part in a pipeline run, keyed by the name under which they
# are reported. Uncomment an entry to include that model.
MODELS = {
    "gemma": gemma,
    #"llama32": llama32,
    #"llama31": llama31,
    #"aya23": aya23,
}

## Metric Setup

[GitHub repository of MetricX](https://github.com/google-research/metricx)

In [None]:
import subprocess
import json
import os

def calculate_metricx_score(source, reference, hypothesis):
    """
    Calculates the MetricX score for a single translation using metricx24.

    The triple is written to a one-line JSONL file, `metricx24.predict` is run
    as a subprocess on it, and the "prediction" field of the first non-empty
    line of the output file is returned.
    We are currently using the metricx-24-hybrid-large-v2p6-bfloat16 model but
    there are also other options as can be seen here:
    https://github.com/google-research/metricx

    Args:
        source: The source text (String).
        reference: The reference translation (String).
        hypothesis: The hypothesis translation (String).

    Returns:
        The calculated score as a float, or None in case of an error (the
        subprocess could not be started or exited with a non-zero code, the
        output file is missing, or its content could not be parsed).
    """
    # Imported locally so the function brings the extra modules it needs
    # into scope itself.
    import sys
    import tempfile

    model = "google/metricx-24-hybrid-large-v2p6-bfloat16"
    entry = {"id": "1", "source": source, "reference": reference, "hypothesis": hypothesis}

    # A private temporary directory prevents name clashes between concurrent
    # calls and is removed (with both files) when the block is left, no matter
    # which of the return paths below is taken.
    with tempfile.TemporaryDirectory() as work_dir:
        input_file = os.path.join(work_dir, "temp_input.jsonl")
        output_file = os.path.join(work_dir, "temp_output.jsonl")

        with open(input_file, "w", encoding="utf-8") as f:
            json.dump(entry, f)
            f.write("\n")

        command = [
            # Use the interpreter running this code so that metricx24 is
            # looked up in the same environment.
            sys.executable or "python", "-m", "metricx24.predict",
            "--tokenizer", "google/mt5-xl",
            "--model_name_or_path", model,
            "--max_input_length", "1536",
            "--batch_size", "1",
            "--input_file", input_file,
            "--output_file", output_file
        ]

        try:
            # run() drains stdout/stderr while waiting. Waiting on a process
            # whose pipes are never read can block forever once the pipe
            # buffer is full.
            completed = subprocess.run(command, capture_output=True, text=True, check=False)
        except OSError as exc:
            print(f"Error executing metricx24: {exc}")
            return None

        if completed.returncode != 0:
            print(f"Error executing metricx24. Return code: {completed.returncode}")
            # Show the end of stderr to make the failure diagnosable.
            stderr_tail = completed.stderr.strip().splitlines()[-5:]
            if stderr_tail:
                print("\n".join(stderr_tail))
            return None

        # Read score from the output file
        try:
            with open(output_file, "r", encoding="utf-8") as f:
                for line in f:
                    if not line.strip():
                        continue
                    # Only the first record matters: exactly one entry was submitted.
                    return float(json.loads(line)["prediction"])
        except FileNotFoundError:
            print("metricx24 did not produce an output file.")
            return None
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            print("Error parsing the output file.")
            return None

    return None  # If no valid line was found in the output file


In [None]:
# Example call: score one hand-written source/reference/hypothesis triple and
# report the result.
source_text, reference_text, hypothesis_text = (
    "I am learning Python for Machine Learning.",
    "I am learning Python for machine learning.",
    "I'm studying Python for machine learning.",
)

score = calculate_metricx_score(source_text, reference_text, hypothesis_text)

# None signals that the score could not be computed.
if score is None:
    print("The score calculation failed.")
else:
    print(f"The calculated score is: {score}")

In [None]:
## Second way of using MetricX: call the predict module in-process instead of
## starting a subprocess.
import sys
from metricx24 import predict

# Simulated command-line arguments
metricx_argv = [
    "predict.py",  # Name of the script (ignored)
    "--tokenizer", "google/mt5-xl",
    "--model_name_or_path", "google/metricx-24-hybrid-large-v2p6-bfloat16",
    "--max_input_length", "1536",
    "--batch_size", "1",
    "--input_file", "./input.jsonl",
    "--output_file", "./output.jsonl"
]

# Swap sys.argv only for the duration of the call and restore it afterwards,
# so the rest of the kernel session does not see the fake arguments.
# predict.main() is expected to read its options from sys.argv.
original_argv = sys.argv
sys.argv = metricx_argv
try:
    # Call the main function
    predict.main()
finally:
    sys.argv = original_argv

print("Vorhersage abgeschlossen. Ergebnisse sind in 'output.jsonl' gespeichert.")

***

## Pipeline

### Prompt Composition

In [None]:
# TODO: Cover both directions: English <-> German
# TODO: Add different prompt kinds: zero-shot, few-shot and several variations
#
# Every template is a plain string with a {text} placeholder that is filled in
# later via str.format(text=...).
# NOTE: the disabled templates below were originally written as f-strings; the
# `f` prefix has been dropped because {text} must stay a placeholder (an
# f-string would try to evaluate `text` when this cell runs).

# Prompts for translating English source text into German, keyed by prompt type.
PROMPT_TEMPLATES_ENGLISH_GERMAN = {
    "zero_shot-english": "Please translate the following text from English to German: {text}",
    #"zero_shot-german": "Bitte übersetze den folgenden Text von Englisch nach Deutsch: {text}",
    #"few-shot-english-1": """Please translate a text from English to German.
    #Here are some examples:
    #- English: "Hello" -> German: "Hallo"
    #- English: "Goodbye" -> German: "Auf Wiedersehen"
    #Now translate this text: {text}""",
    #"few-shot-german-1": """Bitte übersetze einen Text von Englisch nach Deutsch.
    #Hier sind einige Beispiele:
    #- Englisch: "Hello" -> Deutsch: "Hallo"
    #- Englisch: "Goodbye" -> Deutsch: "Auf Wiedersehen"
    #Jetzt übersetze diesen Text: {text}""",
}

# Prompts for translating German source text into English, keyed by prompt type.
PROMPT_TEMPLATES_GERMAN_ENGLISH = {
    "zero_shot-english": "Please translate the following text from German to English: {text}",
    #"zero_shot-german": "Bitte übersetze den folgenden Text von Deutsch nach Englisch: {text}",
    #"few-shot-english-1": """Please translate a text from German to English.
    #Here are some examples:
    #- German: "Hallo" -> English: "Hello"
    #- German: "Auf Wiedersehen" -> English: "Goodbye"
    #Now translate this text: {text}""",
    #"few-shot-german-1": """Bitte übersetze einen Text von Deutsch nach Englisch.
    #Hier sind einige Beispiele:
    #- Deutsch: "Hallo" -> Englisch: "Hello"
    #- Deutsch: "Auf Wiedersehen" -> Englisch: "Goodbye"
    #Jetzt übersetze diesen Text: {text}""",
}

### Model Interaction

In [None]:
def translate(model, prompt, max_tokens=512, stop=None):
    """Run a single completion on `model` and return the generated text.

    Args:
        model: Callable that accepts the prompt plus the keyword arguments
            `max_tokens`, `echo` and `stop`, and returns a mapping with a
            "choices" list (the loaded Llama objects are used this way).
        prompt: The complete prompt string sent to the model.
        max_tokens: Upper bound on the number of generated tokens. Defaults
            to 512; raise it for long texts.
        stop: List of stop sequences. Defaults to ["Q:"].

    Returns:
        The "text" field of the first choice, returned unmodified.
    """
    if stop is None:
        # Create the default per call so no list is shared between calls.
        stop = ["Q:"]
    response = model(prompt, max_tokens=max_tokens, echo=False, stop=stop)
    return response['choices'][0]['text']

### Evaluation

In [None]:
def evaluate_translation(reference, translation):
    """Score one translation against one reference with BLEU.

    Args:
        reference: The reference translation (String).
        translation: The model output to be scored (String).

    Returns:
        A dict with the single key "BLEU" holding the `score` attribute of
        the `corpus_bleu` result.
    """
    # corpus_bleu works on corpora: one hypothesis list and a list of
    # reference lists, here each containing exactly one sentence.
    hypotheses = [translation]
    reference_sets = [[reference]]
    bleu_result = corpus_bleu(hypotheses, reference_sets)
    return {"BLEU": bleu_result.score}

In [None]:
def log_to_mlflow(experiment_name, metrics, prompt_type, model_name, complexity):
    """Record one evaluation as an MLflow run inside the named experiment.

    The experiment is created when it does not exist and restored when its
    lifecycle stage is "deleted". The run is named
    "<model_name>/<complexity>/<prompt_type>" and stores model, complexity and
    prompt type as parameters and every entry of `metrics` as a metric.

    Args:
        experiment_name: Name of the MLflow experiment to log into.
        metrics: Mapping of metric name to metric value.
        prompt_type: Key of the prompt template that was used.
        model_name: Name of the evaluated model.
        complexity: Complexity label of the translated sample.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)

    if not existing:
        mlflow.create_experiment(experiment_name)
    elif existing.lifecycle_stage == "deleted":
        # Bring the deleted experiment back so its name can be used again.
        client = mlflow.tracking.MlflowClient()
        client.restore_experiment(existing.experiment_id)

    mlflow.set_experiment(experiment_name)

    run_params = {
        "model": model_name,
        "complexity": complexity,
        "prompt_type": prompt_type,
    }
    with mlflow.start_run(run_name=f"{model_name}/{complexity}/{prompt_type}"):
        for param_name, param_value in run_params.items():
            mlflow.log_param(param_name, param_value)
        for metric_name in metrics:
            mlflow.log_metric(metric_name, metrics[metric_name])

### Main Pipeline Method

In [None]:
def _translate_row(model_name, model, row, templates, source_column, reference_column):
    """Translate one row in one direction with every template and return the result records."""
    source_text = row[source_column]
    reference_text = row[reference_column]

    # Both sides are required: the source is translated and the reference is
    # needed for scoring. Rows lacking either one are skipped.
    if pd.isna(source_text) or pd.isna(reference_text):
        return []

    complexity = row['complexity']
    experiment_name = f"{model_name}_{complexity}"
    records = []

    for prompt_type, template in templates.items():
        prompt = template.format(text=source_text)
        translation = translate(model, prompt)
        metrics = evaluate_translation(reference=reference_text, translation=translation)

        # MLflow logging
        log_to_mlflow(experiment_name, metrics, prompt_type, model_name, complexity)

        records.append({
            "model": model_name,
            "complexity": complexity,
            "prompt_type": prompt_type,
            "prompt": prompt,
            "source_text": source_text,
            "translation": translation,
            "metrics": metrics,
        })

    return records


def run_pipeline(texts, tracking_uri="http://127.0.0.1:5000"):
    """Translate every text with every enabled model and prompt, in both directions.

    For each model in MODELS and each row of `texts`, the German text is
    translated to English and the English text to German with every matching
    prompt template. Each translation is scored against the text of the other
    language, logged to MLflow and collected in the returned table.

    NOTE: both directions use the same prompt-type keys, so the direction of a
    result row can only be told from its prompt/source text.

    Args:
        texts: DataFrame with the columns 'complexity', 'text_german' and
            'text_english'. Rows missing the source or the reference text of
            a direction are skipped for that direction.
        tracking_uri: URI of the MLflow tracking server. Defaults to the
            local server at http://127.0.0.1:5000.

    Returns:
        DataFrame with one row per translation and the columns "model",
        "complexity", "prompt_type", "prompt", "source_text", "translation"
        and "metrics" (the metrics dict of that translation).
    """
    columns = ["model", "complexity", "prompt_type", "prompt", "source_text", "translation", "metrics"]
    mlflow.set_tracking_uri(uri=tracking_uri)

    # Records are collected in a list and turned into a DataFrame once at the
    # end. Growing a DataFrame with pd.concat per result copies all previous
    # rows every time.
    records = []

    for model_name, model in MODELS.items():
        for _, row in texts.iterrows():
            # Translation German -> English
            records.extend(_translate_row(
                model_name, model, row,
                PROMPT_TEMPLATES_GERMAN_ENGLISH, 'text_german', 'text_english',
            ))
            # Translation English -> German
            records.extend(_translate_row(
                model_name, model, row,
                PROMPT_TEMPLATES_ENGLISH_GERMAN, 'text_english', 'text_german',
            ))

    return pd.DataFrame(records, columns=columns)

### Execute Pipeline

In [None]:
# Run the pipeline on the loaded data and persist the results as CSV.
translation_results = run_pipeline(data)
# index must be the built-in bool False (sympy's `false` is a symbolic object,
# not a bool); it keeps the row index out of the CSV.
translation_results.to_csv("translation_results.csv", index=False)
print("Pipeline abgeschlossen. Ergebnisse gespeichert.")

In [None]:
# Bare expression so the notebook renders the results table as the cell output.
translation_results

In [None]:
# End the currently active MLflow run.
# NOTE(review): log_to_mlflow opens its runs in a `with` block, so this is
# presumably only needed after an interrupted cell - confirm.
mlflow.end_run()