## Prompting Pipeline for Comparing Different Prompting Techniques

In [14]:
import pandas as pd
import numpy as np
import pickle
import mlflow
import mlflow.pyfunc
import sacrebleu
import hashlib
from llama_cpp import Llama

### Data Loading

In [15]:
# Load the parallel German/English corpus; the frame carries the columns
# `complexity`, `text_german` and `text_english` (one row per text).
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code — only load pickle files from a trusted source.
data = pd.read_pickle("machine_translation.pkl")
# Bare expression so the notebook renders the frame as the cell output.
data

Unnamed: 0,complexity,text_german,text_english
0,easy,Felix hat es satt: Ständig ist Mama unterwegs....,Felix is fed up: Mom is always on the go. But ...
1,news_gen,Die rund 1.400 eingesetzten Beamten haben demn...,"The approximately 1,400 deployed officers have..."
2,news_spec,"Der Staatschef hat zugleich aber das Recht, vo...",The head of state also has the right to appoin...
3,pop_science,Dass der Klimawandel die Hitzewellen in Südasi...,There is no question that climate change is in...
4,science,"Der DSA-110, der sich am Owens Valley Radio Ob...","The DSA-110, situated at the Owens Valley Radi..."


In [16]:
# Summary frame: the complexity label of every row next to the character
# counts of its German and English text.
data_info = pd.DataFrame(
    {
        'complexity': data['complexity'],
        'text_german_length': data['text_german'].str.len(),
        'text_english_length': data['text_english'].str.len(),
    }
)
# Bare expression so the notebook renders the frame as the cell output.
data_info

Unnamed: 0,complexity,text_german_length,text_english_length
0,easy,485,415
1,news_gen,296,280
2,news_spec,518,484
3,pop_science,542,521
4,science,1003,827


### Prompt Composition

In [None]:
# TODO: Cover both directions: English <-> German
# TODO: Add different prompt types: zero-shot, few-shot and further variations

# One zero-shot English -> German instruction per row. Line breaks inside the
# source text are flattened to spaces and surrounding whitespace is trimmed so
# that every prompt is a single line.
prompts = [
    f"Please translate the following English text to German: {flattened}"
    for flattened in (
        english_text.replace("\n", " ").strip()
        for english_text in data['text_english']
    )
]
data['prompt'] = prompts

### Model Interaction

In [None]:
# Load the models
def _load_gguf(repo_id, filename, **extra):
    """Fetch a GGUF checkpoint via ``Llama.from_pretrained`` with quiet logging.

    ``extra`` is forwarded unchanged, so each model keeps exactly the keyword
    arguments it is given below.
    """
    return Llama.from_pretrained(
        repo_id=repo_id,
        filename=filename,
        verbose=False,
        **extra,
    )


gemma = _load_gguf(
    "lmstudio-ai/gemma-2b-it-GGUF",
    "gemma-2b-it-q8_0.gguf",
    n_gpu_layers=1,
)

llama32 = _load_gguf(
    "hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
    "llama-3.2-3b-instruct-q8_0.gguf",
    n_gpu_layers=1,
)

# NOTE(review): this is the only model loaded without n_gpu_layers — confirm
# that this is intentional.
llama31 = _load_gguf(
    "lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    "Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
)

aya23 = _load_gguf(
    "bartowski/aya-23-35B-GGUF",
    "aya-23-35B-Q5_K_M.gguf",
    n_gpu_layers=1,
)

### Evaluation

In [None]:
# Run the translations with llama.cpp
def translate_with_llama(data):
    """Translate every prompt in ``data`` and track the run in MLflow.

    Each value of the ``prompt`` column is sent to the module-level ``gemma``
    model. Inside a single MLflow run, the length of every translation is
    logged as a metric and the prompt and translation texts are stored as
    text artifacts. The completed table is also written to
    ``translations.csv`` and attached to the run.

    Args:
        data: DataFrame with a ``prompt`` column. It is modified in place: a
            ``translation`` column is added.

    Returns:
        The same DataFrame object, now including the ``translation`` column.
    """
    translations = []

    with mlflow.start_run(run_name="Translation with llama.cpp"):
        mlflow.log_param("model", "gemma-2b-it-GGUF")

        for index, prompt in data['prompt'].items():
            response = gemma(prompt, max_tokens=512, echo=False, stop=["Q:"])
            translated_text = response['choices'][0]['text']

            # Log metrics
            mlflow.log_metric(f"translation_length_{index}", len(translated_text))

            # Short content hashes keep artifact names unique when the same
            # row index is logged with different prompt variants.
            prompt_hash = hashlib.md5(prompt.encode("utf-8")).hexdigest()[:8]
            translated_text_hash = hashlib.md5(translated_text.encode("utf-8")).hexdigest()[:8]

            # log_text takes (text, artifact_file): the content comes first
            # and the target file name second. Passing the label first would
            # store the label instead of the actual text.
            mlflow.log_text(prompt, f"prompts/prompt_{index}_{prompt_hash}.txt")
            mlflow.log_text(
                translated_text,
                f"translations/translation_{index}_{translated_text_hash}.txt",
            )

            translations.append(translated_text)

        # Store the results as an artifact
        data['translation'] = translations
        data.to_csv("translations.csv", index=False)
        mlflow.log_artifact("translations.csv")

    return data

# Main pipeline
def main_pipeline():
    """Run the pipeline end to end: load, build prompts, translate, save.

    Each stage prints a progress message when it finishes; the final table is
    written to ``translated_results.csv`` without the index column.

    NOTE(review): ``load_data`` and ``generate_prompts`` are not defined in
    the visible part of this notebook — confirm they exist before running.
    """
    output_path = "translated_results.csv"

    # Stage 1: load the data
    frame = load_data()
    print("Daten geladen.")

    # Stage 2: build the prompts
    frame = generate_prompts(frame)
    print("Prompts erstellt.")

    # Stage 3: run the translations
    frame = translate_with_llama(frame)
    print("Übersetzungen durchgeführt.")

    # Stage 4: persist the results
    frame.to_csv(output_path, index=False)
    print(f"Ergebnisse gespeichert: {output_path}")

# Execution
experiment_name = "Pipeline_FirstTest"
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Create the experiment on first use, then make it the active one.
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)

mlflow.set_experiment(experiment_name)

# (Re)load the model that translate_with_llama reads from module scope.
# `echo` and `stop` are completion-call options, not model-construction
# options; translate_with_llama already passes them on every call, so they are
# not repeated here, where they had no effect on generation.
gemma = Llama.from_pretrained(
    repo_id="lmstudio-ai/gemma-2b-it-GGUF",
    filename="gemma-2b-it-q8_0.gguf",
    n_gpu_layers=1,
    verbose=False,
)

main_pipeline()