## Prompting Pipeline for comparison of different prompting techniques

In [6]:
import pandas as pd
import numpy as np
import pickle
import mlflow
import mlflow.pyfunc
import transformers
from transformers import pipeline
import sacrebleu

### Data Loading

In [11]:
# Load the data
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code — only load pickle files from a trusted source.
text = pd.read_pickle('machine_translation.pkl')
text

# Standardization? (open question)

Unnamed: 0,complexity,text_german,text_english
0,easy,Felix hat es satt: Ständig ist Mama unterwegs....,Felix is fed up: Mom is always on the go. But ...
1,news_gen,Die rund 1.400 eingesetzten Beamten haben demn...,"The approximately 1,400 deployed officers have..."
2,news_spec,"Der Staatschef hat zugleich aber das Recht, vo...",The head of state also has the right to appoin...
3,pop_science,Dass der Klimawandel die Hitzewellen in Südasi...,There is no question that climate change is in...
4,science,"Der DSA-110, der sich am Owens Valley Radio Ob...","The DSA-110, situated at the Owens Valley Radi..."


### Prompt Composition

In [None]:
def create_prompt(text, mode="zero-shot"):
    """Build a German-translation prompt for ``text``.

    Args:
        text: The source text to embed in the prompt.
        mode: ``"zero-shot"`` for the bare instruction, or ``"few-shot"`` to
            prepend two fixed example translations to the instruction.

    Returns:
        The prompt string.

    Raises:
        ValueError: If ``mode`` is neither ``"zero-shot"`` nor ``"few-shot"``.
            Previously an unknown mode silently returned ``None``, which would
            only surface later as a confusing failure inside the model call.
    """
    instruction = f"Translate this text into German: {text}"
    if mode == "zero-shot":
        return instruction
    if mode == "few-shot":
        examples = "Example 1: Hello -> Hallo\nExample 2: Goodbye -> Auf Wiedersehen\n"
        return f"{examples}{instruction}"
    raise ValueError(
        f"Unknown prompt mode {mode!r}; expected 'zero-shot' or 'few-shot'."
    )

### Model Interaction

In [None]:
# Load the model
# NOTE(review): the recorded run further down in this notebook raised an
# OSError when loading the tokenizer for this same repo ID; this pipeline call
# presumably fails the same way because the repo only ships GGUF files —
# TODO confirm and pick a loadable checkpoint.
translation_model = pipeline("translation_en_to_de", model="lmstudio-ai/gemma-2b-it-GGUF")

# Send the prompt and collect the result
def translate_text(prompt):
    """Run the module-level ``translation_model`` on ``prompt``.

    Returns the ``"translation_text"`` entry of the first item the pipeline
    returns.
    """
    outputs = translation_model(prompt)
    first_output = outputs[0]
    return first_output["translation_text"]

### Evaluation

In [15]:
import pandas as pd
import mlflow
import mlflow.pyfunc
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Step 1: load the data
def load_data(file_path):
    """Read the pickled object stored at ``file_path`` and return it.

    Note: unpickling can execute arbitrary code, so only pass trusted files.
    """
    return pd.read_pickle(file_path)

# Step 2: prompt construction
def generate_prompts(data):
    """Add a ``prompt`` column built from each row's ``text_english`` value.

    The column is assigned on the frame that was passed in, and that same
    frame is returned.
    """
    data['prompt'] = [
        f"Translate the following text to German:\n\n{row['text_english']}\n\n"
        for _, row in data.iterrows()
    ]
    return data

# Step 3: model integration
def load_models():
    """Load a tokenizer and an ``AutoModelForSeq2SeqLM`` for each hard-coded repo.

    Returns:
        A dict mapping a short model name to
        ``{"tokenizer": <tokenizer>, "model": <model>}``.

    NOTE(review): the recorded run of this notebook stopped with an OSError
    while loading the tokenizer for 'lmstudio-ai/gemma-2b-it-GGUF'. These
    repo IDs all end in "-GGUF"; presumably they need a different loading
    path (and a causal-LM class rather than seq2seq) — TODO confirm.
    """
    repo_ids = {
        "gemma": "lmstudio-ai/gemma-2b-it-GGUF",
        "llama_3_2": "hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF",
        "llama_3_1": "lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
        "bartowski": "bartowski/aya-23-35B-GGUF"
    }
    loaded = {}
    for short_name in repo_ids:
        repo_id = repo_ids[short_name]
        # Tokenizer first, then model — same order as before.
        loaded[short_name] = {
            "tokenizer": AutoTokenizer.from_pretrained(repo_id),
            "model": AutoModelForSeq2SeqLM.from_pretrained(repo_id),
        }
    return loaded

# Step 4: model interaction and MLflow logging
def translate_with_models(data, models):
    """Translate every prompt with every model, logging each model as an MLflow run.

    Args:
        data: DataFrame with a ``prompt`` column.
        models: Dict mapping a model name to a dict with ``"tokenizer"`` and
            ``"model"`` entries, as returned by ``load_models``.

    Returns:
        A DataFrame with one row per (model, prompt) pair and the columns
        ``model``, ``prompt`` and ``translation``.

    Side effects:
        For each model, starts an MLflow run, logs the model name, one
        ``translation_length_<index>`` metric per prompt, the prompt and
        translation texts as artifacts, and a
        ``translation_results_<model_name>.csv`` file that is also written to
        the current working directory.
    """
    all_results = []

    for model_name, components in models.items():
        tokenizer = components['tokenizer']
        model = components['model']
        translation_pipeline = pipeline("translation", model=model, tokenizer=tokenizer)

        # Collected separately so the per-model CSV artifact contains only
        # this model's rows (the shared list used to leak earlier models' rows
        # into every later model's file).
        model_results = []

        with mlflow.start_run(run_name=f"Translation with {model_name}"):
            mlflow.log_param("model_name", model_name)

            for index, prompt in enumerate(data['prompt']):
                translation = translation_pipeline(prompt, max_length=512, truncation=True)
                translated_text = translation[0]['translation_text']

                # Log prompt and translation.
                mlflow.log_metric(f"translation_length_{index}", len(translated_text))
                # mlflow.log_text takes (text, artifact_file): content first,
                # then the artifact path. The arguments used to be swapped.
                mlflow.log_text(prompt, f"prompts/prompt_{index}.txt")
                mlflow.log_text(translated_text, f"translations/translation_{index}.txt")

                model_results.append({
                    "model": model_name,
                    "prompt": prompt,
                    "translation": translated_text
                })

            # Store this model's results as a run artifact.
            results_path = f"translation_results_{model_name}.csv"
            pd.DataFrame(model_results).to_csv(results_path, index=False)
            mlflow.log_artifact(results_path)

        all_results.extend(model_results)

    return pd.DataFrame(all_results)

# Main pipeline
def main_pipeline(file_path, tracking_uri="http://localhost:5000",
                  experiment_name="Pipeline_FirstTest"):
    """Run the full pipeline: load data, build prompts, load models, translate.

    Args:
        file_path: Path of the pickled dataset passed to ``load_data``.
        tracking_uri: MLflow tracking server URI. Defaults to the local
            server this notebook was written against.
        experiment_name: MLflow experiment the runs are recorded under.

    Returns:
        The DataFrame returned by ``translate_with_models``.
    """
    mlflow.set_tracking_uri(tracking_uri)
    # set_experiment creates the experiment when it does not exist yet, so no
    # separate get/create check is needed.
    mlflow.set_experiment(experiment_name)

    # Load the data
    data = load_data(file_path)
    print("Daten geladen.")

    # Build the prompts
    data = generate_prompts(data)
    print("Prompts erstellt.")

    # Load the models
    models = load_models()
    print("Modelle geladen.")

    # Run the translations
    translations = translate_with_models(data, models)
    print("Übersetzungen durchgeführt.")
    return translations

# Run the pipeline
file_path = "machine_translation.pkl"
results = main_pipeline(file_path)

# Save the results
# NOTE(review): the CSV is written to the absolute path /mnt/data/, while the
# message below only names the file — confirm the directory exists in the
# target environment and consider a configurable output path.
results.to_csv("/mnt/data/translation_results.csv", index=False)
print("Ergebnisse gespeichert: translation_results.csv")

Daten geladen.
Prompts erstellt.


OSError: Can't load tokenizer for 'lmstudio-ai/gemma-2b-it-GGUF'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'lmstudio-ai/gemma-2b-it-GGUF' is the correct path to a directory containing all relevant files for a GemmaTokenizerFast tokenizer.