# Jupyter Notebook for Project "Comparison of LLM Prompting Techniques"

In [15]:
import math
import tempfile
import time

import mlflow
import mlflow.pyfunc
import pandas as pd
import sacrebleu
from llama_cpp import Llama

## 1 Data Loading
In the first step, we load the given translations into a pandas DataFrame and display a quick overview of it.

In [16]:
# Load the parallel texts; the frame is expected to provide the columns
# 'complexity', 'text_german' and 'text_english' (these are the ones used further below).
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
data = pd.read_pickle('machine_translation.pkl')
data

Unnamed: 0,complexity,text_german,text_english
0,easy,Felix hat es satt: Ständig ist Mama unterwegs....,Felix is fed up: Mom is always on the go. But ...
1,news_gen,Die rund 1.400 eingesetzten Beamten haben demn...,"The approximately 1,400 deployed officers have..."
2,news_spec,"Der Staatschef hat zugleich aber das Recht, vo...",The head of state also has the right to appoin...
3,pop_science,Dass der Klimawandel die Hitzewellen in Südasi...,There is no question that climate change is in...
4,science,"Der DSA-110, der sich am Owens Valley Radio Ob...","The DSA-110, situated at the Owens Valley Radi..."


In [17]:
# Overview of the text sizes: character length of the German and the English version
# of every text, next to its complexity label.
data_info = pd.DataFrame({
    'complexity': data['complexity'],
    'text_german_length': data['text_german'].str.len(),
    'text_english_length': data['text_english'].str.len(),
})
data_info

Unnamed: 0,complexity,text_german_length,text_english_length
0,easy,485,415
1,news_gen,296,280
2,news_spec,518,484
3,pop_science,542,521
4,science,1003,827


In [18]:
from enum import Enum


class Language(Enum):
    """Languages used in this notebook, both as translation target and as prompt language.

    The values are the English names of the languages.
    """
    ENGLISH = 'English'
    GERMAN = 'German'


class Complexity(Enum):
    """Complexity levels of the texts.

    The values are the labels that are compared against the 'complexity' column of the input data.
    """
    EASY = 'easy'
    NEWS_GEN = 'news_gen'
    NEWS_SPEC = 'news_spec'
    POP_SCIENCE = 'pop_science'
    SCIENCE = 'science'


# every complexity level; used by prompt templates that apply to all texts
ALL_COMPLEXITIES = list(Complexity)

# Safety factor for the token estimates used for max_tokens and the context size:
# the summed token lengths (prompt template + source text + reference text) are multiplied by it.
# It should be > 1 but not too big; we identified 1.5 as a good heuristic.
ESTIMATED_TOKENS_BUFFER = 1.5

***
## 2 Model Loading
In the second step, we load the AI models specified in the task. To do so, we use the `llama-cpp-python` library (further documentation can be found [here](https://github.com/abetlen/llama-cpp-python)) and load the models directly from [Hugging Face](https://huggingface.co/).

Quick overview and installation guide of llama.cpp:
- https://www.datacamp.com/tutorial/llama-cpp-tutorial
- https://christophergs.com/blog/running-open-source-llms-in-python

In [19]:
# Configuration of the models
# Configuration of the models:
# maps a short model name (used in the MLflow run and experiment names) to the
# 'repo_id' and 'filename' that are handed to create_llama_model.
MODELS = {
    'gemma': {
        'repo_id': 'lmstudio-ai/gemma-2b-it-GGUF',
        'filename': 'gemma-2b-it-q8_0.gguf',
    },
    'llama32': {
        'repo_id': 'hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF',
        'filename': 'llama-3.2-3b-instruct-q8_0.gguf',
    },
    'llama31': {
        'repo_id': 'lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF',
        'filename': 'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
    },
    'aya23': {
        'repo_id': 'bartowski/aya-23-35B-GGUF',
        'filename': 'aya-23-35B-Q5_K_M.gguf',
    },
}

In [20]:
def create_llama_model(repo_id, filename, n_ctx=None):
    """
    Instantiates a Llama model via Llama.from_pretrained for the given repository and file.

    Loading is best-effort: any exception raised while loading is printed and
    turned into a None result instead of being propagated.

    Args:
        repo_id: repository ID of the model.
        filename: filename of the model inside the repository.
        n_ctx: context window size for the model. Falls back to 512 if None.

    Returns:
        The loaded Llama model, or None if repo_id/filename is missing or loading failed.
    """
    try:
        # 512 is the default of llama_cpp
        context_size = 512 if n_ctx is None else n_ctx

        # without both identifiers there is nothing that could be loaded
        if repo_id is None or filename is None:
            return None

        llm = Llama.from_pretrained(
            repo_id=repo_id,
            filename=filename,
            n_ctx=context_size,
            # these parameters can be set individually based on the running system
            #n_gpu_layers=n_gpu_layers,
            #n_threads=8,
            verbose=False,
        )
        print(f"Model {repo_id} successfully loaded with n_ctx={context_size}")
        return llm
    except Exception as e:
        print(f"Error occurred when loading the model from file: {filename}: {e}")
        return None

***

## 3 Pipeline

### 3.1 Model Interaction

In [21]:
def translate(model, prompt, reference_translation):
    """
    Translates the given prompt using the provided model.
    Estimates the needed max_tokens based on the token lengths of the prompt and the reference translation.

    Args:
        model: translation model to be used (must offer tokenize() and be callable with a prompt).
        prompt: complete prompt, including the text to be translated.
        reference_translation: reference translation used to estimate max_tokens.

    Returns:
        The text generated by the model for the prompt.
    """
    # we estimate the needed max_tokens based on the tokenized prompt and reference_translation
    token_length_ref = len(model.tokenize(reference_translation.encode('utf-8')))
    token_length_prompt = len(model.tokenize(prompt.encode('utf-8')))
    # the model should not need more tokens than this;
    # the buffered estimate is a float, but max_tokens is a token count, so round it up to an int
    estimated_max_tokens = math.ceil((token_length_prompt + token_length_ref) * ESTIMATED_TOKENS_BUFFER)

    response = model(prompt, max_tokens=estimated_max_tokens, echo=False)
    return response['choices'][0]['text']

### 3.2 Metrics Calculation
[GitHub repository of MetricX](https://github.com/google-research/metricx)

In [23]:
from rouge_score import rouge_scorer


def evaluate_translation(source, reference, hypothesis):
    """
    Scores one translation (hypothesis) against its reference translation
    with BLEU, chrF, RougeL and MetricX.

    Args:
        source: source text.
        reference: reference translation.
        hypothesis: translation produced by the model.

    Returns:
        A dictionary with the keys 'BLEU', 'chrF', 'rougeL' and 'MetricX'. BLEU, chrF and RougeL
        are on a scale from 0 to 100. 'MetricX' is -1 if it could not be calculated.
    """
    # sacrebleu's corpus functions take a list of hypotheses and a list of reference lists;
    # note that sacrebleu reports BLEU and chrF as floats between 0 and 100 (not between 0 and 1)
    hypotheses = [hypothesis]
    reference_lists = [[reference]]
    bleu = sacrebleu.corpus_bleu(hypotheses, reference_lists).score
    chrf = sacrebleu.corpus_chrf(hypotheses, reference_lists).score

    # None signals a failed MetricX calculation and is reported as -1
    metricx = calculate_metricx_score(source, reference, hypothesis)
    if metricx is None:
        metricx = -1

    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(reference, hypothesis)['rougeL']

    scores = {}
    scores['BLEU'] = bleu
    scores['chrF'] = chrf
    # the RougeL f-measure is scaled to 0-100 as well (to be similar to BLEU and chrF)
    scores['rougeL'] = rouge_l.fmeasure * 100
    scores['MetricX'] = metricx
    return scores


In [22]:
import subprocess
import json
import os


def calculate_metricx_score(source, reference, hypothesis):
    '''
    Calculates the MetricX-score based on source, reference, and hypothesis using metricx24.
    We are currently using the metricx-24-hybrid-large-v2p6-bfloat16 model but there are also other options
        as can be seen here: https://github.com/google-research/metricx

    The texts are handed to `python -m metricx24.predict` through a temporary JSONL file and the
    score is read back from the JSONL file written by that process.

    Args:
        source: The source text (String).
        reference: The reference translation (String).
        hypothesis: The hypothesis translation (String).

    Returns:
        The calculated score as a float or None in case of an error
        (process could not be started, non-zero return code, missing or unparsable output).
    '''
    # this is the model that is used for evaluation
    model = 'google/metricx-24-hybrid-large-v2p6-bfloat16'

    entry = {'id': '1', 'source': source, 'reference': reference, 'hypothesis': hypothesis}

    # A private temporary directory avoids name clashes between parallel or interrupted runs;
    # it is removed together with both files when the block is left, also in case of an error.
    with tempfile.TemporaryDirectory() as tmp_dir:
        input_file = os.path.join(tmp_dir, 'temp_input.jsonl')
        output_file = os.path.join(tmp_dir, 'temp_output.jsonl')

        with open(input_file, 'w', encoding='utf-8') as f:
            json.dump(entry, f)
            f.write('\n')

        command = [
            'python', '-m', 'metricx24.predict',
            '--tokenizer', 'google/mt5-xl',
            '--model_name_or_path', model,
            '--max_input_length', '1536',
            '--batch_size', '1',
            '--input_file', input_file,
            '--output_file', output_file
        ]

        try:
            # run() reads stdout/stderr while it waits for the process to terminate;
            # waiting on piped output without reading it can block forever once the pipe buffer is full
            completed = subprocess.run(command, capture_output=True, text=True)
        except OSError as e:
            print(f'Error executing metricx24: {e}')
            return None

        if completed.returncode != 0:
            print(f'Error executing metricx24. Return code: {completed.returncode}')
            if completed.stderr:
                # the end of stderr usually contains the actual reason
                print(completed.stderr[-2000:])
            return None

        # Read score from the (first line of the) output file
        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                first_line = f.readline()
        except OSError:
            print('Error parsing the output file.')
            return None

        if not first_line:
            return None  # If no line was found in the output file

        try:
            return float(json.loads(first_line).get('prediction'))
        except (json.JSONDecodeError, ValueError, TypeError, AttributeError):
            # TypeError covers a missing 'prediction' entry (float(None))
            print('Error parsing the output file.')
            return None




In [24]:
#evaluate_translation("Felix hat es satt: Ständig ist Mama unterwegs. Doch warum das so ist, will ihm niemand verraten. Für Felix ist daher klar: Seine Mutter ist eine Geheimagentin. Als er an seinem zehnten Geburtstag einen rätselhaften Brief erhält, scheint sich seine Vermutung zu bestätigen. Zusammen mit seiner besten Freundin Lina macht er sich daran, das Geheimnis um Mamas Arbeit zu lüften. Ehe sie sich versehen, stecken die beiden mitten in ihrem ersten spannenden Fall als angehende Geheimagenten.", "Felix is fed up: Mom is always on the go. But nobody will tell him why that is. For Felix, it's clear: his mother is a secret agent. When he receives a mysterious letter on his tenth birthday, his suspicion seems to be confirmed. Together with his best friend Lina, he sets out to uncover the secret of mom's job. Before they know it, the two are in the middle of their first exciting case as budding secret agents.", "\n\n**English Translation:**\n\nFelix had sat: Mama was constantly on the go. But why this is the case, no one will tell him. Therefore, clear to Felix: his mother is a covert agent. When he receives a cryptic letter on his eleventh birthday, it seems his suspicion is confirmed. Together with his best friend Lina, he starts unraveling the mystery of his mother's job. When they finally manage to solve the case, they stick to their first exciting clue like detectives.")

### 3.3 Logging to MLflow

In [25]:
def log_to_mlflow(experiment_name, template_name, metrics, prompt_type, model_name, complexity, target_language, tmp_result,
                  prompt_language):
    """
    Logs the results of a single run to MLflow.
    Creates the respective experiment if it does not already exist and restores it if it was deleted.

    The run is named '<model_name>/<complexity>/<template_name>'. Besides parameters and metrics,
    the result row is written to 'tmp_results.json' in the working directory and logged as an artifact.

    Args:
        experiment_name: name of the MLflow experiment.
        template_name: name of the prompt template used.
        metrics: dictionary of metrics to log (metric name -> numeric value).
        prompt_type: prompting technique that was used.
        model_name: name of the model.
        complexity: complexity level of the source text.
        target_language: target language of the translation.
        tmp_result: Pandas DataFrame containing the result of this run.
        prompt_language: language of the prompt.
    """
    experiment = mlflow.get_experiment_by_name(experiment_name)

    if experiment is None:
        mlflow.create_experiment(experiment_name)
    elif experiment.lifecycle_stage == 'deleted':
        mlflow.tracking.MlflowClient().restore_experiment(experiment.experiment_id)

    mlflow.set_experiment(experiment_name)
    # the context manager ends the run when the block is left, so no explicit end_run() is needed
    with mlflow.start_run(run_name=f'{model_name}/{complexity}/{template_name}'):
        mlflow.log_params({
            'model': model_name,
            'complexity': complexity,
            'prompt_type': prompt_type,
            'target_language': target_language,
            'prompt_language': prompt_language,
        })
        mlflow.log_metrics(metrics)

        tmp_result.to_json('tmp_results.json', index=False)
        mlflow.log_artifact('tmp_results.json')


### 3.4 Pipeline Composition

In [26]:
import gc


def run_pipeline(texts):
    """
    Runs the translation pipeline for a given set of texts, iterating through the different MODELS (from above),
    complexities, and PROMPT_TEMPLATES. Also logs the results to MLflow and returns them as a DataFrame.

    For every model and every text a model instance with a fitting context size is created,
    used for both translation directions and released again afterwards. Texts for which the
    model cannot be loaded are skipped with a message.

    Args:
        texts: Pandas DataFrame containing the source and reference texts ('text_german', 'text_english'),
            as well as the complexity level ('complexity') for each text.

    Returns:
        Pandas DataFrame containing the results of all translation runs.

    Raises:
        ValueError: if a row contains a complexity label that is not part of Complexity.
    """
    # this is the result Dataframe where all runs are stored
    # (the column names have to match the rows produced by execute_mlflow_run)
    results = pd.DataFrame(
        columns=['model', 'complexity', 'prompt_type', 'prompt', 'source_text', 'hypothesis', 'reference_text',
                 'metrics', 'prompt_language'])

    # this is just for mlflow and can be changed individually
    mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')

    # one entry per translation direction: (templates, target language, source column, reference column)
    directions = (
        (PROMPT_TEMPLATES_GERMAN_ENGLISH, Language.ENGLISH, 'text_german', 'text_english'),
        (PROMPT_TEMPLATES_ENGLISH_GERMAN, Language.GERMAN, 'text_english', 'text_german'),
    )

    for model_name, model_config in MODELS.items():
        for _, row in texts.iterrows():
            # look the label up before the (expensive) model is loaded
            complexity_enum = Complexity(row['complexity'])

            model = createModel(model_config, row)
            if model is None:
                print(f"Skipping {model_name}/{complexity_enum.value}: model could not be loaded")
                continue

            try:
                for templates, target_language, source_column, reference_column in directions:
                    if pd.isna(row[source_column]):
                        continue
                    for template_name, template_data in templates.items():
                        if complexity_enum in template_data['complexities']:
                            results = execute_mlflow_run(template_name, complexity_enum.value, model, model_name,
                                                         target_language, results, row[source_column],
                                                         row[reference_column], template_data)
            finally:
                # we dont need the model anymore so we delete it (also if a run failed)
                del model
                gc.collect()
    return results


def createModel(model_config, row):
    """
    Creates the desired model with a context window (n_ctx) that is estimated
    based on the token length of the prompt, source and reference text.

    The model is loaded twice: first with the default context size, only to tokenize the texts,
    and then again with the estimated context size.

    Args:
        model_config: configuration for the language model (needs 'repo_id' and 'filename').
        row: row from the input DataFrame containing the source and reference texts (needed for token estimation).

    Returns:
        The created language model, or None if the model could not be loaded.
    """
    # at first we just use the dummyModel for the tokenization of text
    dummyModel = create_llama_model(model_config['repo_id'], model_config['filename'])
    if dummyModel is None:
        # create_llama_model returns None on failure; without a tokenizer there is nothing to estimate
        return None

    # then we determine the minimal tokens needed for a translation (prompt + source text + reference text)
    combined_text = f"{row['text_german']} {row['text_english']}"
    text_tokens = len(dummyModel.tokenize(combined_text.encode('utf-8')))
    # we want to tokenize the longest template/ prompt (longest by number of characters)
    longest_prompt_template = max(
        (t['template'] for d in (PROMPT_TEMPLATES_GERMAN_ENGLISH, PROMPT_TEMPLATES_ENGLISH_GERMAN) for t in d.values()),
        key=len)
    prompt_tokens = len(dummyModel.tokenize(longest_prompt_template.encode('utf-8')))

    # now we delete the dummyModel to free up memory
    del dummyModel
    gc.collect()

    # and then create the final model based on the estimated_max_tokens (plus 10 % headroom for n_ctx)
    estimated_max_tokens = (text_tokens + prompt_tokens) * ESTIMATED_TOKENS_BUFFER
    n_ctx = int(estimated_max_tokens * 1.1)
    print(f"estimated_max_tokens: {estimated_max_tokens}; n_ctx: {n_ctx}")
    return create_llama_model(model_config['repo_id'], model_config['filename'], n_ctx=n_ctx)


def execute_mlflow_run(template_name, complexity, model, model_name, target_language: Language, results, source_text,
                       reference_text, template_data):
    """
    Performs one translation run: builds the prompt, translates, evaluates the translation,
    logs everything to MLflow and appends the run to the results.

    Args:
        template_name: name of the prompt template used.
        complexity: complexity level of the text (string value).
        model: language model used for translation.
        model_name: name of the model.
        target_language: target language of the translation.
        results: Pandas DataFrame with the results collected so far.
        source_text: source text to be translated.
        reference_text: reference translation.
        template_data: data associated with the prompt template
            ('template', 'prompt_language' and 'prompt_type' are read here).

    Returns:
        A new DataFrame consisting of the given results plus the row of this run.
    """
    # compose the prompt: the '{text}' placeholder of the template is filled with the source text
    prompt = template_data['template'].format(text=source_text)

    # translation, including a simple wall-clock measurement
    translation_started = time.time()
    hypothesis = translate(model, prompt, reference_text)
    translation_finished = time.time()
    print('Prompt finished in (seconds): ', round(translation_finished - translation_started, 2))

    # evaluation of the translation, measured from the end of the translation
    metrics = evaluate_translation(source=source_text, reference=reference_text, hypothesis=hypothesis)
    print('Metric Calculation finished in (seconds): ', round(time.time() - translation_finished, 2))

    prompt_language = template_data['prompt_language']
    prompt_type = template_data['prompt_type']

    # single-row DataFrame describing this run
    run_record = {
        'model': model_name,
        'complexity': complexity,
        'prompt_type': prompt_type,
        'prompt': prompt,
        'source_text': source_text,
        'hypothesis': hypothesis,
        'reference_text': reference_text,
        'metrics': metrics,
        'prompt_language': prompt_language.value  # .value for the string value
    }
    tmp_result = pd.DataFrame([run_record])

    # one MLflow experiment per model and complexity level
    log_to_mlflow(f'{model_name}_{complexity}', template_name, metrics, prompt_type, model_name, complexity,
                  target_language.value, tmp_result, prompt_language.value)

    # add the row of this run to the overall results
    return pd.concat([results, tmp_result], ignore_index=True)

### 3.5 Prompt Composition


In [27]:
# Prompt templates for the direction English -> German, keyed by template name.
# Every entry provides:
#   'template':        prompt text, '{text}' is replaced with the source text
#   'prompt_language': language the prompt itself is written in
#   'prompt_type':     prompting technique (logged to MLflow)
#   'complexities':    complexity levels the template is applied to
PROMPT_TEMPLATES_ENGLISH_GERMAN = {
    'few_shot_style_persona_to-de_en_2': {
        'template': 'Act like a professional interpreter. When you receive a text, adapt to the tone of the text and translate the English text directly into German without any notes or questions. Examples: English = "Apple" -> German = "Apfel", English = "Car" -> German = "Auto", English = "House" -> German = "Haus", English = "Water" -> German = "Wasser", English = "Sky" -> German = "Himmel". You will now be given this text: ["{text}"].',
        'prompt_language': Language.ENGLISH,
        'prompt_type': 'few_shot_style_persona',
        'complexities': ALL_COMPLEXITIES
    },
    'few_shot_style_persona_to-de_de_2': {
        # the examples use the German language names ('Englisch'/'Deutsch') throughout,
        # so that the German prompt does not mix in the English word 'German'
        'template': 'Verhalte dich wie ein professioneller Dolmetscher. Wenn du einen Text bekommst, passt du dich dem Ton des Textes an und übersetzt den englischen Text direkt ins Deutsche ohne Anmerkungen oder Rückfragen. Beispiele: Englisch = "Apple" -> Deutsch = "Apfel", Englisch = "Car" -> Deutsch = "Auto", Englisch = "House" -> Deutsch = "Haus", Englisch = "Water" -> Deutsch = "Wasser", Englisch = "Sky" -> Deutsch = "Himmel". Dir wird nun dieser Text gegeben: ["{text}"].',
        'prompt_language': Language.GERMAN,
        'prompt_type': 'few_shot_style_persona',
        'complexities': ALL_COMPLEXITIES
    },
}

# Prompt templates for the direction German -> English, keyed by template name.
# '{text}' inside a template is replaced with the source text when the prompt is composed.
PROMPT_TEMPLATES_GERMAN_ENGLISH = {
    # prompt written in English
    'few_shot_style_persona_to-en_en_2': {
        'template': (
            'Act like a professional interpreter. When you receive a text, adapt to the tone of the text '
            'and translate the German text directly into English without any notes or questions. '
            'Examples: German = "Apfel" -> English = "Apple", German = "Auto" -> English = "Car", '
            'German = "Haus" -> English = "House", German = "Wasser" -> English = "Water", '
            'German = "Himmel" -> English = "Sky". You will now be given this text: ["{text}"].'
        ),
        'prompt_language': Language.ENGLISH,
        'prompt_type': 'few_shot_style_persona',
        'complexities': ALL_COMPLEXITIES,
    },
    # prompt written in German
    'few_shot_style_persona_to-en_de_2': {
        'template': (
            'Verhalte dich wie ein professioneller Dolmetscher. Wenn du einen Text bekommst, passt du dich dem Ton '
            'des Textes an und übersetzt den deutschen Text direkt ins Englische ohne Anmerkungen oder Rückfragen. '
            'Beispiele: Deutsch = "Apfel" -> Englisch = "Apple", Deutsch = "Auto" -> Englisch = "Car", '
            'Deutsch = "Haus" -> Englisch = "House", Deutsch = "Wasser" -> Englisch = "Water", '
            'Deutsch = "Himmel" -> Englisch = "Sky". Dir wird nun dieser Text gegeben: ["{text}"].'
        ),
        'prompt_language': Language.GERMAN,
        'prompt_type': 'few_shot_style_persona',
        'complexities': ALL_COMPLEXITIES,
    },
}

***
## 4 Execute Pipeline

In [28]:
# Run the complete pipeline on all loaded texts and store the collected results
# as a ';'-separated CSV file in the working directory.
translation_results = run_pipeline(data)
translation_results.to_csv('translation_results.csv', sep=';')
# (the message is German for "Pipeline finished. Results saved.")
print('Pipeline abgeschlossen. Ergebnisse gespeichert.')

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 481.5; n_ctx: 529


llama_init_from_model: n_ctx_per_seq (544) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=529
Prompt finished in (seconds):  15.94
Metric Calculation in (seconds):  32.27
🏃 View run gemma/easy/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/429765178055128713/runs/0b8eaa3286a64791ab268c41c8519c18
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/429765178055128713
Prompt finished in (seconds):  28.18
Metric Calculation in (seconds):  47.67
🏃 View run gemma/easy/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/429765178055128713/runs/a02819a4d5d94ed483ac0a80524c0082
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/429765178055128713
Prompt finished in (seconds):  17.61
Metric Calculation in (seconds):  32.85
🏃 View run gemma/easy/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/429765178055128713/runs/8da3296fa3ef429f8782e6da2e04d8e1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/429765178055128713
Prompt fin

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 354.0; n_ctx: 389


llama_init_from_model: n_ctx_per_seq (416) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=389
Prompt finished in (seconds):  9.91
Metric Calculation in (seconds):  22.97
🏃 View run gemma/news_gen/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/391944840061747289/runs/b730b2a2c15549ad9fab1809c0cb3a22
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/391944840061747289
Prompt finished in (seconds):  12.97
Metric Calculation in (seconds):  26.82
🏃 View run gemma/news_gen/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/391944840061747289/runs/5fe9bcf96de0480f9f2a55da741608ab
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/391944840061747289
Prompt finished in (seconds):  9.21
Metric Calculation in (seconds):  23.03
🏃 View run gemma/news_gen/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/391944840061747289/runs/12696af455804652882e2828545c6532
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/391944840061747289


llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 489.0; n_ctx: 537


llama_init_from_model: n_ctx_per_seq (544) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=537
Prompt finished in (seconds):  13.5
Metric Calculation in (seconds):  31.28
🏃 View run gemma/news_spec/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/713590614913207437/runs/cfd3f138d92d468dacaf9f6867448f8c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/713590614913207437
Prompt finished in (seconds):  14.69
Metric Calculation in (seconds):  31.45
🏃 View run gemma/news_spec/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/713590614913207437/runs/2dff08beb7cd4ce2ab1403d8b3ce2ed1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/713590614913207437
Prompt finished in (seconds):  16.15
Metric Calculation in (seconds):  51.41
🏃 View run gemma/news_spec/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/713590614913207437/runs/377b880f3350450997aea0d460760914
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/713590614913207

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 499.5; n_ctx: 549


llama_init_from_model: n_ctx_per_seq (576) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=549
Prompt finished in (seconds):  15.99
Metric Calculation in (seconds):  55.86
🏃 View run gemma/pop_science/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/819534162661537410/runs/007249e360af4290a9fa944fe3e8f238
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/819534162661537410
Prompt finished in (seconds):  15.16
Metric Calculation in (seconds):  31.96
🏃 View run gemma/pop_science/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/819534162661537410/runs/fd0a27bed744488692e80346829cf74d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/819534162661537410
Prompt finished in (seconds):  17.75
Metric Calculation in (seconds):  35.51
🏃 View run gemma/pop_science/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/819534162661537410/runs/ead2ded70c984affbef586bf44f7eed0
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/81953416

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 916.5; n_ctx: 1008


llama_init_from_model: n_ctx_per_seq (1024) < n_ctx_train (8192) -- the full capacity of the model will not be utilized


Model lmstudio-ai/gemma-2b-it-GGUF erfolgreich geladen mit n_ctx=1008
Prompt finished in (seconds):  31.56
Metric Calculation in (seconds):  64.37
🏃 View run gemma/science/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/895795565739563094/runs/e90abb9cce034c18b49ad0b3c1b54f31
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/895795565739563094
Prompt finished in (seconds):  30.65
Metric Calculation in (seconds):  63.01
🏃 View run gemma/science/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/895795565739563094/runs/dd08555b215e43a393a81a9b7c950aa7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/895795565739563094
Prompt finished in (seconds):  4.51
Metric Calculation in (seconds):  42.41
🏃 View run gemma/science/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/895795565739563094/runs/754523fa70474fb7a7e76b52497cb003
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/895795565739563094
P

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 547.5; n_ctx: 602


llama_init_from_model: n_ctx_per_seq (608) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=602
Prompt finished in (seconds):  24.62
Metric Calculation in (seconds):  36.31
🏃 View run llama32/easy/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/170611153105097398/runs/3dcd8eb620e644a0849a063f84e5bd36
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/170611153105097398
Prompt finished in (seconds):  50.71
Metric Calculation in (seconds):  48.96
🏃 View run llama32/easy/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/170611153105097398/runs/67df6f8491344c678c2873766534fbf2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/170611153105097398
Prompt finished in (seconds):  23.63
Metric Calculation in (seconds):  35.04
🏃 View run llama32/easy/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/170611153105097398/runs/98ea765e5ecc4b52bd058ef936ee08f2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/17061

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 406.5; n_ctx: 447


llama_init_from_model: n_ctx_per_seq (448) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=447
Prompt finished in (seconds):  17.95
Metric Calculation in (seconds):  26.53
🏃 View run llama32/news_gen/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/379656095678811037/runs/d41f1d24f31e4e87930c1169e68b850a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/379656095678811037
Prompt finished in (seconds):  14.05
Metric Calculation in (seconds):  23.62
🏃 View run llama32/news_gen/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/379656095678811037/runs/b4f7d36c3f0544de978fbd329e7e99bb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/379656095678811037
Prompt finished in (seconds):  20.64
Metric Calculation in (seconds):  27.76
🏃 View run llama32/news_gen/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/379656095678811037/runs/203a924636064d12bc0513a20197599a
🧪 View experiment at: http://127.0.0.1:5000/#/exper

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 558.0; n_ctx: 613


llama_init_from_model: n_ctx_per_seq (640) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=613
Prompt finished in (seconds):  18.99
Metric Calculation in (seconds):  32.61
🏃 View run llama32/news_spec/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/194158564450517751/runs/4b22b8658e1946ec92898228e3485195
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/194158564450517751
Prompt finished in (seconds):  52.52
Metric Calculation in (seconds):  49.22
🏃 View run llama32/news_spec/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/194158564450517751/runs/389cefdf937b492ebd980093e691f7d5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/194158564450517751
Prompt finished in (seconds):  40.05
Metric Calculation in (seconds):  40.48
🏃 View run llama32/news_spec/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/194158564450517751/runs/240d5f991b1f4554a00d60b8dc4c77d2
🧪 View experiment at: http://127.0.0.1:5000/#/ex

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 570.0; n_ctx: 627


llama_init_from_model: n_ctx_per_seq (640) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=627
Prompt finished in (seconds):  64.35
Metric Calculation in (seconds):  53.06
🏃 View run llama32/pop_science/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/553009959054917340/runs/081be6ea0d5d4fb19ee235b8c44eba0f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/553009959054917340
Prompt finished in (seconds):  52.95
Metric Calculation in (seconds):  47.79
🏃 View run llama32/pop_science/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/553009959054917340/runs/eb279752ca794036847bb0c2babf5c15
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/553009959054917340
Prompt finished in (seconds):  60.94
Metric Calculation in (seconds):  52.89
🏃 View run llama32/pop_science/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/553009959054917340/runs/c0d8d25321d44b829ea82f24316838a5
🧪 View experiment at: http://127.0.0.1:500

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 973.5; n_ctx: 1070


llama_init_from_model: n_ctx_per_seq (1088) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF erfolgreich geladen mit n_ctx=1070
Prompt finished in (seconds):  70.27
Metric Calculation in (seconds):  105.71
🏃 View run llama32/science/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/248431796146488211/runs/ee3b4982d5284f5f82dd62ebcef27def
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/248431796146488211
Prompt finished in (seconds):  86.08
Metric Calculation in (seconds):  116.97
🏃 View run llama32/science/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/248431796146488211/runs/fb25a95d8a4348408447d4d86dc33d13
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/248431796146488211
Prompt finished in (seconds):  51.22
Metric Calculation in (seconds):  71.27
🏃 View run llama32/science/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/248431796146488211/runs/eb569f171ba249e1a6f192a1ea7d6212
🧪 View experiment at: http://127.0.0.1:5000/#/exper

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 547.5; n_ctx: 602


llama_init_from_model: n_ctx_per_seq (608) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=602
Prompt finished in (seconds):  88.9
Metric Calculation in (seconds):  54.82
🏃 View run llama31/easy/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/496790586212187793/runs/ce7255f1878a44e6b91ebc26d01c1714
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/496790586212187793
Prompt finished in (seconds):  83.99
Metric Calculation in (seconds):  50.91
🏃 View run llama31/easy/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/496790586212187793/runs/a9162e29b87b4a14b9563ff6054aa99b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/496790586212187793
Prompt finished in (seconds):  94.35
Metric Calculation in (seconds):  56.69
🏃 View run llama31/easy/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/496790586212187793/runs/c79a4ce3c2d743569276f5729cc4216f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/49

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 406.5; n_ctx: 447


llama_init_from_model: n_ctx_per_seq (448) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=447
Prompt finished in (seconds):  63.2
Metric Calculation in (seconds):  37.25
🏃 View run llama31/news_gen/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/378317835055917962/runs/31d10fe58e7f4bfb92c07925d0ee5f87
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/378317835055917962
Prompt finished in (seconds):  59.42
Metric Calculation in (seconds):  31.78
🏃 View run llama31/news_gen/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/378317835055917962/runs/d0d788c5567f4b9495333c64bcd5d508
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/378317835055917962
Prompt finished in (seconds):  67.66
Metric Calculation in (seconds):  37.17
🏃 View run llama31/news_gen/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/378317835055917962/runs/cc501f2df94c41e59d20d6ebb8835917
🧪 View experiment at: http://127.0.0.1:5000/#/ex

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 558.0; n_ctx: 613


llama_init_from_model: n_ctx_per_seq (640) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=613
Prompt finished in (seconds):  94.12
Metric Calculation in (seconds):  62.39
🏃 View run llama31/news_spec/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/278262578605400461/runs/a4c87b193cd74ce6bb45c39d0f085ada
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/278262578605400461
Prompt finished in (seconds):  90.49
Metric Calculation in (seconds):  43.82
🏃 View run llama31/news_spec/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/278262578605400461/runs/182b5aedc80e437ca2254dd074c4e9eb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/278262578605400461
Prompt finished in (seconds):  100.96
Metric Calculation in (seconds):  59.41
🏃 View run llama31/news_spec/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/278262578605400461/runs/385a9d6b2b15495e95c4a2fde1f771ec
🧪 View experiment at: http://127.0.0.1:5000

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 570.0; n_ctx: 627


llama_init_from_model: n_ctx_per_seq (640) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=627
Prompt finished in (seconds):  93.15
Metric Calculation in (seconds):  55.81
🏃 View run llama31/pop_science/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/358136329887025901/runs/2b2869d4ae954fa0a088bf56ce4c7009
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/358136329887025901
Prompt finished in (seconds):  89.41
Metric Calculation in (seconds):  52.71
🏃 View run llama31/pop_science/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/358136329887025901/runs/8128d38ed37442028d2dafc9e12ca28b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/358136329887025901
Prompt finished in (seconds):  100.93
Metric Calculation in (seconds):  55.55
🏃 View run llama31/pop_science/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/358136329887025901/runs/63678a46d4fd46dbab409a974711dc11
🧪 View experiment at: http://127.0.0.

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=512
estimated_max_tokens: 973.5; n_ctx: 1070


llama_init_from_model: n_ctx_per_seq (1088) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


Model lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF erfolgreich geladen mit n_ctx=1070
Prompt finished in (seconds):  163.18
Metric Calculation in (seconds):  128.44
🏃 View run llama31/science/few_shot_style_persona_to-en_en_2 at: http://127.0.0.1:5000/#/experiments/958464439758456234/runs/d7012e29c7344549894104115421a4e8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/958464439758456234
Prompt finished in (seconds):  157.81
Metric Calculation in (seconds):  123.42
🏃 View run llama31/science/few_shot_style_persona_to-en_de_2 at: http://127.0.0.1:5000/#/experiments/958464439758456234/runs/9103ffbcd797460196be31a92b2a9274
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/958464439758456234
Prompt finished in (seconds):  180.4
Metric Calculation in (seconds):  126.31
🏃 View run llama31/science/few_shot_style_persona_to-de_en_2 at: http://127.0.0.1:5000/#/experiments/958464439758456234/runs/c7ffd83de70b4f56ae7d20bc0ef25576
🧪 View experiment at: http://127.0.0.1:5000/