In [None]:
def benchmark_translation(
    n_ctx_value,
    queries,
    metadata_lookup,
    batch_size=20,
    output_file=None,
    model_path="/Users/pierr/Desktop/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
):
    """Time how long a local LLM takes to translate queries at a given context size.

    A llama.cpp model is loaded with ``n_ctx=n_ctx_value`` and every query is
    sent through it, together with the metadata descriptions of the
    ``[field]`` tokens the query contains. Model loading is not part of the
    measured time.

    Args:
        n_ctx_value: Context window size (in tokens) the model is loaded with.
        queries: Iterable of query strings. Whitespace-separated tokens of the
            form ``[code]`` are treated as field references.
        metadata_lookup: Mapping from field code to its description text.
        batch_size: Number of queries processed per loop iteration. Must be
            at least 1.
        output_file: Optional path. When given, the translations are written
            there as a JSON list of ``{"query": ..., "translation": ...}``.
        model_path: Path to the GGUF model file to load.

    Returns:
        dict with keys ``n_ctx``, ``total_time_sec``,
        ``avg_time_per_query_sec`` and ``num_queries``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if a prompt does
            not fit into the context even after its metadata was truncated.
    """
    import json
    import time

    # Tokens kept free for the answer when metadata has to be truncated.
    response_reserve_tokens = 50

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Materialise once so len() and slicing work for any iterable.
    queries = list(queries)

    def save_translations(translations):
        """Write query/translation pairs to ``output_file`` when one was requested."""
        if output_file is None:
            return
        records = [
            {"query": query, "translation": translation}
            for query, translation in zip(queries, translations)
        ]
        with open(output_file, "w", encoding="utf-8") as handle:
            json.dump(records, handle, ensure_ascii=False, indent=2)

    if not queries:
        # Nothing to measure: skip the expensive model load and avoid
        # dividing by zero when computing the average.
        save_translations([])
        return {
            "n_ctx": n_ctx_value,
            "total_time_sec": 0.0,
            "avg_time_per_query_sec": 0.0,
            "num_queries": 0,
        }

    # Imported late so argument validation and the empty-input path do not
    # require the model runtime.
    from llama_cpp import Llama

    llm = Llama(model_path=model_path, n_ctx=n_ctx_value)

    def build_prompt(query, metadata):
        """Return the instruction prompt for one query and its metadata text."""
        return f"Translate the following ESGish query into a natural English sentence:\n\nQuery: {query}\n\nMetadata: {metadata}\n\n### Response:"

    def count_tokens(text):
        """Return the number of tokens the model's tokenizer produces for ``text``."""
        return len(llm.tokenize(text.encode("utf-8")))

    def translate_query(query, metadata_lookup):
        """Translate one query, truncating its metadata if the prompt is too long."""
        # Fields are the whitespace-separated tokens wrapped in square brackets.
        query_fields = [
            field.strip("[]")
            for field in query.split()
            if field.startswith("[") and field.endswith("]")
        ]

        # Unknown fields get an explicit placeholder line instead of being dropped.
        metadata = "\n".join(
            metadata_lookup.get(field, f"No metadata found for {field}")
            for field in query_fields
        )

        prompt = build_prompt(query, metadata)
        max_response_tokens = n_ctx_value - count_tokens(prompt)

        if max_response_tokens <= 0:
            # The prompt fills the whole context. Shrink the metadata so that
            # the fixed part of the prompt plus a response reserve still fit.
            overhead_tokens = count_tokens(build_prompt(query, ""))
            metadata_budget = n_ctx_value - overhead_tokens - response_reserve_tokens
            if metadata_budget > 0:
                # add_bos=False: the metadata is a fragment, not a full prompt,
                # so no beginning-of-sequence token should be embedded in it.
                metadata_tokens = llm.tokenize(metadata.encode("utf-8"), add_bos=False)
                metadata = llm.detokenize(metadata_tokens[:metadata_budget]).decode(
                    "utf-8", errors="ignore"
                )
                prompt = build_prompt(query, metadata)
                # Re-tokenizing may not give exactly the same count, so measure again.
                max_response_tokens = n_ctx_value - count_tokens(prompt)

        if max_response_tokens <= 0:
            raise ValueError(f"Prompt too long for context size {n_ctx_value}. Try increasing n_ctx or shortening input.")

        response = llm(prompt, max_tokens=max_response_tokens, temperature=0.1)
        return response["choices"][0]["text"].strip()

    translated_all = []
    # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
    start = time.perf_counter()

    for batch_start in range(0, len(queries), batch_size):
        batch = queries[batch_start:batch_start + batch_size]
        translated_all.extend(translate_query(query, metadata_lookup) for query in batch)

    duration = time.perf_counter() - start

    # Written after the clock stops so file I/O does not skew the measurement.
    save_translations(translated_all)

    return {
        "n_ctx": n_ctx_value,
        "total_time_sec": duration,
        "avg_time_per_query_sec": duration / len(queries),
        "num_queries": len(queries),
    }


In [None]:
import pandas as pd
import json

# Field metadata is expected in 'metadata.json' as a list of objects with
# "code", "name" and "description" keys.
# JSON is UTF-8 by specification; be explicit so the platform locale does not
# change how the file is decoded.
with open('metadata.json', 'r', encoding='utf-8') as f:
    raw_metadata = json.load(f)

# Map each field code to a single "<name>. <description>" line for the prompt.
metadata_lookup = {
    entry["code"]: f'{entry["name"]}. {entry["description"]}'
    for entry in raw_metadata
}

results = []
context_sizes = [1024, 2048, 3072, 4096]

df = pd.read_csv("results2.csv")
# Empty CSV cells are read as float NaN, which would break the string
# handling inside the benchmark; drop them and make sure every query is a str.
queries = df["query_text"].dropna().astype(str).tolist()

# Run the same query set once per context size and collect the timing summaries.
for ctx in context_sizes:
    results.append(benchmark_translation(ctx, queries, metadata_lookup))


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Tabulate the per-context-size timing summaries collected by the benchmark runs.
timings = pd.DataFrame(results)
context_axis = timings["n_ctx"]
latency_axis = timings["avg_time_per_query_sec"]

# One line chart: average per-query latency as a function of context size.
plt.figure(figsize=(10, 6))
axes = plt.gca()
axes.plot(context_axis, latency_axis, marker='o')
axes.set_title("LLM Performance vs Context Size")
axes.set_xlabel("Context Size (n_ctx)")
axes.set_ylabel("Avg Time per Query (sec)")
axes.grid(True)
plt.show()
