# Prompt Engineer Agent

In [17]:
import requests
from tqdm import tqdm
from typing import List, Dict, Any, Optional
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

## 0. Setup

In [2]:
import os
import yaml
from google.colab import drive
from getpass import getpass
from huggingface_hub import login

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read YAML file
f_path = "/content/drive/MyDrive/GitHub/python-codebase/machine_learning/private_keys.yml"
with open(f_path, 'r') as stream:
    data_loaded = yaml.safe_load(stream)
os.environ['HF_API_TOKEN'] = data_loaded['HF_API_KEY']
os.environ['GITHUB_TOKEN'] = data_loaded['GITHUB_TOKEN']

In [4]:
login(token=os.environ['HF_API_TOKEN'])

## 1. Execution

In [100]:
class LLMCaller:
    def __init__(self, model_name: str, use_api: bool = False, api_token: Optional[str] = None):
        self.model_name = model_name
        self.use_api = use_api
        self.api_token = api_token

        if not use_api:
            # Load model and tokenizer locally
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(model_name)
        else:
            self.model = None  # Placeholder for API mode
            self.tokenizer = None

    def query_model(self, input_text: str, system_prompt: str) -> str:
        """
        Query the LLM with the input text and system prompt.
        """
        combined_prompt = f"{system_prompt}\n{input_text}"

        if self.use_api:
            temperature = 0.1
            max_tokens = 50
            top_p = 0.1

            headers = {"Authorization": f"Bearer {self.api_token}"}
            response = requests.post(
                f"https://api-inference.huggingface.co/models/{self.model_name}",
                headers=headers,
                json={
                    "inputs": combined_prompt,
                    "parameters": {
                        "temperature": temperature,
                        "max_tokens": max_tokens,
                        "top_p": top_p
                      }
                    }
            )
            response.raise_for_status()
            result = response.json()

            # The response is typically a list of generated texts
            if isinstance(result, list) and len(result) > 0:
                return result[0]["generated_text"].replace(combined_prompt, "", 1).strip()
            else:
                raise ValueError("Unexpected response format from Hugging Face API.")
        else:
            inputs = self.tokenizer(combined_prompt, return_tensors="pt")
            #outputs = self.model.generate(**inputs, max_new_tokens=128, num_return_sequences=1, do_sample=True)
            #return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            outputs = self.model.generate(**inputs, max_length=inputs["input_ids"].shape[1] + 50)
            full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return full_output.replace(combined_prompt, "", 1).strip()

class PromptEngineerLLM:
    def __init__(self, model_name: str, use_api: bool = False, api_token: Optional[str] = None):
        self.model_name = model_name
        self.use_api = use_api
        self.api_token = api_token
        self.memory = []  # Memory to store previous input-output pairs and prompts
        self.llm_caller = LLMCaller(model_name, use_api, api_token)

    def generate_prompt(self, task_description: str, system_prompt: str, outputs: List[Dict[str, Any]]) -> str:
        """
        Generate a system prompt based on task description, current system prompt, and outputs.
        """
        memory_context = "\n".join([
            f"Input: {entry['input']}, Output: {entry['output']}, Target: {entry['target']}"
            for entry in self.memory
        ])

        prompt = (
            f"You are an expert prompt engineer.\n"
            f"Based on the following memory:\n{memory_context}\n"
            f"and the current task description:\n{task_description}\n"
            f"and the current system prompt:\n{system_prompt}\n"
            f"Optimize the system prompt to achieve better alignment with the target outputs.\n"
            f"Only provide the new system_prompt, nothing else\n"
            f"Only provide one system_prompt, and only provide the text of the system_prompt itself (e.g., without System Prompt: or system_prompt:)"
        )
        return prompt

    def evaluate_system_prompt(self, system_prompt: str, batch: List[Dict[str, Any]]) -> float:
        """
        Evaluate the quality of the system prompt based on batch performance.
        """
        correct_count = 0
        for entry in tqdm(batch):
            input_text = entry['input']
            target_output = entry['target']
            model_output = self.llm_caller.query_model(input_text, system_prompt)

            if model_output.strip() == target_output.strip():
                correct_count += 1

        return correct_count / len(batch)

    def optimize_prompt(self, task_description: str, system_prompt: str, batch: List[Dict[str, Any]]) -> str:
        """
        Optimize the system prompt using the task description, existing system prompt, and batch.
        """
        # Update memory with the current batch
        self.memory.extend(batch)

        # Generate input
        new_input = self.generate_prompt(task_description, system_prompt, batch)
        print()
        print("*"*50)
        print(f"New input:\n\n {new_input}")
        print("*"*50)
        print()

        # Generate new system prompt
        new_system_prompt = self.llm_caller.query_model(new_input, "")
        if True:
          new_system_prompt = (
              new_system_prompt
              .replace("system_prompt:", "")
              .replace("System Prompt:", "")
              .replace("Optimized Prompt:", "")
              .replace("New system prompt:", "")
          )
        print()
        print("*"*50)
        print(f"New system prompt:\n\n {new_system_prompt}")
        print("*"*50)
        print()

        # Evaluate the new system prompt
        performance = self.evaluate_system_prompt(new_system_prompt, batch)
        print()
        print("*"*50)
        print(f"New system prompt performance:\n\n {performance}")
        print("*"*50)
        print()

        return new_system_prompt

### 1.1. Example loading models in memory

In [106]:
# Parameters
model_name = "gpt2"
#model_name = "distilgpt2"
task_description = "Task: Summarize the text." # Fixed
system_prompt = "Summarize the following sentences." # Prompt engineering

# Dataset
batch = [
  {"input": "The cat sat on the mat.", "output": None, "target": "The cat sat."},
  {"input": "The quick brown fox jumps over the lazy dog.", "output": None, "target": "The fox jumps."},
  {"input": "A journey of a thousand miles begins with a single step.", "output": None, "target": "A journey begins with a step."},
  {"input": "To be or not to be, that is the question.", "output": None, "target": "To be or not to be."},
  {"input": "All that glitters is not gold.", "output": None, "target": "Not all that glitters is gold."}
]

In [107]:
# Initialize the LLM summarizer
llm_caller = LLMCaller(model_name, use_api=False)

# Initialize the Prompt Engineer LLM
prompt_engineer = PromptEngineerLLM(model_name, use_api=False)

In [108]:
# Get summarizer outputs & update dataset
batch_new = []
for example in tqdm(batch):
  example["output"] = llm_caller.query_model(example["input"], system_prompt)
  batch_new += [example]

  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 20%|██        | 1/5 [00:02<00:11,  2.92s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 40%|████      | 2/5 [00:05<00:08,  2.92s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 60%|██████    | 3/5 [00:08<00:05,  2.92s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 80%|████████  | 4/5 [00:12<00:03,  3.12s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
100%|██████████| 5/5 [00:15<00:00,  3.15s/it]


In [109]:
example

{'input': 'All that glitters is not gold.',
 'output': 'The gold is not gold.\nThe gold is not gold.\nThe gold is not gold.\nThe gold is not gold.\nThe gold is not gold.\nThe gold is not gold.\nThe gold is not gold.',
 'target': 'Not all that glitters is gold.'}

In [110]:
# Optimize the system prompt
optimized_prompt = prompt_engineer.optimize_prompt(task_description, system_prompt, batch_new)
print(f"Optimized Prompt:\n{optimized_prompt}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



**************************************************
New input:

 You are an expert prompt engineer.
Based on the following memory:
Input: The cat sat on the mat., Output: The cat sat on the mat.
The cat sat on the mat.
The cat sat on the mat.
The cat sat on the mat.
The cat sat on the mat.
The cat sat on the mat.
The, Target: The cat sat.
Input: The quick brown fox jumps over the lazy dog., Output: The lazy dog jumps over the lazy dog.
The lazy dog jumps over the lazy dog.
The lazy dog jumps over the lazy dog.
The lazy dog jumps over the lazy dog.
The lazy dog jumps over the lazy dog., Target: The fox jumps.
Input: A journey of a thousand miles begins with a single step., Output: A journey of a thousand miles begins with a single step.
A journey of a thousand miles begins with a single step.
A journey of a thousand miles begins with a single step.
A journey of a thousand miles begins with a single, Target: A journey begins with a step.
Input: To be or not to be, that is the question., 

  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 20%|██        | 1/5 [00:05<00:21,  5.45s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 40%|████      | 2/5 [00:11<00:16,  5.66s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 60%|██████    | 3/5 [00:14<00:09,  4.56s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 80%|████████  | 4/5 [00:17<00:04,  4.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
100%|██████████| 5/5 [00:21<00:00,  4.27s/it]


**************************************************
New system prompt performance:

 0.0
**************************************************

Optimized Prompt:
Only provide one system_prompt, and only provide the text of the system_prompt itself (e.g., without  or )
Only provide one system_prompt, and only provide the





In [111]:
out = llm_caller.query_model(example["input"], optimized_prompt)
print(optimized_prompt)
print(example["input"])
print(out)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Only provide one system_prompt, and only provide the text of the system_prompt itself (e.g., without  or )
Only provide one system_prompt, and only provide the
All that glitters is not gold.
The only thing that glitters is not gold.
The only thing that glitters is not gold.
The only thing that glitters is not gold.
The only thing that glitters is not gold.
The only thing that gl


### 1.2. Using the request API

In [112]:
# Parameters
#model_name = "mistralai/Mistral-7B-Instruct-v0.3"
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
#model_name = "Qwen/Qwen2.5-72B-Instruct"
task_description = "Task: Summarize the text." # Fixed
system_prompt = "Summarize the following sentences." # Prompt engineering

# Dataset
batch = [
  {"input": "The cat sat on the mat.", "output": None, "target": "The cat sat."},
  {"input": "The quick brown fox jumps over the lazy dog.", "output": None, "target": "The fox jumps."},
  {"input": "A journey of a thousand miles begins with a single step.", "output": None, "target": "A journey begins with a step."},
  {"input": "To be or not to be, that is the question.", "output": None, "target": "To be or not to be."},
  {"input": "All that glitters is not gold.", "output": None, "target": "Not all that glitters is gold."}
]

In [113]:
# Initialize the LLM summarizer
llm_caller = LLMCaller(model_name, use_api=True, api_token=os.environ['HF_API_TOKEN'])

# Initialize the Prompt Engineer LLM
prompt_engineer = PromptEngineerLLM(model_name, use_api=True, api_token=os.environ['HF_API_TOKEN'])

In [114]:
# Prueba
input_text = "All that glitters is not gold."
system_example = "Summarize the following sentences."
llm_caller.query_model(input_text, system_example)

'The world is full of people who are not what they seem.\n\nNot everything that looks valuable is actually valuable.\nThe world is full of people who are not as they appear.'

In [115]:
# Get summarizer outputs & update dataset
batch_new = []
for example in tqdm(batch):
  example["output"] = llm_caller.query_model(example["input"], system_prompt)
  batch_new += [example]

100%|██████████| 5/5 [00:01<00:00,  4.78it/s]


In [116]:
input_text = example["input"]
combined_prompt = f"{system_prompt}\n{input_text}"
print(combined_prompt)

Summarize the following sentences.
All that glitters is not gold.


In [117]:
system_prompt

'Summarize the following sentences.'

In [118]:
example

{'input': 'All that glitters is not gold.',
 'output': 'The world is full of people who are not what they seem.\n\nNot everything that looks valuable is actually valuable.\nThe world is full of people who are not as they appear.',
 'target': 'Not all that glitters is gold.'}

In [119]:
# Optimize the system prompt
optimized_prompt = prompt_engineer.optimize_prompt(task_description, system_prompt, batch_new)
print(f"Optimized Prompt:\n{optimized_prompt}")


**************************************************
New input:

 You are an expert prompt engineer.
Based on the following memory:
Input: The cat sat on the mat., Output: The dog chased the cat. The cat jumped off the mat. The dog ran after the cat. The cat climbed up the tree. The dog barked at the cat. The cat meowed at the dog. The dog whined. The cat purred. The dog whimpered.

The cat sat on the mat, but when the dog chased it, the cat jumped off the mat and ran up a tree. The dog barked at the cat, which meowed back. The dog whined and then whimpered., Target: The cat sat.
Input: The quick brown fox jumps over the lazy dog., Output: The fox is quick and brown.
The dog is lazy.

The quick brown fox jumps over the lazy dog, with the fox being quick and brown and the dog being lazy., Target: The fox jumps.
Input: A journey of a thousand miles begins with a single step., Output: The journey of a thousand miles begins with a single step.

The journey of a thousand miles begins with a 

100%|██████████| 5/5 [00:01<00:00,  4.96it/s]


**************************************************
New system prompt performance:

 0.0
**************************************************

Optimized Prompt:

"Identify the main idea of the following text, focusing on the most important and valuable information. Avoid unnecessary details and summarize in a concise and clear manner."


"Summarize the following text, highlighting the most important and valuable information while avoiding unnecessary details. Ensure the summary is concise and clear."





In [120]:
out = llm_caller.query_model(example["input"], optimized_prompt)
print(optimized_prompt)
print(example["input"])
print(out)


"Identify the main idea of the following text, focusing on the most important and valuable information. Avoid unnecessary details and summarize in a concise and clear manner."


"Summarize the following text, highlighting the most important and valuable information while avoiding unnecessary details. Ensure the summary is concise and clear."
All that glitters is not gold.
This well-known saying has been passed down through generations and remains relevant today. It serves as a reminder that not everything that appears valuable or attractive is truly so. The saying can be applied to various aspects of life, including material possessions, relationships, and personal goals.

When it comes to material possessions, the saying encourages us to look beyond the surface and consider the true worth of an item. For example, a diamond ring may appear valuable and attractive, but if it is a fake, then it holds little real value. Similarly, a car may look sleek and impressive, but if it is poorly 

In [121]:
# Second iteration
batch_new = []
for example in tqdm(batch):
  example["output"] = llm_caller.query_model(example["input"], optimized_prompt)
  batch_new += [example]

100%|██████████| 5/5 [00:01<00:00,  4.74it/s]


In [122]:
example

{'input': 'All that glitters is not gold.',
 'output': 'This well-known saying has been passed down through generations and remains relevant today. It serves as a reminder that not everything that appears valuable or attractive is truly so. The saying can be applied to various aspects of life, including material possessions, relationships, and personal goals.\n\nWhen it comes to material possessions, the saying encourages us to look beyond the surface and consider the true worth of an item. For example, a diamond ring may appear valuable and attractive, but if it is a fake, then it holds little real value. Similarly, a car may look sleek and impressive, but if it is poorly made and constantly breaks down, then it is not truly valuable.\n\nThe saying can also be applied to relationships. Just because someone is charming and attractive does not mean they are a good person to be around. They may have ulterior motives or be harmful to our well-being. It is important to look beyond the surf

In [123]:
# Optimize the system prompt
optimized_prompt = prompt_engineer.optimize_prompt(task_description, optimized_prompt, batch_new)
print(f"Optimized Prompt:\n{optimized_prompt}")


**************************************************
New input:

 You are an expert prompt engineer.
Based on the following memory:
Input: The cat sat on the mat., Output: The mat was warm and comfortable. The cat purred contentedly. The sun shone through the window, making the mat even warmer. The cat stretched out, enjoying the heat. Suddenly, a mouse ran across the room. The cat's ears perked up, and she sprang into action. She chased the mouse around the room, swatting at it with her paw. The mouse was quick, but the cat was determined. After a few minutes, the cat caught the mouse and proudly carried it back to the mat. She settled down, purring contentedly once again, as she enjoyed her prize.


"The cat sat on a warm, comfortable mat and purred contentedly until a mouse ran across the room, prompting the cat to chase and catch it, bringing it back to the mat to enjoy as her prize.", Target: The cat sat.
Input: The quick brown fox jumps over the lazy dog., Output: This sentence co

100%|██████████| 5/5 [00:34<00:00,  6.97s/it]


**************************************************
New system prompt performance:

 0.0
**************************************************

Optimized Prompt:
"Summarize the main idea of the text, focusing on the most important and valuable information. Ensure the summary is concise and clear."





In [124]:
out = llm_caller.query_model(example["input"], optimized_prompt)
print(optimized_prompt)
print(example["input"])
print(out)

"Summarize the main idea of the text, focusing on the most important and valuable information. Ensure the summary is concise and clear."
All that glitters is not gold.

The text highlights the dangers of being swayed by superficial qualities and the benefits of being discerning and critical thinkers. It encourages readers to be skeptical of claims that seem too good to be true and to verify information before accepting it as fact. By being mindful of these principles, one can navigate through life with greater clarity and make sound decisions that lead to success and fulfillment.

In essence, the text reminds us that true value lies beneath the surface and that it takes effort and discernment to uncover it. It encourages us to be vigilant and not be deceived by appearances, and to always strive for a deeper understanding of the world around us. By doing so, we can cultivate wisdom, make informed decisions, and lead more fulfilling lives.


#### Example for debugging

In [86]:
# Example
input_text = f"""
 You are an expert prompt engineer.
Based on the following memory:
Input: The cat sat on the mat., Output: The dog chased the cat. The cat jumped off the mat. The dog ran after the cat. The cat climbed up the tree. The dog barked at the cat. The cat meowed at the dog. The dog whined. The cat purred. The dog whimpered.

The cat sat on the mat, but when the dog chased it, the cat jumped off the mat and ran up a tree. The dog barked at the cat, which meowed back. The dog whined and then whimpered., Target: The cat sat.
Input: The quick brown fox jumps over the lazy dog., Output: The fox is quick and brown.
The dog is lazy.

The quick brown fox jumps over the lazy dog, with the fox being quick and brown and the dog being lazy., Target: The fox jumps.
Input: A journey of a thousand miles begins with a single step., Output: The journey of a thousand miles begins with a single step.

The journey of a thousand miles begins with a single step., Target: A journey begins with a step.
Input: To be or not to be, that is the question., Output: Whether 'tis nobler in the mind to suffer the slings and arrows of outrageous fortune, or to take arms against a sea of troubles and, by opposing, end them.
To die, to sleep, no more, and by a sleep to say we end the heartache and the thousand natural shocks that flesh is heir to.
'Tis a consummation devoutly to be wished.
To die, to sleep, to sleep, perchance to dream, ay there's the rub, for in that sleep of death what dreams may come, when we have shuffled off this mortal coil, must give us pause.
There's the respect that makes calamity of so long life.
For who would bear the whips and scorns of time, the oppressor's wrong, the proud man's contumely, the pangs of despised love, the law's delay, the insolence of office, and the spurns that patient merit of the unworthy takes, when he himself might his quietus make with a bare bodkin?
Who would fardels bear, to grunt and sweat under a weary life, but that the dread of something after death, the undiscovered country from whose bourn no traveller returns, puzzles the will and makes us rather bear those ills we have than fly to others that we know not of?
Thus conscience does make cowards of us all, and thus the native hue of resolution is sicklied o'er with the pale cast of thought, and enterprises of great pitch and moment with this regard their currents turn awry and lose the name of action.
Soft you now, the fair Ophelia, nymph, in thy orisons be all my sins remembered.

The speaker contemplates the question of whether it is better to be alive or dead. He considers the possibility of dying to escape the troubles of life, but also wonders about the dreams that might come in death. He also considers the fear of the unknown that keeps people from choosing death. Ultimately, he concludes that fear and thought can prevent people from taking action. He then addresses Ophelia, asking her to remember him in her prayers., Target: To be or not to be.
Input: All that glitters is not gold., Output: The world is full of people who are not what they seem.

Not everything that looks valuable is actually valuable.
The world is full of people who are not as they appear., Target: Not all that glitters is gold.
and the current task description:
Task: Summarize the text.
and the current system prompt:
Summarize the following sentences.
Optimize the system prompt to achieve better alignment with the target outputs.
Only provide the new system_prompt, nothing else
Only provide one system_prompt, and only provide the text of the system_prompt itself (e.g., without System Prompt: or system_prompt:)
"""
output = llm_caller.query_model(input_text, "")
print(output)

system_prompt:
"Summarize the text by identifying the main idea and condensing it into a shorter form."


In [81]:
print(output)

System prompt:
Summarize the following sentences, focusing on the main ideas and themes.

Target: The cat sat on the mat.

System prompt:
Summarize the main ideas and actions in the sentences.

Target: The cat sat on the mat, but when the dog chased it, the cat jumped off the mat and ran up a tree. The dog barked at the cat, which meowed back. The dog whined and then whimpered.

System prompt:
Summarize the main idea or action in each sentence.

Target: The quick brown fox jumps over the lazy dog, with the fox being quick and brown and the dog being lazy.

System prompt:
Summarize the main idea or action in each sentence, focusing on the most important or interesting points.

Target: The journey of a thousand miles begins with a single step.

System prompt:
Summarize the main idea or action in each sentence, focusing on the most important or interesting points and using your own words.

Target: Whether 'tis nobler in the mind to suffer the slings and arrows of outrageous fortune, or to