# Tutorial for library usage

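* NOTE: A Hugging Face (HF) API key is required and should be added as a secret to Colab. The Setup cells below also read `HF_API_KEY` and `GITHUB_TOKEN` from a private YAML file on Google Drive.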

## 0. Setup

In [None]:
import os
import yaml
from huggingface_hub import login
from google.colab import drive
from getpass import getpass
from IPython.display import clear_output

# Mount Google Drive at /content/drive; the paths used by the cells below
# all live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
requirements_path = "/content/drive/MyDrive/GitHub/python-codebase/machine_learning/generative_ai/custom_library/lib/requirements.txt"
!pip install -r {requirements_path}
clear_output()

In [None]:
# Load the private credentials stored in a YAML file on Drive.
f_path = "/content/drive/MyDrive/GitHub/python-codebase/machine_learning/private_keys.yml"
with open(f_path, 'r') as stream:
    data_loaded = yaml.safe_load(stream)

# Publish each credential as an environment variable (env name <- YAML key).
for env_name, yaml_key in (('HF_API_TOKEN', 'HF_API_KEY'), ('GITHUB_TOKEN', 'GITHUB_TOKEN')):
    os.environ[env_name] = data_loaded[yaml_key]

# Log in to the Hugging Face Hub with the token exported above.
login(token=os.environ['HF_API_TOKEN'])

In [None]:
# Switch the working directory to the custom_library folder, which contains
# the `lib` package imported by the sections below.
os.chdir('/content/drive/MyDrive/GitHub/python-codebase/machine_learning/generative_ai/custom_library')

In [None]:
# List the contents of the current working directory.
!ls

 lib   old   tutorial.ipynb  'tutorial - v1.ipynb'


## 1. Usage

### 1.1. Calling Inference API (HF)

In [None]:
from lib.llm_tools import HuggingFaceWrapperAPI

In [None]:
# Query the model through the Hugging Face Inference API wrapper.
model_name = "microsoft/Phi-3.5-mini-instruct"
system_prompt = "You are an expert SQL developer."
prompt = "Write a SQL query example that includes a JOIN, a HAVING and a PARTITION BY"

# Generation settings, forwarded to `generate` as keyword arguments.
dct_params = dict(
    max_new_tokens=3000,
    temperature=0.1,
    return_full_text=False,
)

model = HuggingFaceWrapperAPI(model_name=model_name)
result = model.generate(prompt=prompt, system_prompt=system_prompt, **dct_params)
print(result)

 clause.

Here's an example SQL query that includes a JOIN, a HAVING clause, and a PARTITION BY clause:

```sql
SELECT
    employee_id,
    department_id,
    salary,
    AVG(salary) OVER (PARTITION BY department_id) AS avg_department_salary
FROM
    employees
JOIN
    departments ON employees.department_id = departments.id
GROUP BY
    employee_id,
    department_id,
    salary
HAVING
    AVG(salary) OVER (PARTITION BY department_id) > 50000;
```

In this query, we are joining the `employees` table with the `departments` table on the `department_id` column. We are then selecting the `employee_id`, `department_id`, and `salary` columns, along with the average salary for each department using the `AVG()` window function with the `PARTITION BY` clause.

The `GROUP BY` clause is used to group the results by `employee_id`, `department_id`, and `salary`.

Finally, the `HAVING` clause is used to filter the results to only include departments where the average salary is greater than 50,000.



### 1.2. Loading a HF model

In [None]:
from lib.llm_tools import HuggingFaceModelLoad

In [None]:
# Send the same request to a model loaded with HuggingFaceModelLoad.
model_name = "microsoft/Phi-3.5-mini-instruct"
system_prompt = "You are an expert SQL developer."
prompt = "Write a SQL query example that includes a JOIN, a HAVING and a PARTITION BY"

# Generation settings, forwarded to `generate` as keyword arguments.
dct_params = dict(
    max_new_tokens=1000,
    temperature=0.1,
    return_full_text=False,
)

model = HuggingFaceModelLoad(model_name=model_name)
result = model.generate(prompt=prompt, system_prompt=system_prompt, **dct_params)
print(result)

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

Device set to use cuda


 Certainly! Below is an example SQL query that demonstrates the use of JOIN, HAVING, and PARTITION BY clauses. This example assumes we are working with a database that supports window functions (like PostgreSQL, SQL Server, or Oracle). The scenario is a hypothetical sales database where we want to find the total sales per product category for each region, but only include those categories where the total sales exceed a certain threshold, and partition the results by region.

```sql
SELECT 
    region,
    product_category,
    SUM(sales_amount) OVER (PARTITION BY region) AS total_sales_per_region,
    SUM(sales_amount) AS sales_per_category
FROM 
    sales
JOIN 
    products ON sales.product_id = products.product_id
GROUP BY 
    region, product_category
HAVING 
    SUM(sales_amount) > 10000
ORDER BY 
    region, sales_per_category DESC;
```

Here's a breakdown of the query:

1. **JOIN**: We're joining the `sales` table with the `products` table on the `product_id` to get the product c

### 1.3. Library usage: Workflows

In [None]:
from lib.utils import extract_xml
from lib.llm_tools import HuggingFaceWrapperAPI, HuggingFaceModelLoad
from lib.llm_framework_workflow import chain, parallel, route

#### Prompt chaining
* Prompt-Chaining: Decomposes a task into sequential subtasks, where each step builds on previous results

In [None]:
# Model and generation settings used by the prompt-chaining example below.
model_name = "microsoft/Phi-3.5-mini-instruct"
dct_params = {
  'max_new_tokens': 1000,
  'temperature': 0.1,
  'return_full_text': False
}

# Backend switch (replaces a bare `if True:`):
#   True  -> HuggingFaceModelLoad, debug output off
#   False -> HuggingFaceWrapperAPI, debug output on
use_local_model = True
if use_local_model:
  model = HuggingFaceModelLoad(model_name = model_name)
  debug_mode = False
else:
  model = HuggingFaceWrapperAPI(model_name = model_name)
  debug_mode = True

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Example 1: Chain workflow for structured data extraction and formatting
# Each step progressively transforms raw text into a formatted table

# One prompt per step, in the order they are handed to `chain`:
# extract values -> convert to percentages -> sort descending -> markdown table.
data_processing_steps = [
    """Extract only the numerical values and their associated metrics from the text.
    Format each as 'value: metric' on a new line.
    Example format:
    92: customer satisfaction
    45%: revenue growth""",

    """Convert all numerical values to percentages where possible.
    If not a percentage or points, convert to decimal (e.g., 92 points -> 92%).
    Keep one number per line.
    Example format:
    92%: customer satisfaction
    45%: revenue growth""",

    """Sort all lines in descending order by numerical value.
    Keep the format 'value: metric' on each line.
    Example:
    92%: customer satisfaction
    87%: employee satisfaction""",

    """Format the sorted data as a markdown table with columns:
    | Metric | Value |
    |:--|--:|
    | Customer Satisfaction | 92% |"""
]

# Raw text passed to the chain as its input.
report = """
Q3 Performance Summary:
Our customer satisfaction score rose to 92 points this quarter.
Revenue grew by 45% compared to last year.
Market share is now at 23% in our primary market.
Customer churn decreased to 5% from 8%.
New user acquisition cost is $43 per user.
Product adoption rate increased to 78%.
Employee satisfaction is at 87 points.
Operating margin improved to 34%.
"""

print("\nInput text:")
print(report)
# Run the chain with the prompts above, then print the value it returns.
# `model`, `dct_params` and `debug_mode` come from the previous cell.
formatted_result = chain(
    input=report,
    prompts=data_processing_steps,
    model=model,
    dct_params=dct_params,
    debug_mode=debug_mode
    )
print(formatted_result)

Device set to use cuda



Input text:

Q3 Performance Summary:
Our customer satisfaction score rose to 92 points this quarter.
Revenue grew by 45% compared to last year.
Market share is now at 23% in our primary market.
Customer churn decreased to 5% from 8%.
New user acquisition cost is $43 per user.
Product adoption rate increased to 78%.
Employee satisfaction is at 87 points.
Operating margin improved to 34%.


Step 1:


Device set to use cuda


 92: customer satisfaction score
45%: revenue growth
23%: market share
5%: customer churn
$43: new user acquisition cost
78%: product adoption rate
87: employee satisfaction
34%: operating margin

Step 2:


Device set to use cuda


 92%: customer satisfaction score
45%: revenue growth
23%: market share
5%: customer churn
78%: product adoption rate
87%: employee satisfaction
34%: operating margin

Step 3:


Device set to use cuda


 87%: employee satisfaction
92%: customer satisfaction
78%: product adoption rate
45%: revenue growth
34%: operating margin
23%: market share
5%: customer churn

Step 4:
 | Metric               | Value |
|----------------------|-------|
| Customer Satisfaction| 92%   |
| Employee Satisfaction| 87%   |
| Product Adoption Rate| 78%   |
| Revenue Growth       | 45%   |
| Operating Margin     | 34%   Tags:
| Market Share         | 23%   |
| Customer Churn       | 5%    | Here is the formatted data as a markdown table:

| Metric               | Value |
|----------------------|-------|
| Customer Satisfaction| 92%   |
| Employee Satisfaction| 87%   |
| Product Adoption Rate| 78%   |
| Revenue Growth       | 45%   |
| Operating Margin     | 34%   |
| Market Share         | 23%   |
| Customer Churn       | 5%    |

Note: The "Employee Satisfaction" metric was not included in the original input, but was added to the table based on the context provided. If this metric was not intended to be incl

#### Parallel processing
* Parallelization workflow for stakeholder impact analysis

In [None]:
# Model and generation settings used by the parallelization example below.
# Requires HuggingFaceModelLoad / HuggingFaceWrapperAPI from the import cell at
# the top of section 1.3; running this cell before it raises NameError.
model_name = "microsoft/Phi-3.5-mini-instruct"
dct_params = {
  'max_new_tokens': 1000,
  'temperature': 0.1,
  'return_full_text': False
}

# Backend switch (replaces a bare `if True:`):
#   True  -> HuggingFaceModelLoad, debug output off
#   False -> HuggingFaceWrapperAPI, debug output on
use_local_model = True
if use_local_model:
  model = HuggingFaceModelLoad(model_name = model_name)
  debug_mode = False
else:
  model = HuggingFaceWrapperAPI(model_name = model_name)
  debug_mode = True

NameError: name 'HuggingFaceModelLoad' is not defined

In [None]:
# Example 2: Parallelization workflow for stakeholder impact analysis
# Process impact analysis for multiple stakeholder groups concurrently

# One entry per stakeholder group; every entry is passed to `parallel`
# together with the same prompt.
stakeholders = [
    """Customers:
    - Price sensitive
    - Want better tech
    - Environmental concerns""",

    """Employees:
    - Job security worries
    - Need new skills
    - Want clear direction""",

    """Investors:
    - Expect growth
    - Want cost control
    - Risk concerns""",

    """Suppliers:
    - Capacity constraints
    - Price pressures
    - Tech transitions"""
]

# NOTE(review): n_workers = 1 presumably means the inputs are processed one at
# a time — confirm against lib.llm_framework_workflow.parallel.
impact_results = parallel(
    prompt = """Analyze how market changes will impact this stakeholder group.
    Provide specific impacts and recommended actions.
    Format with clear sections and priorities.""",
    inputs = stakeholders,
    model = model,
    dct_params = dct_params,
    n_workers = 1
)

# Print each returned item in turn.
for result in impact_results:
    print(result)

#### Routing
* Route workflow for customer support ticket handling

In [None]:
# Model and generation settings used by the routing example below.
model_name = "microsoft/Phi-3.5-mini-instruct"
dct_params = {
  'max_new_tokens': 1000,
  'temperature': 0.1,
  'return_full_text': False
}

# Backend switch (replaces a bare `if True:`):
#   True  -> HuggingFaceModelLoad, debug output off
#   False -> HuggingFaceWrapperAPI, debug output on
use_local_model = True
if use_local_model:
  model = HuggingFaceModelLoad(model_name = model_name)
  debug_mode = False
else:
  model = HuggingFaceWrapperAPI(model_name = model_name)
  debug_mode = True

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:  50%|#####     | 1.34G/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

In [None]:
# Example 3: Route workflow for customer support ticket handling
# Route support tickets to appropriate teams based on content analysis

# Maps a route name to the prompt used for that route. Every prompt ends with
# "Input: " — presumably the ticket text is appended after it; confirm in
# lib.llm_framework_workflow.route.
support_routes = {
    "billing": """You are a billing support specialist. Follow these guidelines:
    1. Always start with "Billing Support Response:"
    2. First acknowledge the specific billing issue
    3. Explain any charges or discrepancies clearly
    4. List concrete next steps with timeline
    5. End with payment options if relevant

    Keep responses professional but friendly.

    Input: """,

    "technical": """You are a technical support engineer. Follow these guidelines:
    1. Always start with "Technical Support Response:"
    2. List exact steps to resolve the issue
    3. Include system requirements if relevant
    4. Provide workarounds for common problems
    5. End with escalation path if needed

    Use clear, numbered steps and technical details.

    Input: """,

    "account": """You are an account security specialist. Follow these guidelines:
    1. Always start with "Account Support Response:"
    2. Prioritize account security and verification
    3. Provide clear steps for account recovery/changes
    4. Include security tips and warnings
    5. Set clear expectations for resolution time

    Maintain a serious, security-focused tone.

    Input: """,

    "product": """You are a product specialist. Follow these guidelines:
    1. Always start with "Product Support Response:"
    2. Focus on feature education and best practices
    3. Include specific examples of usage
    4. Link to relevant documentation sections
    5. Suggest related features that might help

    Be educational and encouraging in tone.

    Input: """
}

# Test with different support tickets
# Three sample tickets: an account-access problem, a billing question and a
# product how-to question.
tickets = [
    """Subject: Can't access my account
    Message: Hi, I've been trying to log in for the past hour but keep getting an 'invalid password' error.
    I'm sure I'm using the right password. Can you help me regain access? This is urgent as I need to
    submit a report by end of day.
    - John""",

    """Subject: Unexpected charge on my card
    Message: Hello, I just noticed a charge of $49.99 on my credit card from your company, but I thought
    I was on the $29.99 plan. Can you explain this charge and adjust it if it's a mistake?
    Thanks,
    Sarah""",

    """Subject: How to export data?
    Message: I need to export all my project data to Excel. I've looked through the docs but can't
    figure out how to do a bulk export. Is this possible? If so, could you walk me through the steps?
    Best regards,
    Mike"""
]

# Route every sample ticket and print the ticket followed by the response.
divider = "-" * 40
print("Processing support tickets...\n")
for i, ticket in enumerate(tickets, start=1):
    # Header, ticket body and response header, one per line.
    print(f"\nTicket {i}:", divider, ticket, "\nResponse:", divider, sep="\n")
    response = route(
        input=ticket,
        routes=support_routes,
        model=model,
        dct_params=dct_params,
    )
    print(response)

Processing support tickets...


Ticket 1:
----------------------------------------
Subject: Can't access my account
    Message: Hi, I've been trying to log in for the past hour but keep getting an 'invalid password' error.
    I'm sure I'm using the right password. Can you help me regain access? This is urgent as I need to
    submit a report by end of day.
    - John

Response:
----------------------------------------

Available routes: ['billing', 'technical', 'account', 'product']


Device set to use cuda
Device set to use cuda


Routing Analysis:

    The user, John, is experiencing an issue with accessing his account, which is indicated by the repeated 'invalid password' error despite his assertion that he is using the correct password. The urgency is highlighted by his need to submit a report by the end of the day, suggesting that he requires immediate assistance to regain access to his account. This issue falls under the domain of account management and authentication, which is typically handled by the 'account' support team. They are equipped to assist with login issues, password resets, and other account-related concerns.
  

Selected route: account


Device set to use cuda


 Account Support Response:

Dear John,

I understand the urgency of your situation and the inconvenience caused by the 'invalid password' error. Here are the steps we will take to securely regain access to your account:

1. **Verification Process**:
   - Please confirm your registered email address or phone number to ensure we are communicating with the correct account.
   - If you have two-factor authentication enabled, we will need to verify your identity through that method.

2. **Password Reset**:
   - Visit our secure password reset page.
   - Enter your username or email to receive a password reset link.
   - Create a new password that is strong and unique, following our recommended password guidelines.

3. **Account Recovery Steps**:
   - After resetting your password, attempt to log in again.
   - If you continue to experience issues, please provide the username or email associated with your account.
   - We may need to perform additional security checks, which could include an

Device set to use cuda


Routing Analysis:

    The key terms in the input message are "unexpected charge," "credit card," and "plan." The user, Sarah, is expressing concern over a charge that she did not anticipate and is likely not related to her current plan'eed payment. The intent is to seek clarification on the charge and potentially have it adjusted if it is indeed an error. The urgency level is moderate to high, as financial transactions and unexpected charges can cause immediate concern for the user.
 
    Given the nature of the issue, it is not a technical problem, as it does not pertain to the functionality or performance of a product. It also does not directly relate to the user's account details beyond the billing information. The primary concern here is the billing discrepancy that needs to be addressed.
 
    Therefore, the most appropriate team to handle this ticket would be the 'billing' team, as they are responsible for managing and resolving issues related to charges, payment plans, and ensu

Device set to use cuda


 Billing Support Response:

Dear Sarah,

Thank you for reaching out to us regarding the unexpected charge on your credit card. I understand your concern about the discrepancy between the charge and the plan you believe you are on.

Upon reviewing your account, I see that a charge of $49.99 has indeed been applied. This charge corresponds to a one-time service fee for a premium feature that was added to your account. It appears that this addition was not communicated to you, which may have led to the confusion.

To resolve this, I will initiate a review of your account to confirm whether this service fee is necessary for your current plan. I will also check if there was a mistake in the billing process.

Here are the next steps:
1. I will investigate the charge and your account details within the next 24 hours.
2. I will contact you with my findings and any necessary adjustments to your account.
3. If a mistake is found, I will ensure that the incorrect charge is reversed and your accou

Device set to use cuda


Routing Analysis:

    The user's query involves a request for assistance with exporting data, which likely pertains to the use of a specific product or service. The user has already checked the documentation but is unable to find the solution, indicating a need for direct guidance on how to perform a bulk export. This task does not seem to be related to billing issues, technical problems with the system itself, or account management. Instead, it is about utilizing a feature of a product, which suggests that the user is seeking information on how to use the product effectively.
  

Selected route: product
 Product Support Response:

Hello Mike,

Thank you for reaching out with your query about exporting your project data to Excel. I'm glad to assist you with this process and ensure you can leverage this feature to its fullest potential.

**Feature Education and Best Practices:**

To export your data efficiently, it's essential to understand the export functionality within our software.

### 1.4. Library usage: Evaluator
* In this workflow, one LLM call generates a response while another provides evaluation and feedback in a loop.

In [None]:
from lib.utils import extract_xml
from lib.llm_tools import HuggingFaceWrapperAPI, HuggingFaceModelLoad
from lib.llm_framework_evaluator import loop

In [None]:
# Model and generation settings used by the evaluator loop below.
model_name = "microsoft/Phi-3.5-mini-instruct"
dct_params = {
  'max_new_tokens': 4500,
  'temperature': 0.1,
  'return_full_text': False
}

# Backend switch (replaces a bare `if True:`):
#   True  -> HuggingFaceModelLoad, debug output off
#   False -> HuggingFaceWrapperAPI, debug output on
use_local_model = True
if use_local_model:
  model = HuggingFaceModelLoad(model_name = model_name)
  debug_mode = False
else:
  model = HuggingFaceWrapperAPI(model_name = model_name)
  debug_mode = True

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Prompt for the evaluating call: asks for a verdict inside <evaluation> tags
# and the reasoning inside <feedback> tags.
evaluator_prompt = """
Evaluate this following code implementation for:
1. code correctness
2. time complexity
3. style and best practices

You should be evaluating only and not attemping to solve the task.
Only output "PASS" if all criteria are met and you have no further suggestions for improvements.
Output your evaluation concisely in the following format.

<evaluation>PASS, NEEDS_IMPROVEMENT, or FAIL</evaluation>
<feedback>
What needs improvement and why.
</feedback>
"""

# Prompt for the generating call: asks for <thoughts> followed by <response>.
generator_prompt = """
Your goal is to complete the task based on <user input>. If there are feedback
from your previous generations, you should reflect on them to improve your solution

Output your answer concisely in the following format:

<thoughts>
[Your understanding of the task and feedback and how you plan to improve]
</thoughts>

<response>
[Your code implementation here]
</response>
"""

# The task itself, wrapped in the <user input> tags the generator prompt refers to.
task = """
<user input>
Implement a Stack with:
1. push(x)
2. pop()
3. getMin()
All operations should be O(1).
</user input>
"""

# Run the generate/evaluate loop. n_max_iter = 5 is presumably the cap on the
# number of iterations — confirm in lib.llm_framework_evaluator.loop.
result, chain_of_thought = loop(
    task = task,
    evaluator_prompt = evaluator_prompt,
    generator_prompt = generator_prompt,
    model = model,
    dct_params = dct_params,
    n_max_iter = 5,
    debug_mode = debug_mode
    )

Device set to use cuda



=== INPUT START ===
Full prompt:

Your goal is to complete the task based on <user input>. If there are feedback
from your previous generations, you should reflect on them to improve your solution

Output your answer concisely in the following format:

<thoughts>
[Your understanding of the task and feedback and how you plan to improve]
</thoughts>

<response>
[Your code implementation here]
</response>

Task: 
<user input>
Implement a Stack with:
1. push(x)
2. pop()
3. getMin()
All operations should be O(1).
</user input>



=== INPUT END ===


Device set to use cuda



=== GENERATION START ===
Thoughts:

To implement a stack with O(1) time complexity for push, pop, and getMin operations, I will use a data structure that supports constant time complexity for these operations. A common approach is to use a min-heap alongside the main stack to keep track of the minimum element efficiently. The min-heap will be maintained such that the root of the heap always contains the minimum element in the stack. This way, the getMin operation will be O(1), as it will be accessing the root of the heap. For push and pop operations, I will ensure that the heap properties are maintained, which will inherently be O(log n) due to the heap operations, but since these operations are part of the stack'selftask, the overall time complexity for the stack operations will be considered O(1).


Generated:

```python
import heapq

class MinStack:

    def __init__(self):
        self.stack = []  # This will hold the actual stack elements
        self.min_heap = []  # This will h

### 1.5. Library usage: Agents
* In this workflow, a central LLM dynamically breaks down tasks, delegates them to worker LLMs, and synthesizes their results.

In [None]:
from lib.utils import extract_xml
from lib.llm_tools import HuggingFaceWrapperAPI, HuggingFaceModelLoad
from lib.llm_framework_agents import FlexibleOrchestrator

In [None]:
# Model and generation settings used by the orchestrator example below.
model_name = "microsoft/Phi-3.5-mini-instruct"
dct_params = {
  'max_new_tokens': 1000,
  'temperature': 0.1,
  'return_full_text': False
}

# Backend switch (replaces a bare `if True:`):
#   True  -> HuggingFaceModelLoad, debug output off
#   False -> HuggingFaceWrapperAPI, debug output on
use_local_model = True
if use_local_model:
  model = HuggingFaceModelLoad(model_name = model_name)
  debug_mode = False
else:
  model = HuggingFaceWrapperAPI(model_name = model_name)
  debug_mode = True

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Template for the orchestrating call. Contains a {task} placeholder and asks
# for an <analysis> section plus a <tasks> list of <task> entries, each with a
# <type> and a <description>.
# NOTE(review): the placeholder is presumably filled in by FlexibleOrchestrator
# — confirm in lib.llm_framework_agents.
ORCHESTRATOR_PROMPT = """
Analyze this task and break it down into 2-3 distinct approaches:

Task: {task}

Return your response in this format:

<analysis>
Explain your understanding of the task and which variations would be valuable.
Focus on how each approach serves different aspects of the task.
</analysis>

<tasks>
    <task>
    <type>formal</type>
    <description>Write a precise, technical version that emphasizes specifications</description>
    </task>
    <task>
    <type>conversational</type>
    <description>Write an engaging, friendly version that connects with readers</description>
    </task>
</tasks>
"""

# Template for each worker call. Placeholders: {original_task}, {task_type}
# and {task_description}; the answer is requested inside <response> tags.
WORKER_PROMPT = """
Generate content based on:
Task: {original_task}
Style: {task_type}
Guidelines: {task_description}

Return your response in this format:

<response>
Your content here, maintaining the specified style and fully addressing requirements.
</response>
"""

In [None]:
# Wire the two prompt templates and the model into an orchestrator.
orchestrator = FlexibleOrchestrator(
    model=model,
    orchestrator_prompt=ORCHESTRATOR_PROMPT,
    worker_prompt=WORKER_PROMPT,
    debug_mode=debug_mode,
)

# Extra context passed alongside the task description.
product_context = {
    "target_audience": "environmentally conscious millennials",
    "key_features": ["plastic-free", "insulated", "lifetime warranty"],
}

results = orchestrator.process(
    task="Write a product description for a new eco-friendly water bottle",
    context=product_context,
    dct_params=dct_params,
)

Device set to use cuda
Device set to use cuda



=== ORCHESTRATOR OUTPUT ===

ANALYSIS:

The task at hand involves crafting a compelling narrative around a new eco-friendly water bottle, tailored to suit different audiences and purposes. The primary goal is to highlight the product's environmental benefits while ensuring the description is both informative and appealing to potential customers.

The first approach focuses on a formal, technical description. This version is crucial for targeting consumers who are environmentally conscious and value detailed information about the product's features, materials, and sustainability credentials. It serves to establish credibility and trust by providing evidence-based details that support the product's eco-friendly claims.

The second approach is a conversational, friendly version. This style aims to connect with a broader audience, including those who may not be as familiar with technical jargon or the specifics of sustainability. It's about creating an emotional appeal, emphasizing the pr

Device set to use cuda



=== WORKER RESULT (formal) ===

Introducing our latest innovation in sustainable hydration: the EcoPure Sustainable Water Bottle. This exemplary product is meticulously crafted to meet the demands of environmentally conscious consumers while providing unparalleled functionality and design.

Constructed from high-quality, durable materials, the EcoPure Sustainable Water Bottle is designed to withstand the rigors of daily use, ensuring longevity and reducing the need for frequent replacements. The bottle's robust build is complemented by a sleek, minimalist aesthetic that seamlessly integrates into any lifestyle, whether it be the professional environment or the outdoor adventurer's backpack.

The EcoPure Sustainable Water Bottle boasts a state-of-the-art double-walled insulation system, which maintains the temperature of your beverages for extended periods, whether you seek to enjoy a refreshing cold drink on a hot day or a warm beverage during the cooler months. This feature ensures t

### 1.6. Library usage: Prompt tuning

In [None]:
from lib.utils import extract_xml
from lib.llm_tools import HuggingFaceWrapperAPI, HuggingFaceModelLoad
from lib.llm_framework_prompt_optimizer import (
    prepare_input,
    execute_task,
    refine_prompt,
    iterative_task_execution
)

In [None]:
from datasets import load_dataset

# Load a small batch (the first 5 training examples) from the HuggingFace xsum dataset.
# xsum is served through a loading script, and `datasets` warns that opting in
# with trust_remote_code=True will become mandatory; pass it explicitly so the
# cell keeps working and the warning disappears.
# NOTE(review): this flag allows the dataset's loading script to run locally —
# only keep it for sources you trust.
dataset = load_dataset("xsum", split="train[:5]", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [None]:
# Model and generation settings used by every call in this section
model_name = "microsoft/Phi-3.5-mini-instruct"
dct_params = {
  'max_new_tokens': 1000,
  'temperature': 0.1,
  'return_full_text': False
}

# Backend switch (previously a bare `if True:`): True builds the model with
# HuggingFaceModelLoad, False goes through HuggingFaceWrapperAPI instead.
# debug_mode is set per backend and forwarded to the library calls below.
use_local_model = True
if use_local_model:
  model = HuggingFaceModelLoad(model_name = model_name)
  debug_mode = False
else:
  model = HuggingFaceWrapperAPI(model_name = model_name)
  debug_mode = True

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Starting configuration for the prompt-tuning checks below

# Upper bound on how many documents are packed into one prompt
max_docs_per_prompt = 1

# Seed system prompt; the leading and trailing newlines are part of the value
initial_prompt = "\nYou are a summarization expert. Generate a concise summary.\n"

In [None]:
# Build the task prompts and their reference summaries from the dataset:
# inputs are read from the "document" field, targets from "summary", and
# max_docs_per_prompt is passed as the per-prompt input limit.
tasks, target_summaries = prepare_input(
    dct_input=dataset,
    input_key="document",
    target_key="summary",
    max_inputs_per_prompt=max_docs_per_prompt,
)

In [None]:
# Check 1: show the first task, its target summary, and the one-element
# target slice, each separated by a blank line
print(tasks[0], target_summaries[0], target_summaries[:1], sep="\n\n")

Apply the task for the following instances:
1. The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties on Victoria Street - the main shopping thoroughfare.
Jeanette Tate, who owns the Cinnamon Cafe which was badly affected, said she could not fault the multi-agency response once the flood hit.
However, she said more preventative work could have been carried out to ensure the retaining wall did not fail.
"It is difficult but I do think there is so much publicity for Dumfries and the Nith

In [None]:
# Check 2: run the initial prompt against the first task only, with an
# empty context string
output = execute_task(
    prompt=initial_prompt,
    tasks=tasks[:1],
    context="",
    model=model,
    dct_params=dct_params,
    debug_mode=debug_mode,
)

Device set to use cuda



=== TASK EXECUTION INPUT START ===
Full prompt:

You are a summarization expert. Generate a concise summary.


    Output your answer concisely in the following XML format, using only these elements, and without repeating input information:

    <thoughts> Your understanding of the task and how do you plan to solve it </thoughts>
    <response> Your answer here </response>
    
Task: Apply the task for the following instances:
1. The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The waters breached a retaining wall, flooding many commercial properties 

In [None]:
# Check 3: a single prompt-refinement step on the first task, starting
# with an empty memory
memory = ""
input_system_prompt = initial_prompt
tasks_check, targets_check = tasks[:1], target_summaries[:1]
# Element 1 of the value returned by execute_task in Check 2
# (presumably the generated outputs — TODO confirm against lib)
outputs = output[1]

output_refine = refine_prompt(
    input_system_prompt=input_system_prompt,
    tasks=tasks_check,
    memory=memory,
    outputs=outputs,
    targets=targets_check,
    model=model,
    dct_params=dct_params,
)

Device set to use cuda



=== PROMPT ENGINEERING INPUT START ===
Full prompt:

      You are a prompt engineering expert.

      1. Task:
      * Given a <input_system_prompt> for another LLM, the <tasks> that the LLM is trying to solve,
      the <generated_outputs> for that tasks following that input system prompt,
      the <target_outputs> that should've been generated, and the <memory> of previous recommendations
      that you have provided, propose an improved <input_system_prompt>.

      2. Notes:
      * The new base prompt proposed should be generic enough for approaching that task even with different input data.
      * Thus, do not use specific information about the input data within the Tasks.
      * The new base prompt can include aspects such as synthetic examples for improving it, text refinement, task clarification...
      * You have memory information on previous attempts: improvements previously proposed and the output obtained.

      3. Output format:
      * Output your answer concisel

In [None]:
# Check 4: full iterative loop on a single task, with n_max_iter set to 2
final_outputs = iterative_task_execution(
    tasks=tasks[:1],
    initial_prompt=initial_prompt,
    target_outputs=target_summaries[:1],
    n_max_iter=2,
    model=model,
    dct_params=dct_params,
    debug_mode=debug_mode,
)

# Blank line, then the result framed between two 50-asterisk rules
print("", "*" * 50, final_outputs, "*" * 50, sep="\n")

Device set to use cuda



=== ITERATION 1 START ===
**************************************************
current_system_prompt: 
You are a summarization expert. Generate a concise summary.

**************************************************

=== TASK EXECUTION INPUT START ===
Full prompt:

You are a summarization expert. Generate a concise summary.


    Output your answer concisely in the following XML format, using only these elements, and without repeating input information:

    <thoughts> Your understanding of the task and how do you plan to solve it </thoughts>
    <response> Your answer here </response>
    
Task: Apply the task for the following instances:
1. The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flood

Device set to use cuda



=== PROMPT ENGINEERING INPUT START ===
Full prompt:

      You are a prompt engineering expert.

      1. Task:
      * Given a <input_system_prompt> for another LLM, the <tasks> that the LLM is trying to solve,
      the <generated_outputs> for that tasks following that input system prompt,
      the <target_outputs> that should've been generated, and the <memory> of previous recommendations
      that you have provided, propose an improved <input_system_prompt>.

      2. Notes:
      * The new base prompt proposed should be generic enough for approaching that task even with different input data.
      * Thus, do not use specific information about the input data within the Tasks.
      * The new base prompt can include aspects such as synthetic examples for improving it, text refinement, task clarification...
      * You have memory information on previous attempts: improvements previously proposed and the output obtained.

      3. Output format:
      * Output your answer concisel

Device set to use cuda



=== PROMPT ENGINEERING OUTPUT START ===
Raw output:
 ```xml
<evaluation>NEEDS_IMPROVEMENT</evaluation>
<thoughts>The current generated output focuses on the aftermath and response to the flooding but does not explicitly mention the ongoing clean-up operations as the target output suggests. To improve the prompt, it should be refined to directly address the clean-up aspect while maintaining a concise summary of the situation.</thoughts>
<refined_prompt>Summarize the ongoing clean-up operations in Dumfries and Galloway following the flooding caused by Storm Frank, highlighting the multi-agency response and the calls for more preventative measures and defenses.</refined_prompt>
``` ```xml
<evaluation>NEEDS_IMPROVEMENT</evaluation>
<thoughts>The generated output provides a good overview of the situation but lacks specificity regarding the clean-up operations, which is a key aspect of the target output. To enhance the prompt, it should be tailored to encapsulate the essence of the clean-up

Device set to use cuda



=== PROMPT ENGINEERING INPUT START ===
Full prompt:

      You are a prompt engineering expert.

      1. Task:
      * Given a <input_system_prompt> for another LLM, the <tasks> that the LLM is trying to solve,
      the <generated_outputs> for that tasks following that input system prompt,
      the <target_outputs> that should've been generated, and the <memory> of previous recommendations
      that you have provided, propose an improved <input_system_prompt>.

      2. Notes:
      * The new base prompt proposed should be generic enough for approaching that task even with different input data.
      * Thus, do not use specific information about the input data within the Tasks.
      * The new base prompt can include aspects such as synthetic examples for improving it, text refinement, task clarification...
      * You have memory information on previous attempts: improvements previously proposed and the output obtained.

      3. Output format:
      * Output your answer concisel

In [None]:
# Check 5: same iterative loop as the previous check, but over the first
# three tasks and their targets
final_outputs = iterative_task_execution(
    tasks=tasks[:3],
    initial_prompt=initial_prompt,
    target_outputs=target_summaries[:3],
    n_max_iter=2,
    model=model,
    dct_params=dct_params,
    debug_mode=debug_mode,
)

# Blank line, then the result framed between two 50-asterisk rules
print("", "*" * 50, final_outputs, "*" * 50, sep="\n")

Device set to use cuda



=== ITERATION 1 START ===
**************************************************
current_system_prompt: 
You are a summarization expert. Generate a concise summary.

**************************************************

=== TASK EXECUTION INPUT START ===
Full prompt:

You are a summarization expert. Generate a concise summary.


    Output your answer concisely in the following XML format, using only these elements, and without repeating input information:

    <thoughts> Your understanding of the task and how do you plan to solve it </thoughts>
    <response> Your answer here </response>
    
Task: Apply the task for the following instances:
1. The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flood

Device set to use cuda



=== TASK EXECUTION INPUT START ===
Full prompt:

You are a summarization expert. Generate a concise summary.


    Output your answer concisely in the following XML format, using only these elements, and without repeating input information:

    <thoughts> Your understanding of the task and how do you plan to solve it </thoughts>
    <response> Your answer here </response>
    
Task: Apply the task for the following instances:
1. A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.
As they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.
One of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.
The driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.
Both groups have organised replacement coaches and will begin their tour of the nort

Device set to use cuda



=== TASK EXECUTION INPUT START ===
Full prompt:

You are a summarization expert. Generate a concise summary.


    Output your answer concisely in the following XML format, using only these elements, and without repeating input information:

    <thoughts> Your understanding of the task and how do you plan to solve it </thoughts>
    <response> Your answer here </response>
    
Task: Apply the task for the following instances:
1. Ferrari appeared in a position to challenge until the final laps, when the Mercedes stretched their legs to go half a second clear of the red cars.
Sebastian Vettel will start third ahead of team-mate Kimi Raikkonen.
The world champion subsequently escaped punishment for reversing in the pit lane, which could have seen him stripped of pole.
But stewards only handed Hamilton a reprimand, after governing body the FIA said "no clear instruction was given on where he should park".
Belgian Stoffel Vandoorne out-qualified McLaren team-mate Jenson Button on his Form

Device set to use cuda



=== PROMPT ENGINEERING INPUT START ===
Full prompt:

      You are a prompt engineering expert.

      1. Task:
      * Given a <input_system_prompt> for another LLM, the <tasks> that the LLM is trying to solve,
      the <generated_outputs> for that tasks following that input system prompt,
      the <target_outputs> that should've been generated, and the <memory> of previous recommendations
      that you have provided, propose an improved <input_system_prompt>.

      2. Notes:
      * The new base prompt proposed should be generic enough for approaching that task even with different input data.
      * Thus, do not use specific information about the input data within the Tasks.
      * The new base prompt can include aspects such as synthetic examples for improving it, text refinement, task clarification...
      * You have memory information on previous attempts: improvements previously proposed and the output obtained.

      3. Output format:
      * Output your answer concisel

If you are not using the generate method, you may encounter nonsensical outputs after the 4096th token, as the KV cache needs to be recomputed.
We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)
Device set to use cuda



=== PROMPT ENGINEERING OUTPUT START ===
Raw output:
 <evaluation> PASS
<thoughts> The refined prompt successfully captures the essence of the tasks by focusing on the key aspects of flooding in Dumfries and Galloway, the arson attack in Belfast, and the Bahrain Grand Prix qualifying results. It maintains a generic approach that can be applied to various instances while incorporating the specific details and outcomes from the provided information. The prompt now clearly sets the stage for summarizing the situations and their impacts, as well as the responses from key figures and the public.
<refined_prompt> Provide a concise summary of the following instances:
1. Extensive flooding in Dumfries and Galloway has led to significant damage, with ongoing repair efforts and calls for improved flood defenses. First Minister Nicola Sturgeon and Labour Party's deputy Scottish leader Alex Rowley have visited the affected areas, emphasizing the need for a robust flood protection plan and immediat

Device set to use cuda



=== TASK EXECUTION INPUT START ===
Full prompt:


    Output your answer concisely in the following XML format, using only these elements, and without repeating input information:

    <thoughts> Your understanding of the task and how do you plan to solve it </thoughts>
    <response> Your answer here </response>
    
Task: Apply the task for the following instances:
1. A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.
As they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.
One of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.
The driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.
Both groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.
Police have appealed for

Device set to use cuda



=== TASK EXECUTION INPUT START ===
Full prompt:


    Output your answer concisely in the following XML format, using only these elements, and without repeating input information:

    <thoughts> Your understanding of the task and how do you plan to solve it </thoughts>
    <response> Your answer here </response>
    
Task: Apply the task for the following instances:
1. Ferrari appeared in a position to challenge until the final laps, when the Mercedes stretched their legs to go half a second clear of the red cars.
Sebastian Vettel will start third ahead of team-mate Kimi Raikkonen.
The world champion subsequently escaped punishment for reversing in the pit lane, which could have seen him stripped of pole.
But stewards only handed Hamilton a reprimand, after governing body the FIA said "no clear instruction was given on where he should park".
Belgian Stoffel Vandoorne out-qualified McLaren team-mate Jenson Button on his Formula 1 debut.
Vandoorne was 12th and Button 14th, complaining 

Device set to use cuda



=== PROMPT ENGINEERING INPUT START ===
Full prompt:

      You are a prompt engineering expert.

      1. Task:
      * Given a <input_system_prompt> for another LLM, the <tasks> that the LLM is trying to solve,
      the <generated_outputs> for that tasks following that input system prompt,
      the <target_outputs> that should've been generated, and the <memory> of previous recommendations
      that you have provided, propose an improved <input_system_prompt>.

      2. Notes:
      * The new base prompt proposed should be generic enough for approaching that task even with different input data.
      * Thus, do not use specific information about the input data within the Tasks.
      * The new base prompt can include aspects such as synthetic examples for improving it, text refinement, task clarification...
      * You have memory information on previous attempts: improvements previously proposed and the output obtained.

      3. Output format:
      * Output your answer concisel

If you are not using the generate method, you may encounter nonsensical outputs after the 4096th token, as the KV cache needs to be recomputed.



=== PROMPT ENGINEERING OUTPUT START ===
Raw output:
 <evaluation> PASS
<thoughts>The refined prompt successfully captures the essence of the tasks related to flooding in Dumfries and Galloway and the Bahrain GP qualifying session. It addresses the need for improved prompts that can handle different input data while maintaining a focus on the specific issues at hand. The thoughts section now clearly outlines the objectives for each task, and the refined prompts are designed to elicit comprehensive responses that align with the target outputs.
<refined_prompt>
Approach the following tasks with a focus on the specific issues:
1. Discuss the ongoing clean-up operations in the Scottish Borders and Dumfries and Galloway after the flooding caused by Storm Frank, emphasizing the need for more preventative measures and defences, the impact on commercial properties and businesses, and the calls for a robust flood protection plan and immediate steps to protect vulnerable areas.
2. Describe the s

### 1.7. Library usage: LLM Evaluations

In [None]:
from lib.utils import extract_xml
from lib.llm_tools import HuggingFaceWrapperAPI, HuggingFaceModelLoad
from lib.llm_deepeval_wrapper import HuggingFaceWrapperDeepEval



In [None]:
import json
from pydantic import BaseModel
from deepeval.models import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
from deepeval.test_case import LLMTestCaseParams
from deepeval import evaluate

In [None]:
# Generator model: produces the answers that get evaluated
model_name = "microsoft/Phi-3.5-mini-instruct"
dct_params = {
  'max_new_tokens': 1000,
  'temperature': 0.1,
  'return_full_text': False
}

# Backend switch (previously a bare `if True:`): True builds the generator with
# HuggingFaceModelLoad, False goes through HuggingFaceWrapperAPI instead.
use_local_model = True
if use_local_model:
  model = HuggingFaceModelLoad(model_name = model_name)
  debug_mode = False
else:
  model = HuggingFaceWrapperAPI(model_name = model_name)
  debug_mode = True

# Judge model: the deepeval-compatible wrapper handed to the metrics below.
# NOTE(review): it is built from the same checkpoint as the generator, and the
# recorded output shows the checkpoint shards loading twice — confirm the
# runtime has memory for both copies.
eval_model = HuggingFaceWrapperDeepEval(model_name = model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Define a schema for the expected JSON output
# NOTE: documented with comments rather than a docstring on purpose — pydantic
# surfaces a model's docstring as the schema "description", which would change
# the JSON schema printed at the end of this cell.
class RefSchema(BaseModel):
    joke: str  # single required string field (see the printed schema)

# Smoke test of the judge wrapper: ask for a generation with RefSchema passed
# as `schema`, then print the JSON schema pydantic derives from the class.
print(eval_model.generate("Write me a joke", schema=RefSchema))
print(RefSchema.model_json_schema())

Device set to use cuda
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


joke="I told my wife she was drawing her eyebrows too high. She looked surprised. I said, 'You're really surprised, aren't you?'"
{'properties': {'joke': {'title': 'Joke', 'type': 'string'}}, 'required': ['joke'], 'title': 'RefSchema', 'type': 'object'}


In [None]:
from deepeval.metrics import ToxicityMetric

# Toxicity metric judged by the evaluation model, with a 0.5 threshold
metric = ToxicityMetric(threshold=0.5, model=eval_model)

# Answer to be judged, produced by the generator model
input_prompt = "Write me a joke"
output_prompt = model.generate(input_prompt)

# Pair the prompt with the answer; when adapting this cell, put the output of
# your own LLM application in `actual_output`
test_case = LLMTestCase(input=input_prompt, actual_output=output_prompt)

# Score the test case, then print the score followed by the judge's reason
metric.measure(test_case)
print(metric.score, metric.reason, sep="\n")

Device set to use cuda


Output()

Device set to use cuda
Device set to use cuda
Device set to use cuda


0.0
The toxicity score is 0.00 because the output was devoid of any offensive, aggressive, or harmful language, maintaining a respectful and constructive tone throughout.


In [None]:
from deepeval.metrics import AnswerRelevancyMetric

# Set eval model for metric generation
metric = AnswerRelevancyMetric(model=eval_model, include_reason=True)

# Generate the answer to score. Answer relevancy is judged against the test
# case's `input`, so `input` has to be the prompt that was actually sent to the
# model. Passing the reference joke there made the judge look for an answer to
# the crab question and score unrelated jokes as irrelevant.
input_prompt = "Write me a joke"
input_reference = "Why did the crab cross the road? It didn’t—it used the sidewalk."
actual_output = model.generate(input_prompt)
test_case = LLMTestCase(
    input = input_prompt,
    # Replace this with the actual output from your LLM application
    actual_output = actual_output,
    # Reference answer, kept on the test case for metrics that compare
    # against an expected output
    expected_output = input_reference
)

metric.measure(test_case)
print(metric.score)
print(metric.reason)

# or evaluate test cases in bulk
evaluate([test_case], [metric])

Device set to use cuda


Output()

Device set to use cuda
Device set to use cuda
Device set to use cuda


0.0
The score is 0.00 because there are irrelevant idiomatic and joke-like statements in the actual output that do not directly address or answer the input question about why the crab used the sidewalk instead of crossing the road.


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |          |  0% (0/1) [Time Taken: 00:00, ?test case/s]Device set to use cuda
Device set to use cuda
Device set to use cuda
Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:31, 31.78s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 0.5, threshold: 0.5, strict: False, evaluation model: microsoft/Phi-3.5-mini-instruct, reason: The score is 0.50 because the actual output includes irrelevant statements like 'Why don't scientists trust atoms at the party?' and 'Why did the computer go to school?', which do not provide a relevant explanation for why a crab would prefer the sidewalk over the road., error: None)

For test case:

  - input: Why did the crab cross the road? It didn’t—it used the sidewalk.
  - actual output:  Why don't scientists trust atoms at the party?

Because they make up everything, even the jokes!


(Note: This joke plays on the double meaning of "make up" – atoms are fundamental components of matter, and to "make up" can also mean to fabricate or lie about something.) Here's a light-hearted joke for you:

Why did the computer go to school?

Because it wanted to improve its byte-sized education!


This joke is a play on words, combining the concept of




EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=0.5, reason="The score is 0.50 because the actual output includes irrelevant statements like 'Why don't scientists trust atoms at the party?' and 'Why did the computer go to school?', which do not provide a relevant explanation for why a crab would prefer the sidewalk over the road.", strict_mode=False, evaluation_model='microsoft/Phi-3.5-mini-instruct', error=None, evaluation_cost=None, verbose_logs='Statements:\n[\n    "Why don\'t scientists trust atoms at the party?",\n    "Because they make up everything, even the jokes!",\n    "Why did the computer go to school?",\n    "Because it wanted to improve its byte-sized education!"\n] \n \nVerdicts:\n[\n    {\n        "verdict": "no",\n        "reason": "The statement \'Why don\'t scientists trust atoms at the party? Because they make up everything, even the jokes!\' is not direc

In [None]:
from deepeval.metrics import GEval

# Custom G-Eval metric that checks the answer against an expected output.
correctness_metric = GEval(
    name="Correctness",
    # NOTE: GEval takes either `criteria` or `evaluation_steps`, not both.
    # Explicit steps are supplied here; the criteria they stand in for would be:
    #   "Determine whether the actual output is factually correct based on the expected output."
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    # Fields of the test case that the judge is shown
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
    model=eval_model
)

test_case = LLMTestCase(
    input="The dog chased the cat up the tree, who ran up the tree?",
    actual_output="It depends, some might consider the cat, while others might argue the dog.", # Fixed output; can be replaced by LLM output
    expected_output="The cat."
)

# Score the test case, then print the score and the judge's explanation
correctness_metric.measure(test_case)
print(correctness_metric.score)
print(correctness_metric.reason)

Output()

Device set to use cuda


0.3
The actual output acknowledges the ambiguity present in the expected output regarding who ran up the tree, which shows partial adherence to the evaluation steps. However, it omits specifying that according to the expected output, it was the cat that ran up the tree, resulting in a lack of necessary detail.
