In [1]:
import re
from typing import Optional

import pandas as pd
from datasets import load_dataset
from huggingface_hub import InferenceClient, notebook_login
from tqdm.auto import tqdm

In [2]:
# Install necessary packages
!pip install huggingface_hub datasets pandas tqdm -q

# Set up tqdm for progress bars
tqdm.pandas()

# Set pandas display options
pd.set_option("display.max_colwidth", None)

# Log in to Hugging Face Hub if needed
notebook_login()

# Initializing the InferenceClient with the LLM model
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
llm_client = InferenceClient(model=repo_id, timeout=120)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Smoke test: send a single short prompt through the client and print the
# completion, to confirm the endpoint responds before running any evaluation.
# max_new_tokens=20 caps the completion length, so the reply can stop mid-sentence.
response = llm_client.text_generation(prompt="What are your plans for today?", max_new_tokens=20)
print("LLM Response:", response)


LLM Response: 

I’m going to the gym, then I’m going to the grocery store,


In [4]:
# Prompt template for the LLM judge.
#
# - Rendered with str.format using two fields: {instruction} and
#   {generated_solution}. Because of that, any literal curly brace added to
#   the template itself must be doubled ("{{" / "}}").
# - Asks the model for a free-text "Evaluation:" followed by a
#   "Total rating:" between 1 and 5, using the rubric spelled out below.
#   "Total rating:" is also the default marker that extract_judge_score
#   searches for.
# - The template ends with "Evaluation: ", presumably so that the model's
#   completion continues directly with its rationale.
JUDGE_PROMPT = """
You will be given an Instruction (for the generation of an SAS code) and Generated Solution (SAS code) couple.
Your task is to provide a 'total rating' scoring how well the Generated Solution answers the user concerns expressed in the Instruction.
Give your answer on a scale of 1 to 5, where 1 means that the Generated Solution is not helpful at all, and 5 means that the Generated Solution completely and helpfully addresses the Instruction.



Here is the scale you should use to build your answer:
1: The Generated Solution is terrible: completely irrelevant to the Instruction given, very partial, or generates an error when run
2: The Generated Solution is mostly not helpful: misses some key aspects of the Instruction, or if there is a syntax error
3: The Generated Solution is mostly helpful: provides support, but still could be improved
4: The Generated Solution is excellent: relevant, direct, detailed, and addresses all the concerns raised in the Instruction
5: The Generated Solution exceeds expectations: relevant, direct, detailed, addresses all the concerns raised in the Instruction, and demonstrates exceptional innovation, optimization, or elegance.

Provide your feedback as follows:

Feedback:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the instruction and generated solution.

Instruction: {instruction}
Generated Solution: {generated_solution}

Provide your feedback.
Feedback:::
Evaluation: """

In [5]:
def extract_judge_score(answer: str, split_str: str = "Total rating:") -> Optional[float]:
    """Pull the numeric rating out of a judge LLM's free-text answer.

    The text following the first occurrence of ``split_str`` is searched for
    the first number (integer or decimal). If ``split_str`` does not occur in
    ``answer``, the whole answer is searched instead.

    Args:
        answer: Raw text returned by the judge model.
        split_str: Marker that precedes the rating in the expected format.

    Returns:
        The rating as a ``float``, or ``None`` when ``answer`` is not a string
        or contains no number in the searched text. A short diagnostic is
        printed in the ``None`` case. The value is not range-checked.
    """
    if not isinstance(answer, str):
        print("Error in extract_judge_score:", f"expected str, got {type(answer).__name__}")
        return None

    # Split only on the first marker so that everything after it is searched,
    # even if the model happens to repeat the marker later in its answer.
    if split_str and split_str in answer:
        rating_text = answer.split(split_str, 1)[1]
    else:
        rating_text = answer

    # First integer or decimal number, e.g. "4" or "3.5".
    match = re.search(r"\d+(?:\.\d+)?", rating_text)
    if match is None:
        print("Error in extract_judge_score:", "no numeric rating found")
        return None
    return float(match.group())

In [6]:
# Score a single (instruction, generated solution) pair with the judge LLM
def evaluate_with_llm(instruction, generated_solution):
    """Ask the judge LLM to rate one generated solution and return the score.

    The pair is substituted into JUDGE_PROMPT, sent through ``llm_client``,
    and the reply is parsed with ``extract_judge_score``. The raw reply and
    the parsed score are printed along the way.

    Returns:
        Whatever ``extract_judge_score`` returns for the reply, or ``None``
        if any step raised an exception (the exception is printed).
    """
    try:
        judge_prompt = JUDGE_PROMPT.format(
            instruction=instruction,
            generated_solution=generated_solution,
        )
        llm_output = llm_client.text_generation(prompt=judge_prompt, max_new_tokens=1000)
        print("LLM Response:", llm_output)  # Debugging line
        rating = extract_judge_score(llm_output)
        print("Extracted Score:", rating)  # Debugging line
    except Exception as e:
        print("Error occurred during LLM generation:", e)
        return None
    return rating

In [7]:
# Example 1: the generated solution is a SAS data step answering the
# instruction. The pair is sent to the judge and the parsed score is printed.
instruction = "Write an SAS code that generates a random number between 1 and 10"
generated_solution = """

data _null_;
    random_number = ceil(rand('uniform') * 10);
    put "Random number between 1 and 10: " random_number;
run;

"""
llm_evaluation_score = evaluate_with_llm(instruction, generated_solution)
print("LLM Evaluation Score:", llm_evaluation_score)


LLM Response: 
The generated solution is excellent. It is relevant, direct, detailed, and addresses all the concerns raised in the instruction. The code generates a random number between 1 and 10, and the put statement prints the result.
Total rating: 4
Extracted Score: 4.0
LLM Evaluation Score: 4.0


In [8]:
# Example 2: same instruction, but the generated solution is Python-style
# code instead of SAS (and contains the misspelt names `arandom` / `randit`,
# presumably on purpose to act as a bad answer).
instruction = "Write an SAS code that generates a random number between 1 and 10"
generated_solution = """
import random
random_number = arandom.randit(1, 10)
print("Random number between 1 and 10:", random_number)
"""
llm_evaluation_score = evaluate_with_llm(instruction, generated_solution)
print("LLM Evaluation Score:", llm_evaluation_score)


LLM Response: 
The Generated Solution is terrible: completely irrelevant to the Instruction given, very partial, or generates an error when run.
The Generated Solution is written in Python, not SAS.
Total rating: 1
Extracted Score: 1.0
LLM Evaluation Score: 1.0


In [9]:
# Example 3: a plain-language instruction and answer (no code involved),
# run through the same SAS-oriented judge prompt.
instruction = "Give a sentence in which you describe an apple."
generated_solution = "An apple is like a car but also like a banana."
llm_evaluation_score = evaluate_with_llm(instruction, generated_solution)
print("LLM Evaluation Score:", llm_evaluation_score)


LLM Response: 
The Generated Solution is mostly not helpful. It does not describe an apple, but rather compares it to a car and a banana.
Total rating: 2
Extracted Score: 2.0
LLM Evaluation Score: 2.0


In [10]:
# Batch variant: judge several (instruction, solution) pairs and average the scores
def evaluate_with_llm2(instructions, generated_solutions):
    """Return the mean judge score over paired instructions and solutions.

    The two iterables are walked in lockstep with ``zip`` (so iteration stops
    at the shorter one). Each pair is substituted into JUDGE_PROMPT, sent
    through ``llm_client``, and parsed with ``extract_judge_score``.

    Pairs whose request raises an exception (printed) or whose reply yields
    no score are left out of the average.

    Returns:
        The arithmetic mean of the collected scores, or ``None`` when no pair
        produced a score.
    """
    valid_scores = []
    for task, candidate in zip(instructions, generated_solutions):
        try:
            judge_prompt = JUDGE_PROMPT.format(instruction=task, generated_solution=candidate)
            llm_output = llm_client.text_generation(prompt=judge_prompt, max_new_tokens=1000)
            rating = extract_judge_score(llm_output)
        except Exception as e:
            print("Error occurred during LLM generation:", e)
            continue
        # Unparseable replies come back as None and must not enter the mean.
        if rating is not None:
            valid_scores.append(rating)

    if not valid_scores:
        return None
    return sum(valid_scores) / len(valid_scores)
