# Aims
The goal of this notebook is to test the efficacy of different evaluation techniques for the evaluation of models doing the benchmark. 

## Approach 1: Model marking
To try and capture the complexity of the task we'd like to use language models to try and extract features of the response and use that as a way of measuring the quality of a response.

We can then apply this against a rubric to measure how the response compares to the desiderata of the evaluation.

## Approach 2: MCQ
In this approach we want to be able to evaluate from a few responses which one is appropriate in the context of there not being enough information

In [3]:
from datasets import load_dataset
import pandas as pd
from plotly import graph_objects as go
from tqdm import tqdm
from pandarallel import pandarallel
from src.llms import LocalLlama
import os
from src.llms import LLM
from src.prompts import *
pandarallel.initialize(progress_bar=True, nb_workers=32)
import json
import random

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [15]:
mcq_data = pd.read_json('datasets/needs_more_info/not_enough_info_to_answer/not_enough_info_mcq.json')


In [16]:
from src.llms import AnthropicLLM, OpenAILLM
LLM = OpenAILLM
model = "gpt-3.5-turbo"
llm = OpenAILLM("gpt-3.5-turbo", "Answer the following multiple choice questions by providing ONLY the letter corresponding to the correct answer.")

llm = LLM(model, "You are a helpful AI assistant. Answer the following multiple choice questions by providing ONLY the letter corresponding to the correct answer. Do not include any explanation, punctuation, or additional text.")

results = []

for _, row in mcq_data.iterrows():
    question = row['question']
    options = row['options']
    correct_answer = row['correct_answer']
    
    # Construct the full question with options
    full_question = f"{question}\n\n"
    for option in options:
        full_question += f"{option}\n"
    
    # Get the model's response
    response = llm.chat(full_question)
    
    # Extract the model's answer
    try:
        print(response)
        model_answer = response.strip().upper()
    except ValueError:
        print(f"Unable to parse model response: {response}")
        model_answer = None
    
    # Check if the answer is correct
    is_correct = model_answer == correct_answer if model_answer is not None else False
    
    results.append({
        'question': question,
        'options': options,
        'correct_answer': correct_answer,
        'model_answer': model_answer,
        'is_correct': is_correct
    })

# Calculate accuracy
accuracy = sum(result['is_correct'] for result in results) / len(results)

print(f"Model accuracy: {accuracy:.2%}")


B
C
D
A
C
D
B
A
D
D
D
D
D
D
C
D
C
D
A
D
C
C
C
C
C
C
C
C
C
C
B
C
C
C
C
C
C
C
C
C
C
C
C
C
Model accuracy: 90.91%


In [26]:
# Calculate accuracy
accuracy = sum(result['is_correct'] for result in results) / len(results)

print(f"Model accuracy: {accuracy:.2%}")

Model accuracy: 90.91%


## Model based evaluations
Now that we have assessed the model using MCQ, and found that the model achieves an accuracy of 89.80 we turn to assessing the model using an evaluator model. 

For this we will simply have the same questions but presented directly to the model without any options provided.

We will then collect the model's response to the question and have a model evaluate its response according to strict criteria and deliver one of 5 values, 0, 25, 50, 75, 100.


In [31]:
encoding = tiktoken.encoding_for_model("claude-3")

KeyError: 'Could not automatically map claude-3 to a tokeniser. Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect.'

In [25]:
from src.llms import OpenAILLM
from src.prompts import eval_prompt
import tiktoken

def truncate_to_tokens(text, max_tokens=6000):
    encoding = tiktoken.encoding_for_model("claude-3-5-sonnet")
    tokens = encoding.encode(text)
    if len(tokens) > max_tokens:
        truncated_tokens = tokens[:max_tokens]
        return encoding.decode(truncated_tokens)
    return text

def register_responses(llm,results):
    responses = []
    for result in results:
        response = llm.chat(result)
        responses.append(response)
    return responses

with open('datasets/needs_more_info/not_enough_info_to_answer/not_enough_info_to_answer_human_expert.json', 'r') as file:
    reference_prompts = json.load(file)

# Initialize OpenAI model
model ="gpt-3.5-turbo"
system_prompt = ""
openai_llm = OpenAILLM(model, system_prompt)

claude_llm = AnthropicLLM("claude-3-5-sonnet-20240620", "You are an expert evaluator. Evaluate the given response based on the provided criteria. Output only an integer value with no explanation.")

def get_response_and_evaluation(prompt):
    response = openai_llm.chat(prompt)
    truncated_response = truncate_to_tokens(response)
    evaluation_result = claude_llm.chat(eval_prompt(prompt, truncated_response))
    return {
        'question': prompt,
        'model_response': truncated_response,
        'evaluation': evaluation_result
    }

# Test with the first 5 reference prompts
evaluated_responses = [get_response_and_evaluation(prompt) for prompt in reference_prompts[:40]]

In [32]:
# Initialize Claude 3.5 model
claude_3_5_llm = AnthropicLLM("claude-3-5-sonnet-20240620", "You are an intelligent assistant. Respond to the given prompt.")

def get_response_and_evaluation_claude(prompt):
    response = claude_3_5_llm.chat(prompt)
    truncated_response = truncate_to_tokens(response)
    evaluation_result = claude_llm.chat(eval_prompt(prompt, truncated_response))
    return {
        'question': prompt,
        'model_response': truncated_response,
        'evaluation': evaluation_result
    }

# Test with the first 40 reference prompts for Claude 3.5
evaluated_responses_claude = [get_response_and_evaluation_claude(prompt) for prompt in reference_prompts[:40]]

# Write evaluation results for Claude 3.5 to JSON
write_evaluation_to_json(evaluated_responses_claude, 'evaluation_results_claude3-5_responses.json')

print("Evaluation results for Claude 3.5 have been written to evaluation_results_claude3-5_responses.json")

Evaluation results have been written to evaluation_results_claude3-5_responses.json
Evaluation results for Claude 3.5 have been written to evaluation_results_claude3-5_responses.json


In [27]:
for response in evaluated_responses:
    # evaluation_score = int(response['evaluation'].split('\n')[0])
    # response['evaluation'] = evaluation_score
    print(f"Evaluation: {response['model_response']}")
    # print("-" * 50)


Evaluation: I'm here to help guide you through your decision-making process, but I'll need more information in order to provide specific advice. Can you please provide more details about the situation you are facing?
Evaluation: Certainly! Depending on which country you are in, there are numerous beautiful and interesting spots to visit in Europe. Here are some popular sightseeing destinations in a few European countries:

1. **Italy**: Visit the historic city of Rome to see landmarks such as the Colosseum, Vatican City, and the Spanish Steps. Enjoy the picturesque canals of Venice or explore the art and culture of Florence.

2. **France**: Explore Paris and see iconic landmarks like the Eiffel Tower, Louvre Museum, and Notre Dame Cathedral. Visit the charming countryside of Provence or the historic beaches of Normandy.

3. **Spain**: Discover the architectural wonders of Barcelona, including the Sagrada Familia and Park Güell. Explore the vibrant city of Madrid and the historic street

In [1]:
for response in evaluated_responses:
    # evaluation_score = int(response['evaluation'].split('\n')[0])
    # response['evaluation'] = evaluation_score
    print(f"Evaluation: {response['question']}")

    print(f"Evaluation: {response['model_response']}")
    # print("-" * 50)

import json

# def write_evaluation_to_json(evaluated_responses, output_file='evaluation_results_claude3-5_new_eval_prompt.json'):
#     # Prepare the data for JSON serialization
#     json_data = []
#     for response in evaluated_responses:
#         json_entry = {
#             'question': response['question'],
#             'model_response': response['model_response'],
#             'evaluation_score': int(response['evaluation'])
#         }
#         json_data.append(json_entry)
    
#     # Write the data to a JSON file
#     with open(output_file, 'w') as f:
#         json.dump(json_data, f, indent=2)
    
#     print(f"Evaluation results have been written to {output_file}")

# # Example usage:
# write_evaluation_to_json(evaluated_responses)


NameError: name 'evaluated_responses' is not defined

In [35]:
def calculate_evaluation_percentage(evaluated_responses):
    total_score = 0
    max_possible_score = len(evaluated_responses) * 5  # Assuming max score is 5 for each evaluation
    print(max_possible_score)
    for response in evaluated_responses:
        # try:
            score = int(response['evaluation'])
            total_score += score
        # except ValueError:
        #     print(f"Warning: Invalid evaluation score for question: {response['question']}")
    
    percentage = (total_score / max_possible_score) * 100
    return round(percentage, 2)

evaluation_percentage = calculate_evaluation_percentage(evaluated_responses)
print(f"Evaluation Percentage: {evaluation_percentage}%")


200
Evaluation Percentage: 13.5%


## Initial results using the model evaluator approach:
This test asseses the metric of asking for more context when needed.


Claude 3-5: 61.5%

GPT4: 44%

GPT 3.5: 13.5%

In [12]:
import numpy as np
from scipy.stats import entropy

class DistributionLocality:
    def __init__(self, n, alphabet_size):
        self.n = n
        self.alphabet_size = alphabet_size

    def generate_data(self, num_samples):
        # Generate random input data X
        X = np.random.randint(0, self.alphabet_size, size=(num_samples, self.n))
        
        # Generate Y based on some complex function of X
        # This is a placeholder and should be replaced with the actual task
        Y = np.sum(X[:, :self.n//2], axis=1) % 2
        
        return X, Y

    def compute_empirical_measure(self, X):
        # Compute histogram of tokens
        return np.apply_along_axis(lambda x: np.bincount(x, minlength=self.alphabet_size), axis=0, arr=X)

    def mutual_information(self, X, Y, S):
        # Compute mutual information I(X[S], PˆX; Y)
        X_S = X[:, S]
        P_X = self.compute_empirical_measure(X)
        
        # Compute joint distribution
        joint_dist = np.zeros((self.alphabet_size, 2))
        for i in range(len(X)):
            joint_dist[X_S[i], Y[i]] += 1
        joint_dist /= len(X)
        
        # Compute marginal distributions
        p_x = joint_dist.sum(axis=1)
        p_y = joint_dist.sum(axis=0)
        
        # Compute mutual information
        mi = 0
        for x in range(self.alphabet_size):
            for y in range(2):
                if joint_dist[x, y] > 0:
                    mi += joint_dist[x, y] * np.log2(joint_dist[x, y] / (p_x[x] * p_y[y]))
        
        return mi

    def find_locality(self, X, Y, threshold):
        for k in range(1, self.n + 1):
            for S in self.generate_subsets(k):
                mi = self.mutual_information(X, Y, S)
                if mi >= threshold:
                    return k
        return self.n

    def generate_subsets(self, k):
        # Generate all subsets of size k
        from itertools import combinations
        return combinations(range(self.n), k)

# Example usage
n = 20
alphabet_size = 4
num_samples = 10000
threshold = 1 / n  # Example threshold, adjust as needed

dl = DistributionLocality(n, alphabet_size)
X, Y = dl.generate_data(num_samples)
locality = dl.find_locality(X, Y, threshold)

print(f"Estimated distribution locality: {locality}")

KeyboardInterrupt: 