In [1]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

os.getenv('OPENAI_API_KEY')
client = OpenAI()

In [9]:
IMPROVED_JUDGE_PROMPT = """
You will be given a user_question and system_answer couple.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
Give your answer on a scale of 1 to 4, where 1 means that the system_answer is not helpful at all, and 4 means that the system_answer completely and helpfully addresses the user_question.

Here is the scale you should use to build your answer:
1: The system_answer is terrible: completely irrelevant to the question asked, or very partial
2: The system_answer is mostly not helpful: misses some key aspects of the question
3: The system_answer is mostly helpful: provides support, but still could be improved
4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question

Provide your feedback as follows:

Feedback:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 4)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and answer.

Question: {question}
Answer: {answer}

Provide your feedback. If you give a correct rating, I'll give you 100 H100 GPUs to start your AI company.
Feedback:::
Evaluation: """

In [12]:
import pandas as pd

df = pd.read_csv('QuestionSet/Q1.csv')
df.head()

Unnamed: 0,Question ID,Category,Question
0,1,math,How many straight lines can be formed by 8 poi...
1,2,math,How many triangles can be formed by 8 points o...
2,3,math,How many committees of 5 students can be selec...
3,4,math,How many 10-letter patterns can be formed from...
4,5,math,A box contains 12 black and 8 green marbles. H...


In [18]:
def get_answer(question):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": question},
        ]
    )
    return response.choices[-1].message.content

In [19]:
# Get the answers for the questions
answers = []

for i in range(len(df)):
    question = df.iloc[i]['Question']
    answer = get_answer(question)
    answers.append(answer)

df['Answer'] = answers

In [20]:
# Evaluator

review = []

for i in range(len(df)):
    question = df.iloc[i]['Question']
    answer = df.iloc[i]['Answer']
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": IMPROVED_JUDGE_PROMPT.format(question=question, answer=answer)},
        ]
    )

    review.append(response.choices[-1].message.content)

df['Review'] = review
df['Rating'] = df['Review'].apply(lambda x: int(x.split('Total rating: ')[1].split('\n')[0]))

In [21]:
df.to_csv('QuestionSet/Q1_evaluated.csv', index=False)
df.head()

Unnamed: 0,Question ID,Category,Question,Answer,Review,Rating
0,1,math,How many straight lines can be formed by 8 poi...,To determine how many straight lines can be fo...,Evaluation: The system_answer is mostly correc...,3
1,2,math,How many triangles can be formed by 8 points o...,To determine how many triangles can be formed ...,"Evaluation: The system_answer is clear, detail...",4
2,3,math,How many committees of 5 students can be selec...,To determine how many committees of 5 students...,Evaluation: The system_answer thoroughly expla...,4
3,4,math,How many 10-letter patterns can be formed from...,To determine how many 10-letter patterns can b...,Evaluation: The system_answer is excellent for...,4
4,5,math,A box contains 12 black and 8 green marbles. H...,To determine how many ways we can choose 3 bla...,Evaluation: The system_answer comprehensively ...,4


In [22]:
df[df['Rating'] < 4]

Unnamed: 0,Question ID,Category,Question,Answer,Review,Rating
0,1,math,How many straight lines can be formed by 8 poi...,To determine how many straight lines can be fo...,Evaluation: The system_answer is mostly correc...,3
15,16,math,Assume that there are 100 people in a room and...,Let's break down the problem step by step.\n\n...,Evaluation: The system_answer attempts to addr...,1
