In [1]:
from openai import OpenAI

In [2]:
# Create the API client used by the cells below. No arguments are passed, so
# credentials are presumably picked up from the environment — confirm.
client = OpenAI()

In [3]:
# Upload the question-generation batch input file.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns instead of staying open for the lifetime of the kernel.
with open("../../eval_data/batch_tasks_question_generation.jsonl", "rb") as batch_input:
    batch_file = client.files.create(
        file=batch_input,
        purpose="batch"
    )

In [16]:
# Show the metadata returned for the uploaded file.
print(batch_file)

FileObject(id='file-DqEhjEnFkoJruLAhTphdp2', bytes=92980819, created_at=1750638917, filename='batch_tasks_question_generation.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


In [5]:
# Submit a batch job that runs every request in the uploaded file against the
# chat completions endpoint, with a 24h completion window.
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

In [17]:
# Re-fetch the batch job by id to see its current state, then print it.
# Re-run this cell until the printed status is the one you are waiting for.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

Batch(id='batch_6858a16e8e3c8190bdbfd3d1abb48d63', completion_window='24h', created_at=1750638958, endpoint='/v1/chat/completions', input_file_id='file-DqEhjEnFkoJruLAhTphdp2', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1750639474, error_file_id=None, errors=None, expired_at=None, expires_at=1750725358, failed_at=None, finalizing_at=1750639419, in_progress_at=1750638962, metadata=None, output_file_id='file-Ftjjzx6PGAWWzuXeC2un13', request_counts=BatchRequestCounts(completed=115, failed=0, total=115))


In [18]:
# Download the raw bytes of the batch output file.
# `output_file_id` is None while the batch has not produced an output file
# yet, so fail with a clear message instead of passing None to the Files API.
result_file_id = batch_job.output_file_id
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "re-run the retrieve cell once the batch has completed."
    )
result = client.files.content(result_file_id).content

In [19]:
# Save the downloaded batch output to disk unchanged (binary write), so later
# cells can re-load it without calling the API again.
result_file_name = "../../eval_data/batch_job_results_image_eval.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [20]:
import json

In [21]:
# Load the saved batch results: one JSON object per non-empty line.
results = []
# NOTE(review): the file is assumed to be UTF-8 encoded JSONL — confirm.
with open(result_file_name, 'r', encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # Skip blank lines (e.g. a stray newline at the end of the file);
        # json.loads("") would raise JSONDecodeError and abort the load.
        if not line:
            continue
        # Parse the JSON string into a dict and append it to the results.
        results.append(json.loads(line))

In [26]:
# Load the source records for the question-generation tasks. A later cell
# indexes this object with the numeric suffix of each result's custom_id.
sources_file = "../../eval_data/batch_tasks_question_generation_sources.json"

with open(sources_file, "r") as f:
    sources = json.load(f)

In [28]:
# Inspect the first result to see the response structure.
results[0]

{'id': 'batch_req_6858a33c7bd881909580f98d51ad2505',
 'custom_id': 'task-0',
 'response': {'status_code': 200,
  'request_id': '06a89d4882fad41bd8bf039b97946cf0',
  'body': {'id': 'chatcmpl-BlPXN6mlC7DjeXRTyhREoC2QOchU7',
   'object': 'chat.completion',
   'created': 1750639281,
   'model': 'gpt-4o-mini-2024-07-18',
   'choices': [{'index': 0,
     'message': {'role': 'assistant',
      'content': '{"questions":[{"input":"What is the course code for the Python for Data Science class?","expected_output":"ECE 20875"},{"input":"Who are the authors of the Python for Data Science material?","expected_output":"Qiang Qiu, Murat Kocaoglu, and Anuran Makur"},{"input":"What topic is covered in the ECE 20875 course?","expected_output":"Probability and Random Variables"}]}',
      'refusal': None,
      'annotations': []},
     'logprobs': None,
     'finish_reason': 'stop'}],
   'usage': {'prompt_tokens': 37075,
    'completion_tokens': 89,
    'total_tokens': 37164,
    'prompt_tokens_details': 

In [37]:
# Build the evaluation dataset: every generated QA pair is merged with the
# source record selected by the numeric part of the request's custom_id
# (e.g. 'task-0' -> sources[0]).
eval_dataset = []

for result in results:
    custom_id: str = result["custom_id"]
    # Everything after the first '-' is the index into `sources`.
    idx = int(custom_id.partition("-")[2])

    # The assistant message content is itself a JSON document holding a
    # "questions" list.
    message_content = result["response"]["body"]["choices"][0]["message"]["content"]
    questions = json.loads(message_content)["questions"]

    # The source record is unpacked last, so its keys win on a name collision.
    merged_rows = [{**qa_data, **sources[idx]} for qa_data in questions]
    eval_dataset.extend(merged_rows)

In [39]:
# Write the merged QA/source records to disk as indented JSON.
with open("../../eval_data/image_eval_data.json", "w") as f:
    json.dump(eval_dataset, f, indent=4)

In [22]:
import json
# Re-load the saved evaluation dataset; the critique cells below iterate over
# `outputs`.
with open("../../eval_data/image_eval_data.json", "r") as f:
    outputs = json.load(f)

In [None]:
# System prompt for the "groundedness" critique: asks for a 1-5 rating of how
# unambiguously the question can be answered from the supplied context.
# NOTE(review): the prompt describes an 'Answer:::' text layout, while the
# requests in this notebook also set a JSON-schema response_format — confirm
# the two instructions are meant to coexist.
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context."""

# System prompt for the "relevance" critique: asks for a 1-5 rating of how
# useful the question is to students of the named course. No context is
# supplied with this prompt, only the question.
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to students taking the course "ECE20875: Python for Data Science".
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question."""

# System prompt for the "standalone" critique: asks for a 1-5 rating of how
# well the question can be understood without any accompanying document.
# Course-specific context is explicitly allowed; references to "the document"
# or to unnamed figure elements are to be rated 1.
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1. It is assumed that questions are about the course "ECE20875: Python for Data Science", so they should not be marked down for context specific to the course.
The questions can contain obscure technical nouns or acronyms like PDF, CDF, or z-test and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What do the bars labeled A1, A2, and A3 represent in a probability distribution?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question."""

In [4]:
from tqdm import tqdm
from pydantic import BaseModel, Field

In [5]:
# Shape of one critique returned by the judge model. The JSON schema generated
# from this model is what the notebook sends as the response_format schema.
class Critique(BaseModel):
    # Rationale for the rating, as text.
    evaluation: str
    # Integer score, constrained to the inclusive range 1..5.
    total_rating: int = Field(ge=1, le=5)

In [6]:
from PIL import Image
from io import BytesIO
import base64

def encode_image(image: Image.Image) -> str:
    """Encode an image as JPEG and return the bytes as base64 text.

    Args:
        image: The PIL image to encode. Images whose mode the JPEG writer
            cannot store (for example RGBA, LA or P) are converted to RGB
            first; any transparency is discarded by that conversion.

    Returns:
        The base64 encoding of the JPEG bytes, decoded to a ``str``. No
        ``data:`` URL prefix is included.
    """
    # Modes Pillow's JPEG writer accepts as-is. Anything else (RGBA, LA, P,
    # I, F, ...) makes `save(..., format="JPEG")` raise, so convert it to RGB.
    jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}
    if image.mode not in jpeg_modes:
        image = image.convert("RGB")

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [18]:
def call_llm(query: str, question: str, context: Image.Image | None = None):
    """Ask the judge model for one critique of ``question``.

    Args:
        query: Critique instructions, sent as the system message.
        question: The question to rate; sent as ``"Question: <question>"`` in
            the user message.
        context: Optional page image. When given, it is attached to the user
            message as a base64 JPEG data URL.

    Returns:
        A dict with the keys ``"evaluation"`` (str) and ``"total_rating"``
        (int, 1-5), validated against ``Critique``.

    Raises:
        ValueError: If the model returned no content (for example a refusal),
            or the content is not valid JSON matching ``Critique``
            (pydantic's ``ValidationError`` is a ``ValueError`` subclass).
    """
    user_content = [
        {
            "type": "text",
            "text": f"Question: {question}"
        }
    ]
    # Compare against None explicitly rather than relying on the truthiness
    # of an image object.
    if context is not None:
        user_content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{encode_image(context)}"
                },
            }
        )
    messages = [
        {
            "role": "system",
            "content": query
        },
        {
            "role": "user",
            "content": user_content
        }
    ]
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.1,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "critique",
                "schema": Critique.model_json_schema(),
            }
        },
        messages=messages,
    )

    message = response.choices[0].message
    if message.content is None:
        # Without this check json parsing fails with an unhelpful TypeError.
        raise ValueError(
            f"Model returned no content (refusal: {getattr(message, 'refusal', None)!r})"
        )

    # Validate instead of a bare json.loads so a missing field or an
    # out-of-range rating is caught here rather than skewing later filtering.
    return Critique.model_validate_json(message.content).model_dump()

In [8]:
import os
from pdf2image import convert_from_path

pdf_dir = "../../documents"

# Directory entries whose name ends in ".pdf" (case-insensitive).
pdf_filenames = (name for name in os.listdir(pdf_dir) if name.lower().endswith(".pdf"))

# Map each PDF file name to the result of convert_from_path for that file;
# later cells index that result by page_index.
pdf_images = {
    name: convert_from_path(os.path.join(pdf_dir, name))
    for name in pdf_filenames
}

In [11]:
from openai import OpenAI
# Re-create the API client (same call as at the top of the notebook), so this
# part can be run on its own after a kernel restart.
client = OpenAI()

In [26]:
# Score QA pairs on the three critique criteria with synchronous API calls and
# store the scores/rationales on each entry of `outputs` in place.
# NOTE(review): the slice skips the first 10 entries — presumably they were
# critiqued in an earlier run; confirm before re-running from scratch.
print("Generating critique for each QA couple...")
checkpoint_path = "../../eval_data/image_eval_data_checkpoint.json"

for i, output in enumerate(tqdm(outputs[10:])):
    try:
        # The API calls live inside the try block so that one failed request
        # skips this QA pair instead of aborting the whole run.
        evaluations = {
            "groundedness": call_llm(
                question_groundedness_critique_prompt,
                question=output["input"],
                # The first 10 characters of source_file are dropped to get
                # the key used in pdf_images.
                context=pdf_images[output["source_file"][10:]][output["page_index"]]
            ),
            "relevance": call_llm(
                question_relevance_critique_prompt,
                question=output["input"]
            ),
            "standalone": call_llm(
                question_standalone_critique_prompt,
                question=output["input"]
            ),
        }
        # Collect all fields first so an entry is never left half-updated.
        scores = {}
        for criterion, evaluation in evaluations.items():
            scores[f"{criterion}_score"] = evaluation["total_rating"]
            scores[f"{criterion}_eval"] = evaluation["evaluation"]
        output.update(scores)

        # checkpoint every 10 outputs
        if i % 10 == 0:
            with open(checkpoint_path, "w") as f:
                json.dump(outputs, f, indent=4)
    except Exception as e:
        # Still best-effort, but report what was skipped and why instead of
        # swallowing the error silently. tqdm.write keeps the bar intact.
        tqdm.write(f"Skipping QA pair at outputs[{i + 10}]: {e!r}")
        continue

# Final checkpoint so entries processed after the last multiple of 10 are
# saved as well.
with open(checkpoint_path, "w") as f:
    json.dump(outputs, f, indent=4)

Generating critique for each QA couple...


  5%|▍         | 21/444 [03:12<1:04:36,  9.17s/it]


KeyboardInterrupt: 

In [42]:
from openai import OpenAI
import json
from io import BytesIO
import base64

client = OpenAI()

# Build one Batch API request per (QA pair, criterion) for every entry in
# `outputs`. custom_id is "<position in outputs>-<criterion>" so results can
# be matched back to their QA pair later.

# The response schema is identical for every request, so compute it once.
critique_schema = Critique.model_json_schema()

# (source_file, page_index) -> data URL. Several questions can point at the
# same page; encoding each page once avoids repeated JPEG/base64 work and lets
# those requests share one string.
page_data_urls = {}

batch_inputs = []
for idx, output in enumerate(outputs):
    # Prepare messages for each criterion
    for criterion, prompt in [
        ("groundedness", question_groundedness_critique_prompt),
        ("relevance", question_relevance_critique_prompt),
        ("standalone", question_standalone_critique_prompt),
    ]:
        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": [
                {"type": "text", "text": f"Question: {output['input']}"}
            ]}
        ]
        # Only add image context for groundedness
        if criterion == "groundedness":
            # The first 10 characters of source_file are dropped to get the
            # key used in pdf_images.
            source_file = output["source_file"][10:]
            page_index = output["page_index"]
            page_key = (source_file, page_index)
            if page_key not in page_data_urls:
                # Reuse the notebook's encode_image helper rather than
                # repeating its JPEG/base64 logic inline.
                page_data_urls[page_key] = (
                    f"data:image/jpeg;base64,{encode_image(pdf_images[source_file][page_index])}"
                )
            messages[1]["content"].append({
                "type": "image_url",
                "image_url": {
                    "url": page_data_urls[page_key]
                }
            })
        batch_inputs.append({
            "custom_id": f"{idx}-{criterion}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "messages": messages,
                "response_format": {
                    "type": "json_schema",
                    "json_schema": {
                        "name": "critique",
                        "schema": critique_schema,
                    }
                },
                "model": "gpt-4o-mini",
                "temperature": 0.1,
            }
        })

In [50]:
# Split batch_inputs into two batches.
# Each QA pair contributes 3 consecutive requests (one per criterion). The
# split point is rounded down to a multiple of 3 so that all requests for one
# QA pair land in the same file.
REQUESTS_PER_QA = 3
midpoint = (len(batch_inputs) // 2) // REQUESTS_PER_QA * REQUESTS_PER_QA
batch_inputs_1 = batch_inputs[:midpoint]
batch_inputs_2 = batch_inputs[midpoint:]

# Save each batch to a separate file
batch_input_file_1 = "../../eval_data/batch_eval_critique_part1.jsonl"
batch_input_file_2 = "../../eval_data/batch_eval_critique_part2.jsonl"

for batch_path, batch_items in (
    (batch_input_file_1, batch_inputs_1),
    (batch_input_file_2, batch_inputs_2),
):
    # JSONL: one request object per line.
    with open(batch_path, "w") as batch_out:
        for item in batch_items:
            batch_out.write(json.dumps(item) + "\n")

In [60]:
# Upload the second critique batch input file.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns instead of staying open for the lifetime of the kernel.
with open(batch_input_file_2, "rb") as batch_input:
    batch_file = client.files.create(
        file=batch_input,
        purpose="batch"
    )

In [62]:
# Show the metadata returned for the uploaded file.
batch_file

FileObject(id='file-74iDvE1Ut4yJxZBzJmNFZh', bytes=183697581, created_at=1751693368, filename='batch_eval_critique_part2.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)

In [82]:
# Submit a batch job that runs every request in the uploaded file against the
# chat completions endpoint, with a 24h completion window.
batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

In [84]:
# Show the batch job as returned by the create call.
batch_job

Batch(id='batch_6868ba6599188190b2d89bba43402ba8', completion_window='24h', created_at=1751693925, endpoint='/v1/chat/completions', input_file_id='file-74iDvE1Ut4yJxZBzJmNFZh', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1751780325, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [110]:
# Re-fetch the batch job by id to see its current state, then print it.
# Re-run this cell until the printed status is the one you are waiting for.
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)

Batch(id='batch_6868ba6599188190b2d89bba43402ba8', completion_window='24h', created_at=1751693925, endpoint='/v1/chat/completions', input_file_id='file-74iDvE1Ut4yJxZBzJmNFZh', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1751694701, error_file_id=None, errors=None, expired_at=None, expires_at=1751780325, failed_at=None, finalizing_at=1751694610, in_progress_at=1751693930, metadata=None, output_file_id='file-LChfFUJEhATtv2ejL4V5F3', request_counts=BatchRequestCounts(completed=681, failed=0, total=681))


In [114]:
# Download the raw bytes of the batch output file.
# `output_file_id` is None while the batch has not produced an output file
# yet, so fail with a clear message instead of passing None to the Files API.
result_file_id = batch_job.output_file_id
if result_file_id is None:
    raise RuntimeError(
        f"Batch {batch_job.id} has no output file (status: {batch_job.status}); "
        "re-run the retrieve cell once the batch has completed."
    )
result = client.files.content(result_file_id).content

In [115]:
# Save the downloaded batch output to disk unchanged (binary write), so later
# cells can re-load it without calling the API again.
result_file_name = "../../eval_data/batch_job_results_critique_part2.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [116]:
# Load the saved batch results: one JSON object per non-empty line.
results = []
# NOTE(review): the file is assumed to be UTF-8 encoded JSONL — confirm.
with open(result_file_name, 'r', encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        # Skip blank lines (e.g. a stray newline at the end of the file);
        # json.loads("") would raise JSONDecodeError and abort the load.
        if not line:
            continue
        # Parse the JSON string into a dict and append it to the results.
        results.append(json.loads(line))

In [117]:
# Inspect the first result to see the response structure.
results[0]

{'id': 'batch_req_6868bd12ce9881909ee53d2e74907703',
 'custom_id': '227-groundedness',
 'response': {'status_code': 200,
  'request_id': 'c911ff5c8ea445e1fbba9d33ad448194',
  'body': {'id': 'chatcmpl-Bppu0eUGeFWI2CoH4jaCgHKDrONwy',
   'object': 'chat.completion',
   'created': 1751693940,
   'model': 'gpt-4o-mini-2024-07-18',
   'choices': [{'index': 0,
     'message': {'role': 'assistant',
      'content': '{"evaluation":"The context provides clear information about Prof. Mahsa Ghasemi\'s research focus, specifically mentioning that she studies the efficient and reliable use of data in sequential decision-making problems. This directly answers the question about what she studies in relation to data.","total_rating":5}',
      'refusal': None,
      'annotations': []},
     'logprobs': None,
     'finish_reason': 'stop'}],
   'usage': {'prompt_tokens': 37060,
    'completion_tokens': 57,
    'total_tokens': 37117,
    'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0},
  

In [118]:
# Flatten the batch results into one record per (QA index, metric).
evaluations = []

for result in results:
    # custom_id has the form "<idx>-<metric>"; split on the first '-'.
    custom_id: str = result["custom_id"]
    dash = custom_id.index("-")
    idx = int(custom_id[:dash])
    metric = custom_id[dash + 1:]

    # The assistant message content is a JSON document; its keys are added
    # next to idx/metric in the record.
    critique = json.loads(result["response"]["body"]["choices"][0]["message"]["content"])
    evaluations.append({"idx": idx, "metric": metric, **critique})

In [119]:
# Inspect the first flattened evaluation record.
evaluations[0]

{'idx': 227,
 'metric': 'groundedness',
 'evaluation': "The context provides clear information about Prof. Mahsa Ghasemi's research focus, specifically mentioning that she studies the efficient and reliable use of data in sequential decision-making problems. This directly answers the question about what she studies in relation to data.",
 'total_rating': 5}

In [120]:
# Combine all evaluations of the same idx into a single dictionary
combined_evaluations = {}

for eval_item in evaluations:
    idx = eval_item["idx"]
    metric = eval_item["metric"]
    # Create the dictionary for this idx on first sight.
    per_idx = combined_evaluations.setdefault(idx, {})
    # Copy all keys except idx and metric, prefixed with the metric name.
    for k, v in eval_item.items():
        if k not in ("idx", "metric"):
            per_idx[f"{metric}_{k}"] = v

# `combined_evaluations_list` accumulates across runs of this cell (one run
# per batch result file). Start from an empty list when it does not exist yet,
# so the cell also works on a fresh kernel.
combined_evaluations_list = globals().get("combined_evaluations_list", [])

# Merge by idx instead of blindly extending: re-running the cell on the same
# results no longer creates duplicates, and metrics for one idx that arrive in
# different result files end up in a single entry.
merged_by_idx = {entry["idx"]: entry for entry in combined_evaluations_list}
for idx, metrics in combined_evaluations.items():
    merged_by_idx.setdefault(idx, {"idx": idx}).update(metrics)

# Convert back to a list of dicts, each carrying its idx.
combined_evaluations_list = list(merged_by_idx.values())

In [81]:
# Inspect one combined entry (all metrics for a single QA pair).
combined_evaluations_list[3]

{'idx': 3,
 'groundedness_evaluation': 'The context clearly states that probability is a number between 0 and 1, with 0 indicating that an event never occurs and 1 indicating that an event always occurs. This directly answers the question about the range of values for probability.',
 'groundedness_total_rating': 5,
 'relevance_evaluation': 'This question is fundamental to understanding probability, which is a key concept in data science and statistics. Knowing the range of values for probability (0 to 1) is essential for students as they work with data analysis and modeling in Python. It lays the groundwork for more complex topics such as probability distributions, statistical inference, and machine learning algorithms. Therefore, this question is highly relevant and useful for students in the course.',
 'relevance_total_rating': 5,
 'standalone_evaluation': 'The question asks about the range of values for probability, which is a fundamental concept in statistics and mathematics. It do

In [121]:
# Write the combined per-QA evaluations to disk as indented JSON.
with open("../../eval_data/combined_evaluations_list.json", "w") as f:
    json.dump(combined_evaluations_list, f, indent=4)

In [126]:
# Keep only the QA pairs rated 4 or higher on every metric. A missing rating
# counts as 0, so entries lacking any metric are dropped.
rating_keys = [
    f"{metric}_total_rating"
    for metric in ["groundedness", "relevance", "standalone"]
]

high_quality_qas = []
for qa in combined_evaluations_list:
    if all(qa.get(key, 0) >= 4 for key in rating_keys):
        high_quality_qas.append(qa)

In [127]:
# Display the QA pairs that passed the rating filter.
high_quality_qas

[{'idx': 2,
  'groundedness_evaluation': "The context clearly states that the ECE 20875 course covers the topic of 'Probability and Random Variables.' This information directly answers the question without ambiguity.",
  'groundedness_total_rating': 5,
  'relevance_evaluation': 'This question is fundamental for students as it directly pertains to understanding the content and focus of the ECE 20875 course. Knowing the topics covered is essential for students to assess their interest and relevance to their academic and career goals. Therefore, it is highly useful for prospective and current students.',
  'relevance_total_rating': 5,
  'standalone_evaluation': 'The question is straightforward and directly asks about the content of the ECE 20875 course. It does not require any additional context or information to be understood, as it is clear that the inquiry is about the subject matter of a specific course. Therefore, it is context-independent.',
  'standalone_total_rating': 5},
 {'idx':

In [128]:
# Number of QA pairs that passed the rating filter.
len(high_quality_qas)

251

In [129]:
# Re-load the full evaluation dataset from disk.
with open("../../eval_data/image_eval_data.json", "r") as f:
    all_data = json.load(f)

# Positions (in the full dataset) of the QA pairs that passed the filter.
hq_indices = {qa["idx"] for qa in high_quality_qas}

# Walk the dataset in order and keep the entries at those positions.
filtered_data = []
for i, entry in enumerate(all_data):
    if i in hq_indices:
        filtered_data.append(entry)

# Persist the high-quality subset as indented JSON.
with open("../../eval_data/high_quality_image_eval_data.json", "w") as f:
    json.dump(filtered_data, f, indent=4)