In [58]:
import json

# Placeholder path: replace 'your_file.json' with the path to your own file.
file_path = 'your_file.json'

# Paths of the HTML-stripped QuALITY v1.0.1 splits, relative to the working
# directory; the three names differ only in their final suffix.
dev_file, test_file, train_file = (
    f'QuALITY.v1.0.1/QuALITY.v1.0.1.htmlstripped.{split_name}'
    for split_name in ('dev', 'test', 'train')
)

def read_data(file):
    """Load a JSON Lines file into a list of parsed records.

    Args:
    - file (str): Path to a UTF-8 text file holding one JSON document per line.

    Returns:
    - list: The parsed value of every non-blank line, in file order.

    Raises:
    - OSError: If the file cannot be opened.
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The handle gets its own name so the `file` path argument is not shadowed.
    # Whitespace-only lines (e.g. an extra empty line at the end of the file)
    # are skipped because json.loads would raise on them.
    with open(file, 'r', encoding='utf8') as handle:
        return [json.loads(line) for line in handle if line.strip()]

# Parse the dev and train splits; the test split path is defined above but not loaded here.
dev_data = read_data(file=dev_file)
train_data = read_data(file=train_file)


In [67]:
# Question totals: first 11 dev articles vs. the whole train split.
(
    sum(len(article['questions']) for article in dev_data[:11]),
    sum(len(article['questions']) for article in train_data),
)

(99, 2523)

In [48]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load variables from a .env file into the environment before the key is read,
# so the API key never has to appear in the notebook itself.
load_dotenv()
client = OpenAI(api_key=os.environ.get("API_KEY"))

def call_openai(prompt, max_tokens=10):
    """Send a single-turn user prompt to the chat completions API and return the reply text.

    Args:
    - prompt (str): Text sent as the only user message.
    - max_tokens (int): Cap on generated tokens, forwarded to the API (default 10).

    Returns:
    - str: Content of the first returned choice, or an empty string when the
      response carries no text content.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )

    # `message.content` is optional in the API response; normalise a missing
    # value to "" so callers can always treat the result as a string.
    return response.choices[0].message.content or ""

In [49]:
def parse_response(response: str) -> int:
    """Extract the first decimal digit from a model reply.

    Args:
    - response (str): Raw reply text; an empty or missing (None) reply is tolerated.

    Returns:
    - int: Value of the first decimal digit found, or 0 when there is none.
    """
    if not response:
        return 0

    for c in response:
        # str.isdigit() also accepts characters such as '²' or '①' that int()
        # rejects with ValueError; str.isdecimal() only matches what int() can parse.
        if c.isdecimal():
            return int(c)

    return 0

In [50]:
def format_prompt(article_text, question, question_options):
    """
    Builds the model prompt: the article, then the question, then a numbered option list.

    Args:
    - article_text (str): Full text of the article.
    - question (str): The question being asked about the article.
    - question_options (list): Candidate answers; numbered from 1 in the given order.

    Returns:
    - str: The "Article:", "Question:" and "Options:" sections concatenated, with
      one "<number>. <option>" line per option.
    """
    # One newline-terminated line per option, numbered starting at 1.
    numbered_options = [
        f"{number}. {choice}\n"
        for number, choice in enumerate(question_options, start=1)
    ]

    sections = [
        f"Article:\n{article_text}\n\n",
        f"Question:\n{question}\n\n",
        "Options:\n",
        *numbered_options,
    ]
    return "".join(sections)

In [69]:
# Peek at the fields available on a single question record.
# NOTE(review): this cell previously read from `d`, which is not defined anywhere
# in the notebook; `dev_data` is assumed to be the intended dataset — confirm.
dev_data[0]['questions'][0].keys()

dict_keys(['question', 'question_unique_id', 'options', 'writer_label', 'gold_label', 'validation', 'speed_validation', 'difficult'])

In [73]:
# Ask the model every question of the first 11 dev articles and store its answer
# next to the gold label.
# NOTE(review): this loop previously iterated over `d`, which is not defined
# anywhere in the notebook; `dev_data` is assumed to be the intended dataset
# (the question count computed earlier is also taken over dev_data[:11]) — confirm.
results = []
for row in dev_data[:11]:
    # Extracting the article text
    article_text = row["article"]

    # Extracting the questions and their options
    for question_data in row["questions"]:
        question = question_data["question"]
        question_options = question_data["options"]
        expected_option_idx = question_data["gold_label"]

        # Formatting the prompt and querying the model (one API call per question)
        formatted_prompt = format_prompt(article_text, question, question_options)
        response = call_openai(formatted_prompt)

        results.append(
            {
                "article_id": row["article_id"],
                "question_id": question_data["question_unique_id"],
                # Raw reply is kept alongside the parsed digit so parsing can be audited later.
                'response_text': response,
                "response": parse_response(response),
                "correct_answer": expected_option_idx,
                "difficulty": question_data["difficult"],
            }
        )

In [None]:
# Persist the collected answers as one JSON document.
with open('results_first_11.json', 'w') as out_handle:
    out_handle.write(json.dumps(results))

[{'article_id': '52995',
  'question_id': '52995_I3M5VUMM_1',
  'response': 2,
  'correct_answer': 3,
  'difficulty': 0},
 {'article_id': '52995',
  'question_id': '52995_I3M5VUMM_2',
  'response': 4,
  'correct_answer': 4,
  'difficulty': 0},
 {'article_id': '52995',
  'question_id': '52995_I3M5VUMM_3',
  'response': 3,
  'correct_answer': 3,
  'difficulty': 1},
 {'article_id': '52995',
  'question_id': '52995_I3M5VUMM_4',
  'response': 1,
  'correct_answer': 1,
  'difficulty': 1},
 {'article_id': '52995',
  'question_id': '52995_I3M5VUMM_5',
  'response': 1,
  'correct_answer': 1,
  'difficulty': 1},
 {'article_id': '52995',
  'question_id': '52995_I3M5VUMM_6',
  'response': 4,
  'correct_answer': 4,
  'difficulty': 1},
 {'article_id': '52995',
  'question_id': '52995_I3M5VUMM_7',
  'response': 2,
  'correct_answer': 2,
  'difficulty': 1},
 {'article_id': '52995',
  'question_id': '52995_I3M5VUMM_8',
  'response': 1,
  'correct_answer': 1,
  'difficulty': 0}]