In [7]:
import re
import json
import os

In [8]:
# Directory that is scanned for raw oracle '.txt' files.
RAW_ORACLES_DIR = "./raw_oracles/"

# os.listdir returns entries in arbitrary order; sorting keeps the listing (and
# therefore the processing order of later cells) reproducible across runs/machines.
LIST_OF_ORACLES = sorted(
    RAW_ORACLES_DIR + file_name
    for file_name in os.listdir(RAW_ORACLES_DIR)
    if file_name.endswith('.txt')
)
print(LIST_OF_ORACLES)

['./raw_oracles/Oracle_itembank-2.txt', './raw_oracles/Oracle_itembank-3.txt', './raw_oracles/Oracle_itembank-4.txt', './raw_oracles/Oracle_itembank-5.txt', './raw_oracles/Oracle_itembank-9.txt']


In [9]:
# An answer line is '#', an optional '!' marking the correct option, then the text.
# Whitespace around the marker is optional, so '#!x', '# ! x' and '#x' all parse
# (fixed-width slicing used to drop a character from '#x' and miss '#!x').
ANSWER_LINE_PATTERN = re.compile(r"^#\s*(!)?\s*(.*)$")

# Itembank identifier as it appears in the file name, e.g. '2', '10', '3a' or '4-1'.
# '\d+' keeps multi-digit banks apart ('10' must not collapse to '1').
ITEMBANK_NUMBER_PATTERN = re.compile(r"\d+[ab]?(?:-\d+)?")

TEXT_CHUNK_DELIMITER = '"""'
JSON_OUTPUT_DIR = "./JSON_oracles"


def _strip_closing_delimiter(text_chunk):
    """Remove one trailing '\"\"\"' delimiter from a text chunk, if present.

    Unlike ``str.rstrip('\"\"\"')`` (which strips *every* trailing double quote),
    this keeps a closing quotation mark that belongs to the text itself.
    """
    if text_chunk.endswith(TEXT_CHUNK_DELIMITER):
        return text_chunk[:-len(TEXT_CHUNK_DELIMITER)]
    return text_chunk


def _parse_question_block(question_block, correct_text_chunk):
    """Build one item dict from a single '=='-separated question block.

    Args:
        question_block: Stripped text of the block; its first line is used as the
            category, lines starting with '#' are answer options.
        correct_text_chunk: Text chunk the question belongs to (stored verbatim).

    Returns:
        dict with the keys 'correct_text_chunk', 'question', 'category',
        'answers' (list of 'a) ...' strings) and 'correct answer(s)'.
    """
    lines = question_block.split('\n')
    category = lines[0].strip()
    question = ""
    answer_lines_raw = []  # still with hashtags

    # The category line is scanned as well, so a block without any later
    # capitalised line ends up with question == category (unchanged behaviour).
    # When several lines qualify, the last one wins.
    for line in lines:
        if line[:1].isupper():  # starts with a capital letter (Unicode-aware)
            question = line.strip()
        elif line.startswith('#'):  # answer lines start with '#'
            answer_lines_raw.append(line.strip())

    answer_lines = []  # with letters
    correct_answer = ""
    for idx, answer_line in enumerate(answer_lines_raw):
        correct_marker, answer_text = ANSWER_LINE_PATTERN.match(answer_line).groups()

        # Labels are generated ('a)', 'b)', ...) instead of being read from a
        # four-element list, so a fifth option no longer raises IndexError.
        option_entry = f"{chr(ord('a') + idx)}) {answer_text.strip()}"
        answer_lines.append(option_entry)

        # NOTE(review): if several options are marked with '!', only the last one
        # is kept, although the key is named 'correct answer(s)' -- confirm
        # whether consumers expect a single string before changing this.
        if correct_marker:
            correct_answer = option_entry

    return {
        'correct_text_chunk': correct_text_chunk,
        'question': question,
        'category': category,
        'answers': answer_lines,
        'correct answer(s)': correct_answer
    }


def _parse_oracle_text(oracle_txt):
    """Parse the full text of one oracle file into a list of item dicts."""
    items = []

    # Split the content into blocks using '"""' at the start of a line
    for block in re.split(r'\n"""', oracle_txt):
        # Each block consists of a text chunk followed by '=='-separated questions
        parts = [part.strip() for part in re.split(r'==', block.strip())]

        # assuming the first part is always the text chunk and others are questions
        correct_text_chunk = _strip_closing_delimiter(parts[0])

        for question_block in parts[1:]:
            # A trailing or doubled '==' yields an empty segment; skip it rather
            # than emitting an item whose fields are all empty.
            if not question_block:
                continue
            items.append(_parse_question_block(question_block, correct_text_chunk))

    return items


# Make sure the target directory exists before any file is written.
os.makedirs(JSON_OUTPUT_DIR, exist_ok=True)

for doc in LIST_OF_ORACLES:
    with open(doc, 'r', encoding='utf-8') as file:
        oracle_txt = file.read().strip()

    # Search the file name only, so digits in directory names cannot be picked up.
    number_match = ITEMBANK_NUMBER_PATTERN.search(os.path.basename(doc))
    if number_match is None:
        raise ValueError(f"No itembank number found in file name: {doc}")
    itembank_number = number_match.group()

    data = _parse_oracle_text(oracle_txt)

    # Output the data to a JSON file
    with open(f"{JSON_OUTPUT_DIR}/oracle-itembank-{itembank_number}.json", 'w', encoding='utf-8') as f_out:
        json.dump(data, f_out, ensure_ascii=False, indent=2)