In [66]:
from datasets import load_dataset
import random
import string
from dataclasses import dataclass, asdict
from typing import List
import json
from pathlib import Path
import csv

In [67]:
# Extra answer option that randomize_choices() adds to every question,
# alongside the correct answer and the distractors.
REFUSE_CHOICE = "Insufficient information to answer the question"
# Letter labels (A-Z) assigned to the shuffled choices; randomize_choices()
# raises ValueError when a question has more options than letters here.
ALPHABET = string.ascii_uppercase

In [None]:
@dataclass
class MultipleChoiceQuestion:
    """One multiple-choice question with its shuffled, letter-labelled options.

    The ``@dataclass`` decorator is required: instances are built with keyword
    arguments and serialised with ``dataclasses.asdict`` elsewhere in this
    notebook, both of which rely on the generated ``__init__`` and field list.

    Attributes:
        question: The question text.
        choices: Shuffled options, each formatted as ``"(<letter>) <text>"``.
        correct_answer: Letter of the option holding the correct answer.
        unsure_option: Letter of the "insufficient information" option.
        sources: Source references for the question, as provided by the dataset.
        ideal: Text of the correct answer, without a letter label.
        distractors: Texts of the incorrect options, without letter labels.
    """

    question: str
    choices: list[str]
    correct_answer: str
    unsure_option: str
    sources: list[str]
    ideal: str
    distractors: list[str]


In [69]:
def randomize_choices(
    ideal: str,
    distractors: list[str],
    rng: random.Random | None = None,
) -> tuple[list[str], str, str]:
    """Shuffle the answer options and label them with letters.

    The option pool is the correct answer, the ``REFUSE_CHOICE`` option, and
    every distractor. The pool is shuffled and each option is prefixed with a
    letter taken from ``ALPHABET`` in order.

    Args:
        ideal: Text of the correct answer.
        distractors: Texts of the incorrect options.
        rng: Optional ``random.Random`` instance used for the shuffle. Pass a
            seeded instance to get a reproducible ordering. When omitted, the
            module-level ``random.shuffle`` is used (the previous behaviour).

    Returns:
        A tuple ``(shuffled_choices, answer, unsure)`` where
        ``shuffled_choices`` are strings of the form ``"(<letter>) <text>"``,
        ``answer`` is the letter of the correct answer, and ``unsure`` is the
        letter of the ``REFUSE_CHOICE`` option.

    Raises:
        ValueError: If there are more options than letters in ``ALPHABET``.
    """
    # Index 0 is always the correct answer and index 1 the refusal option;
    # the letter lookups below depend on that ordering.
    choices = [ideal, REFUSE_CHOICE, *distractors]

    n_choices = len(choices)
    if n_choices > len(ALPHABET):
        raise ValueError("Too many choices")

    # Shuffle a permutation of indices rather than the texts themselves so the
    # original positions (0 = ideal, 1 = refusal) can be located afterwards.
    perm = list(range(n_choices))
    shuffle = random.shuffle if rng is None else rng.shuffle
    shuffle(perm)

    # zip stops at the shorter input, so only the first n_choices letters are used.
    shuffled_choices = [
        f"({letter}) {choices[source_index]}"
        for letter, source_index in zip(ALPHABET, perm, strict=False)
    ]

    # perm.index(k) is the shuffled position that received original index k.
    answer = ALPHABET[perm.index(0)]
    unsure = ALPHABET[perm.index(1)]

    return shuffled_choices, answer, unsure

In [70]:
def save_questions(questions: List[MultipleChoiceQuestion], output_dir: Path):
    """Save questions as JSON, JSONL, and CSV files inside ``output_dir``.

    Writes three files: ``questions.json`` (a single indented array),
    ``questions.jsonl`` (one JSON object per line), and ``questions.csv``
    (list-valued fields joined with ``|``).

    Args:
        questions: The questions to serialise.
        output_dir: Destination directory (a ``Path`` or path string). It is
            created, including missing parent directories, if it does not
            already exist.
    """
    output_dir = Path(output_dir)
    # parents=True so a nested destination such as "out/run1" works too.
    output_dir.mkdir(parents=True, exist_ok=True)

    # Convert each question once and reuse the dicts for all three formats.
    records = [asdict(q) for q in questions]

    # An explicit UTF-8 encoding keeps the output independent of the platform's
    # default locale encoding, which may not be able to represent every
    # character in the question text.
    with open(output_dir / "questions.json", "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    # JSONL: one question per line.
    with open(output_dir / "questions.jsonl", "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

    # CSV: list-valued fields are flattened with a pipe separator.
    headers = ["question", "choices", "correct_answer", "unsure_option", "sources", "ideal", "distractors"]
    list_fields = {"choices", "sources", "distractors"}
    with open(output_dir / "questions.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        for record in records:
            writer.writerow([
                "|".join(record[name]) if name in list_fields else record[name]
                for name in headers
            ])

In [71]:
def format_training_questions(
    save_dir: Path | str = "formatted_questions",
    seed: int | None = None,
):
    """Load the LitQA2 training split, format every question, and save the result.

    Each dataset row is turned into a ``MultipleChoiceQuestion`` whose options
    are shuffled by ``randomize_choices``. The formatted questions are written
    to ``save_dir`` via ``save_questions`` and the first one is printed as a
    preview.

    Args:
        save_dir: Directory the output files are written to.
        seed: Optional seed for the module-level ``random`` generator. When
            given, ``random.seed(seed)`` is called before formatting so the
            option order is reproducible; note that this reseeds the global
            generator. When omitted, the generator state is left untouched.

    Returns:
        The list of formatted ``MultipleChoiceQuestion`` objects.
    """
    if seed is not None:
        random.seed(seed)

    # Load training dataset
    print("Loading training dataset...")
    train_data = load_dataset("futurehouse/lab-bench", "LitQA2")["train"]

    # Format all questions
    formatted_questions = []
    for q in train_data:
        choices, answer, unsure = randomize_choices(
            ideal=q['ideal'],
            distractors=q['distractors']
        )

        formatted_questions.append(MultipleChoiceQuestion(
            question=q['question'],
            choices=choices,
            correct_answer=answer,
            unsure_option=unsure,
            sources=q['sources'],
            ideal=q['ideal'],
            distractors=q['distractors']
        ))

    print(f"\nFormatted {len(formatted_questions)} questions")

    # Save questions
    output_dir = Path(save_dir)
    save_questions(formatted_questions, output_dir)
    print(f"\nSaved questions to {output_dir}")

    # Print example; skipped when the split is empty so that indexing the
    # first element cannot raise IndexError after the files have been saved.
    if formatted_questions:
        print("\nExample formatted question:")
        example = formatted_questions[0]
        print(f"Question: {example.question}")
        print("\nChoices:")
        for choice in example.choices:
            print(choice)
        print(f"\nCorrect Answer: {example.correct_answer}")
        print(f"'Unsure' Option: {example.unsure_option}")
        print(f"Sources: {example.sources}")

    return formatted_questions

In [72]:
# Format and save the LitQA2 training questions into the default
# "formatted_questions" directory; keep the returned list for inspection.
questions = format_training_questions()

Loading training dataset...

Formatted 199 questions

Saved questions to formatted_questions

Example formatted question:
Question: Acinetobacter lwoffii has been evolved in the lab to be resistant to which of these antibiotics?

Choices:
(A) gentamicin
(B) ampicillin
(C) meropenem
(D) Insufficient information to answer the question
(E) ciproflaxin

Correct Answer: E
'Unsure' Option: D
Sources: ['https://doi.org/10.1128/msphere.00109-24']


In [73]:
# Display one formatted question (index 2) to spot-check its fields.
questions[2]


MultipleChoiceQuestion(question='Among Cas9-disrupted loci in human neural stem cells, what fraction of disruption phenotypes were apparent after 4 cell divisions?', choices=['(A) 31% of enhancer loci', '(B) 31% of gene loci', '(C) 61% conserved region loci', '(D) Insufficient information to answer the question', '(E) 61% of enhancer loci', '(F) 61% of gene loci', '(G) 31% conserved region loci'], correct_answer='F', unsure_option='D', sources=['https://doi.org/10.1016/j.celrep.2024.113693'], ideal='61% of gene loci', distractors=['61% of enhancer loci', '31% of gene loci', '31% of enhancer loci', '61% conserved region loci', '31% conserved region loci'])