In [None]:
import csv
import json
import os
import random
import string
import traceback
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List

from datasets import load_dataset


In [38]:
from huggingface_hub import login

In [39]:
# Text of the extra "unsure" option that is added to every question's choices.
REFUSE_CHOICE = "Insufficient information to answer the question"
# Letters used to label the choices: "A" through "Z" (caps a question at 26 options).
ALPHABET = string.ascii_uppercase

In [40]:
@dataclass
class MultipleChoiceQuestion:
    """One multiple-choice question together with its lettered answer options.

    Attributes:
        question: The question text.
        choices: Answer options, each prefixed with its letter, e.g. "(A) 41%".
        correct_answer: Letter of the option that holds the ideal answer.
        unsure_option: Letter of the "insufficient information" option.
        sources: Source references for the question.
        ideal: The correct answer text, without a letter prefix.
        distractors: The incorrect answer texts, without letter prefixes.
    """

    question: str
    choices: list[str]
    correct_answer: str
    unsure_option: str
    sources: list[str]
    ideal: str
    distractors: list[str]


In [41]:
def randomize_choices(ideal: str, distractors: list[str]) -> tuple[list[str], str, str]:
    # Combines correct answer, "Insufficient information" option, and distractors
    choices = [ideal, REFUSE_CHOICE, *distractors]
    
    # Creates letter choices (A, B, C, etc)
    n_choices = len(choices)
    if n_choices > len(ALPHABET):
        raise ValueError("Too many choices")

    # Randomizes the order
    perm = list(range(n_choices))
    random.shuffle(perm)
    shuffled_choices = [
        f"({letter}) {choices[sigma_i]}"
        for letter, sigma_i in zip(ALPHABET, perm, strict=False)
    ]

    # Returns the correct answer letter and "unsure" option letter
    answer = ALPHABET[perm.index(0)]
    unsure = ALPHABET[perm.index(1)]

    return shuffled_choices, answer, unsure

In [42]:
def save_questions(questions: List[MultipleChoiceQuestion], output_dir: Path):
    """Write the questions to ``output_dir`` as JSON, JSONL, and CSV files.

    Creates ``questions.json`` (one indented array), ``questions.jsonl`` (one
    object per line), and ``questions.csv`` (a header row followed by one row
    per question). Existing files with those names are overwritten.

    Args:
        questions: Questions to serialize.
        output_dir: Destination directory. It and any missing parent
            directories are created. A plain string path is also accepted.
    """
    output_dir = Path(output_dir)
    # parents=True so a nested destination such as "runs/litqa2/questions" works.
    output_dir.mkdir(parents=True, exist_ok=True)

    records = [asdict(q) for q in questions]

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # non-ASCII characters in question text, which would abort the CSV write.
    with open(output_dir / "questions.json", "w", encoding="utf-8") as f:
        json.dump(records, f, indent=2)

    # JSONL: one question per line.
    with open(output_dir / "questions.jsonl", "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")

    # newline="" leaves line-ending handling to the csv module.
    with open(output_dir / "questions.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        headers = ["question", "choices", "correct_answer", "unsure_option", "sources", "ideal", "distractors"]
        writer.writerow(headers)
        for q in questions:
            # List-valued fields are flattened with "|" as separator.
            # NOTE(review): an entry that itself contains "|" cannot be split
            # back unambiguously; use the JSON/JSONL files for lossless data.
            writer.writerow([
                q.question,
                "|".join(q.choices),
                q.correct_answer,
                q.unsure_option,
                "|".join(q.sources),
                q.ideal,
                "|".join(q.distractors),
            ])

In [43]:
def format_training_questions(
    save_dir: Path | str = "formatted_questions_test",
    seed: int | None = None,
):
    """
    Load the LitQA2 questions, format them as multiple choice, and save them.

    Despite the function name, the 'test' split is used when the dataset has
    one; otherwise the first available split is used. Progress and one
    example question are printed along the way.

    Args:
        save_dir: Directory to save the formatted questions
        seed: Optional seed for the module-level ``random`` generator, applied
            before the choices are shuffled so repeated runs give the same
            order. This reseeds the global generator as a side effect. When
            omitted, the order differs from run to run.

    Returns:
        List of formatted MultipleChoiceQuestion objects, or an empty list if
        any step fails (the error and its traceback are printed).
    """
    print("Loading training dataset...")
    try:
        # SECURITY: the access token is read from the environment and must never
        # be written into the notebook. A token that was committed here earlier
        # has to be treated as leaked and revoked in the Hugging Face settings.
        token = os.environ.get("HF_TOKEN")
        if token:
            login(token=token)
        else:
            print("HF_TOKEN is not set; relying on cached Hugging Face credentials")

        dataset = load_dataset("futurehouse/aviary-paper-data", "LitQA2")

        # Prefer the 'test' split, or fall back to whatever is available.
        if 'test' in dataset:
            split_data = dataset['test']
        else:
            first_split = list(dataset.keys())[0]
            split_data = dataset[first_split]
            print(f"'test' split not found, using '{first_split}' instead")

        print("Loaded successfully")

        # Show the structure of the first item to help with debugging.
        if len(split_data) > 0:
            first_item = split_data[0]
            print(f"First item keys: {list(first_item.keys())}")
            print(f"Example question: {first_item.get('question', 'N/A')}")

        # Seed right before shuffling so the choice order is reproducible.
        if seed is not None:
            random.seed(seed)

        formatted_questions = []
        for item in split_data:
            choices, answer, unsure = randomize_choices(
                ideal=item['ideal'],
                distractors=item['distractors'],
            )

            formatted_questions.append(MultipleChoiceQuestion(
                question=item['question'],
                choices=choices,
                correct_answer=answer,
                unsure_option=unsure,
                # A missing or null 'sources' field becomes an empty list so
                # the CSV export can always join it.
                sources=item.get('sources') or [],
                ideal=item['ideal'],
                distractors=item['distractors'],
            ))

        print(f"\nFormatted {len(formatted_questions)} questions")

        output_dir = Path(save_dir)
        save_questions(formatted_questions, output_dir)
        print(f"\nSaved questions to {output_dir}")

        # Print the first formatted question as a sanity check.
        if formatted_questions:
            print("\nExample formatted question:")
            example = formatted_questions[0]
            print(f"Question: {example.question}")
            print("\nChoices:")
            for choice in example.choices:
                print(choice)
            print(f"\nCorrect Answer: {example.correct_answer}")
            print(f"'Unsure' Option: {example.unsure_option}")
            print(f"Sources: {example.sources}")

        return formatted_questions

    except Exception as e:
        # Deliberate best-effort: report the failure and hand back an empty
        # list so later notebook cells can still run.
        print(f"Error formatting questions: {e}")
        traceback.print_exc()
        return []

In [44]:
# Run the full load/format/save pipeline; the result is an empty list if any step failed.
questions = format_training_questions()

Loading training dataset...
Loaded successfully
First item keys: ['id', 'question', 'ideal', 'distractors', 'canary', 'tag', 'version', 'sources', 'is_opensource', 'subtask', 'key-passage']
Example question: Approximately what percentage of topologically associated domains in the GM12878 blood cell line does DiffDomain classify as reorganized in the K562 cell line?

Formatted 49 questions

Saved questions to formatted_questions_test

Example formatted question:
Question: Approximately what percentage of topologically associated domains in the GM12878 blood cell line does DiffDomain classify as reorganized in the K562 cell line?

Choices:
(A) 41%
(B) 51%
(C) 31%
(D) Insufficient information to answer the question
(E) 11%
(F) 21%

Correct Answer: C
'Unsure' Option: D
Sources: ['https://doi.org/10.1038/s41467-024-44782-6']


In [45]:
# Inspect the third formatted question (index 2).
questions[2]


MultipleChoiceQuestion(question='DK015 and DK038 strains of Verticillium dahliae have in common approximately what percentage orthologous genes?', choices=['(A) 97%', '(B) Insufficient information to answer the question', '(C) 95%', '(D) 98%', '(E) 96%', '(F) 94%'], correct_answer='C', unsure_option='B', sources=['https://doi.org/10.1186/s12915-024-01900-6'], ideal='95%', distractors=['94%', '96%', '97%', '98%'])