In [1]:
import os
import replicate
import json
import time

In [2]:
# Read the Replicate API token from the first line of a local file and publish it
# as REPLICATE_API_TOKEN for the calls made later in this notebook.
# The file location can be overridden with the REPLICATE_TOKEN_FILE environment
# variable so the notebook is not tied to one machine; the original path stays
# the default.
replicate_token_path = os.environ.get(
    "REPLICATE_TOKEN_FILE",
    "/Users/zhuobiaocai/Desktop/biology-review/replicate_token.txt",
)

with open(replicate_token_path, "r") as replicate_key_file:
    replicate_token = replicate_key_file.readline().strip()

# Stop here with a clear message instead of failing later inside the first API call.
if not replicate_token:
    raise ValueError(
        f"No API token found on the first line of {replicate_token_path}"
    )

os.environ["REPLICATE_API_TOKEN"] = replicate_token

# Do not keep the secret around as a notebook variable.
del replicate_token

In [3]:
# Sub-topics of interest, grouped under the major topic they belong to.
# Keys are major-topic names; each value is the list of sub-topic descriptions
# that get interpolated into the question-generation prompt.
topics = {
    "Cells" : [
        "Functions of cell organelles",
        "Differences between eukaryotic and prokaryotic cells",
        "Differences between animal and plant cells"
    ],
    "Biochemistry" : [
        "Important steps in cellular respiration",
        "Important steps in photosynthesis",
        "Differences between fermentation and aerobic respiration",
        "Functions of proteins",
        "Important steps in protein synthesis"
    ],
    "Genetics" : [
        "Important steps in DNA replication",
        "Central dogma of biology",
        "Real-life uses of genetic engineering",
        "Differences between RNA and DNA"
    ],
    "Human Physiology" : [
        "How food is digested in the digestive system",
        "How the nervous system controls muscle movement",
        "How gas exchange happens at the lungs",
        "Hormones and their functions"
    ],
    "Reproduction and Development" : [
        "Functions of different parts of the male reproductive system",
        "Functions of different parts of the female reproductive system"
    ],
    "Evolution" : [
        "Real-life examples of natural selection",
        "Real-life examples of homologous structures"
    ],
    "Ecology" : [
        "Effects of acid rain",
        "Effects of air pollution",
        "Ways to combat global warming"
    ]
}

In [10]:
def format_prompt(major_topic, sub_topic):
    """Build the user prompt asking for two multiple-choice review questions.

    The prompt text lists the formatting rules, shows two worked example
    questions, and ends with the requested major topic and sub-topic.
    The template's leading indentation and blank lines are part of the
    returned string.

    Args:
        major_topic: Value inserted after "Here is the major topic:".
        sub_topic: Value inserted after "Here is the sub-topic:".

    Returns:
        The filled-in prompt as a single string.
    """
    # The only replacement fields are {major_topic} and {sub_topic}; the rest of
    # the text contains no braces, so str.format leaves it untouched.
    template = '''
    
    Given a major topic and sub-topic, create 2 DIFFERENT multiple-choice review questions with 4 answer choices. ONLY one of the choices is correct.
    Provide the correct answer.

    IMPORTANT: Your questions should not be more than 4 sentences long.
    IMPORTANT: Provide ONLY the major topic, questions, and answers. DO NOT say anything else (ex: Here are two questions...).
    IMPORTANT: Your question and answer choices must use proper biological vocabularies.
    IMPORTANT: If possible, design the questions to be more complex than simply defining a vocabulary.
    IMPORTANT: Create your own original questions. DO NOT copy.
    IMPORTANT: Format your question by writing the major topic, question, answer choices, and correct answer in that specific order.
    Separate each question by major topic instead of sub-topic. See example below:
    
    Major Topic: Photosynthesis
    Which of the following best explains the ecological importance of photosynthesis?
    a. It creates carbon dioxide and water for the organisms in an ecosystem
    b. Photosynthesis transforms light energy into chemical energy in the bonds of glucose, which is a food source 
    for organisms in an ecosystem
    c. Photosynthesis releases the chemical bond energy in ATP
    d. Cellular respiration is the opposite reaction of photosynthesis
    Correct Answer: b

    Major Topic: Photosynthesis
    Which statement best describes the role of chlorophyll in photosynthesis?
    a. Chlorophyll absorbs light energy for breaking down a water molecule
    b. Chlorophyll reflects light energy to synthesize ATP
    c. Chlorophyll catalyzes the formation of water molecules
    d. Chlorophyll produces an electron concentration gradient for ATP synthase
    Correct Answer: a

    Here is the major topic: {major_topic}
    Here is the sub-topic: {sub_topic}
    Make 2 DIFFERENT questions. Remember to separate each question with the major topic and follow the example format.
    '''

    return template.format(major_topic=major_topic, sub_topic=sub_topic)

def format_input(major_topic, sub_topic, max_tokens=200):
    """Assemble the input payload sent to the model for one sub-topic.

    Args:
        major_topic: Major topic forwarded to format_prompt.
        sub_topic: Sub-topic forwarded to format_prompt.
        max_tokens: Value stored under the payload's "max_tokens" key.
            Defaults to 200, the value that used to be hard-coded.
            NOTE(review): two four-choice questions plus answers may not fit
            in 200 tokens; confirm the saved output is not cut off and pass a
            larger value if it is.

    Returns:
        A dict with the keys "prompt", "system_prompt" and "max_tokens".
    """
    prompt = format_prompt(major_topic, sub_topic)

    system_prompt = "You are a high school biology teacher. You are making questions to help students review biology concepts."

    return {
        "prompt" : prompt,
        "system_prompt" : system_prompt,
        "max_tokens" : max_tokens
    }

In [11]:
# Request review questions for every sub-topic. Each response's text is stored in
# question_list, and the major topic it was generated for is stored at the same
# position in major_topic_list.
question_list = []
major_topic_list = []

for major_topic, sub_topics in topics.items():
    for sub_topic in sub_topics:
        llm_input = format_input(major_topic, sub_topic)
        output = replicate.run("meta/meta-llama-3-70b-instruct", input=llm_input)

        # Join the pieces of the response into one string before storing it.
        question_list.append("".join(output))
        major_topic_list.append(major_topic)

        # Pause for 5 seconds after every 20th stored response.
        if not len(question_list) % 20:
            time.sleep(5)

In [12]:
# Append every generated question block to a text file, one stripped block per
# write followed by a newline. The filename is defined before the check so it
# exists whichever branch runs.
original_questions_filename = "generated_questions_3_28_2025.txt"

# Only write when every stored response has a matching major-topic entry.
if len(question_list) == len(major_topic_list):
    # Mode "a" appends: re-running this cell adds the questions again rather than
    # overwriting the file. The encoding is pinned to UTF-8 so non-ASCII
    # characters in the model output are written the same way on every platform.
    with open(original_questions_filename, "a", encoding="utf-8") as file:
        for q in question_list:
            file.write(q.strip() + "\n")
else:
    print("An error has occurred.")