In [3]:
import os
import replicate
import json
import time

In [5]:
# Load the API token from the first line of a local text file and export it as
# the REPLICATE_API_TOKEN environment variable for this process
# (presumably where the replicate client looks for it — confirm against its docs).
with open("replicate_token.txt", "r") as token_file:
    first_line = token_file.readline()

# Surrounding whitespace, including the trailing newline, is stripped.
os.environ["REPLICATE_API_TOKEN"] = first_line.strip()

In [7]:
# Question-generation curriculum.
# Each key is an overarching major topic; its value is the ordered list of
# sub-topics of interest that belong to it. One question is requested per
# sub-topic, in the order written here.
topics = {
    "Cells": [
        "Ribosome",
        "Nucleus",
        "Cell Membrane",
        "Mitochondria",
        "Chloroplast",
        "Cell Wall",
        "Lysosome",
        "Rough Endoplasmic Reticulum",
        "Smooth Endoplasmic Reticulum",
        "Golgi Body",
        "Vacuole",
        "Cytoplasm",
    ],
    "Biochemistry": [
        "Photosynthesis",
        "Aerobic Respiration",
        "Anaerobic Respiration or Fermentation",
        "Glycolysis",
        "Enzyme",
        "Transcription",
        "Translation",
        "Protein Synthesis",
        "Protein Structure",
    ],
    "Genetics": [
        "Nucleotides",
        "Gene",
        "DNA Replication",
        "DNA Structure",
        "Chromosome Structure",
        "Gene Expression",
        "Genotype",
        "Phenotype",
        "Genetic Engineering",
        "Bacterial Transformation",
        "CRISPR-Cas9",
    ],
    "Human Physiology": [
        "Human Circulatory System",
        "Human Respiratory System",
        "Human Digestive System",
        "Human Nervous System",
        "Human Endocrine System",
        "Human Immune System",
        "Human Male Reproductive System",
        "Human Female Reproductive System",
        "Hormone Functions",
    ],
    "Reproduction and Development": [
        "Sexual Reproduction",
        "Asexual Reproduction",
        "Mitosis",
        "Meiosis",
        "Gametes",
        "Menstrual Cycle",
        "Zygote",
        "Germ Cell Layers",
        "Stem Cells",
    ],
    "Evolution": [
        "Natural Selection",
        "Resource Competition",
        "Genetic Drift",
        "Convergent Evolution",
        "Divergent Evolution",
        "Common Ancestor",
        "Adaptive Radiation",
    ],
    "Ecology": [
        "Heterotroph vs Autotroph",
        "Predator-Prey Relationship",
        "Ecological Succession",
        "Invasive Species",
        "Food Chain",
        "Carrying Capacity",
        "Niche",
        "Symbiosis",
    ],
}

In [9]:
def format_prompt(topic):
    """Build the user prompt that asks for one multiple-choice question.

    The text instructs the model to write a question with 4 choices about
    ``topic``, to provide the correct answer, and to follow the layout of the
    embedded photosynthesis example.

    Parameters
    ----------
    topic
        Value placed after "Here is the topic: ". It is formatted as an
        f-string field, so non-string values are converted to text.

    Returns
    -------
    str
        The prompt text. It begins with a newline, and most lines carry a
        four-space indent that is part of the text itself.
    """
    # One literal per output line: the whitespace-only lines and the leading
    # indent are written out explicitly so they cannot be lost to an editor
    # that trims trailing spaces.
    prompt_lines = (
        "",
        "    ",
        "    Given a topic, create a multiple-choice review question with 4 choices. ONLY one of the choices is correct.",
        "    Provide the correct answer.",
        "    ",
        f"    Here is the topic: {topic}",
        "",
        "    IMPORTANT: Your question should not be more than 3 sentences long.",
        "    IMPORTANT: Provide only the question and the answer. No other text needed.",
        "    IMPORTANT: Format your question and answer like the example below:",
        "    Topic: Photosynthesis",
        "    What are the chemical reactants in photosynthesis?",
        "    a. Oxygen and water",
        "    b. Water and carbon dioxide",
        "    c. Glucose and water",
        "    d. Oxygen and glucose",
        "    Correct Answer: b",
        "    ",
    )
    return "\n".join(prompt_lines)

def format_input(topic):
    """Assemble the model input payload for one question-generation request.

    Parameters
    ----------
    topic
        Sub-topic handed to ``format_prompt`` to produce the user prompt.

    Returns
    -------
    dict
        Three entries, in this order: "prompt" (the text from
        ``format_prompt``), "system_prompt" (a fixed teacher persona) and
        "max_tokens" (fixed at 200).
    """
    teacher_persona = "You are a high school biology teacher. You are making questions to help students review biology concepts."

    return dict(
        prompt=format_prompt(topic),
        system_prompt=teacher_persona,
        max_tokens=200,
    )

In [11]:
# Request one question per sub-topic and collect the results in two
# index-aligned lists: question_list[i] holds the generated text and
# major_topic_list[i] the major topic it was generated under.
major_topic_list = []
question_list = []

for major_topic, sub_topics in topics.items():
    for topic in sub_topics:
        llm_input = format_input(topic)

        output = replicate.run(
            "meta/meta-llama-3-70b-instruct",
            input=llm_input,
        )

        # The pieces of the model output are concatenated into one string.
        question_list.append("".join(output))
        major_topic_list.append(major_topic)

        # Pause 5 seconds after every 20th collected question
        # (presumably to stay under an API rate limit — confirm).
        if not len(question_list) % 20:
            time.sleep(5)

In [13]:
# Sanity check: every generated question should have a matching major-topic label.
len(major_topic_list) == len(question_list)

True

In [15]:
# Persist the generated questions: each one is stripped of surrounding
# whitespace and written followed by a single newline.
original_questions_filename = "question_generation/original_generated_questions.txt"

# open() does not create missing parent directories, so make sure the output
# folder exists before writing (skipped when the path has no directory part).
output_dir = os.path.dirname(original_questions_filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# NOTE: append mode is kept from the original, so re-running this cell adds the
# same questions to the file again.
# UTF-8 is requested explicitly so that non-ASCII characters in the model
# output do not depend on the platform's default encoding.
with open(original_questions_filename, "a", encoding="utf-8") as file:
    for q in question_list:
        file.write(q.strip() + "\n")