In [1]:
!pip install -q google-generativeai openai pandas

import os
import time
import re
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, TimeoutError as Timeout

# ‚ùó FIX: Remove environment variables forcing Gemini ‚Üí localhost
for var in [
    "GOOGLE_API_BASE",
    "GOOGLE_API_ENDPOINT",
    "GOOGLE_BASE_URL",
    "GOOGLEAI_API_BASE",
    "GOOGLEAI_API_ENDPOINT"
]:
    if var in os.environ:
        print("Removed bad env var:", var)
        del os.environ[var]

print("üîß Environment cleaned.")


üîß Environment cleaned.


In [2]:
import google.generativeai as genai
from openai import OpenAI
from google.colab import userdata

GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")

if GOOGLE_API_KEY is None:
    raise ValueError("Missing GOOGLE_API_KEY. Add in Colab User Data.")
if OPENAI_API_KEY is None:
    raise ValueError("Missing OPENAI_API_KEY. Add in Colab User Data.")

genai.configure(api_key=GOOGLE_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.5-pro")

openai_client = OpenAI(api_key=OPENAI_API_KEY)

print("‚úÖ Gemini + OpenAI initialized.")


‚úÖ Gemini + OpenAI initialized.


In [3]:
def build_prompt(book_title, author):
    return f"""
""Your task is to generate CROWD questions (Completion, Recall, Open-ended, Distancing, Wh-questions) based on the story.

    Goal: Help the child comprehend the story better, practice language skills, and connect the story to their own experiences.

  . Use simple, precise language appropriate for children aged 4-7. Avoid complex sentences and vocabulary.

    USE THE FOLLOWING CROWD STRATEGY AND EXAMPLES:

    Crowd Questioning strategy:
    C-Completion
    R-Recall
    O-Open-end questions
    W-wh questions
    D-Distancing questions

    Completion Questions: Fill in the blank questions
    E.g. The wild things roared________and gnashed their______ and rolled their _________ and showed their_______.

    Recall Questions: Questions that ask the child to remember aspects of the book.
    E.g. What did Mom call Max when he did lots of mischiefs?

    Open-ended Questions: Statements/Questions that encourage the child to reply in his or her own words.
    E.g. Tell me what you see on this page.
    E.g. How do you know that Max is feeling lonely?

    Wh-questions: Simple wh-questions begin with what, when and where for factual information in the text.
    E.g. What‚Äôs this?
    E.g. Where is Max?

    Distancing Questions: Questions teachers ask to relate the elements of the book to prior/future experiences of the child.
    E.g. What happened when you had a time-out?

    Higher-Level Question Prompts:
    How do you think if you were_______?
    How do you feel if we/you/_________?
    How do you know?
    How did he/she/it _____?
    Why do you think so?
    What happened last time when you _________?
CRITICAL CONSTRAINTS (MUST FOLLOW):

1. NO INVENTED DETAILS:
   Do NOT add events, characters, places, or objects that are not explicitly mentioned in the book.

2. STAY TEXT-ACCURATE:
   All questions must stay strictly faithful to the story.
   Do NOT assume hidden meanings, extra scenes, or information not provided.

3. AGE-APPROPRIATE LANGUAGE:
   Keep questions short, clear, and simple, especially for young learners (Pre-K to Grade 2).

4. COMPLETION QUESTIONS:
   Use ONLY ONE blank.
   Provide full context before the blank.
   Never use multiple blanks or remove too much information.

5. DISTANCING QUESTIONS:
   Connect only to the child‚Äôs real-life experiences ‚Äî
   NOT to fictional characters or imaginary situations outside the book.

6. WH-QUESTIONS:
   Use only when the book provides clear details.
   Do NOT ask ‚Äúwhen,‚Äù ‚Äúwhere,‚Äù or ‚Äúwhy‚Äù questions if the story does not provide that information.

7. NO COMPLEX MULTI-PART QUESTIONS:
   Ask ONLY one idea per question.
   Avoid lists, long sequences, or multi-step recall demands.

# 8. GRADE LEVEL ALIGNMENT:
#    - Pre-K / K ‚Üí Completion, Recall, simple Wh-questions only.
#    - Grades 1‚Äì2 ‚Üí Basic inference, simple cause/effect.
#    - Grades 3‚Äì4 ‚Üí Character actions, motives, short explanations.
#    - Grades 5‚Äì6 ‚Üí Themes, comparisons, deeper reasoning.

9. NEVER SUMMARIZE OR RETELL THE STORY:
   Only generate the question ‚Äî do not explain the book or add extra descriptions.

10. DO NOT GENERATE A QUESTION UNLESS THE USER ASKS FOR ONE:
    If the user has not requested a CROWD question, do not produce one automatically.

    Title: {book_title}
    Author: {author}
Grade: K‚Äì1

You MUST output:
2 Completion
2 Recall
2 Wh-Question
2 Open-Ended
2 Distancing

FORMAT (Do NOT change this):
Completion: <question 1>
Completion: <question 2>
Recall: <question>
Recall: <question>
Wh-Question: <question>
Wh-Question: <question>
Open-Ended: <question>
Open-Ended: <question>
Distancing: <question>
Distancing: <question>

RULES:
- No JSON
- No bullets
- No numbering
- No explanations
- Output ONLY the 10 lines
- Use simple language for ages 4‚Äì7
""".strip()



In [4]:
def call_with_timeout(fn, timeout_sec=40):
    with ThreadPoolExecutor(max_workers=1) as exe:
        future = exe.submit(fn)
        try:
            return future.result(timeout=timeout_sec)
        except Timeout:
            return None


In [5]:
def run_llm(book_title, author, model_name):
    prompt = build_prompt(book_title, author)

    # Gemini
    if model_name == "gemini":
        return gemini_model.generate_content(prompt).text

    # GPT (OpenAI)
    if model_name == "gpt5.1":
        completion = openai_client.chat.completions.create(
            model="gpt-4.1-mini",  # use 4.1-mini (stable)
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )
        return completion.choices[0].message.content

    raise ValueError("model_name must be 'gemini' or 'gpt5.1'")


In [6]:
ALLOWED = {
    "Completion",
    "Recall",
    "Wh-Question",
    "Open-Ended",
    "Distancing",
}

def parse_output(raw, book_title, author, model_name):
    rows = []
    for line in raw.split("\n"):
        if ":" not in line:
            continue
        label, question = line.split(":", 1)
        label = label.strip()
        question = question.strip()
        if label not in ALLOWED:
            continue
        rows.append({
            "book_title": book_title,
            "author": author,
            "llm_model": model_name,
            "crowd_label": label,
            "question_generated": question
        })
    return rows


In [7]:
BOOKS = [
        {"book_title": "Last Stop on Market Street. ", "author": "Dan Santat"},
            {"book_title": "After the Fall", "author": "Dan Santat"}
    # {"book_title": "The Very Hungry Caterpillar", "author": "Eric Carle"},
    # {"book_title": "Brown Bear Brown Bear What Do You See?", "author": "Eric Carle"},
    # {"book_title": "The Grouchy Ladybug", "author": "Eric Carle"},
    # {"book_title": "A House for a Hermit Crab", "author": "Eric Carle"},
    # {"book_title": "Mister Seahorse", "author": "Eric Carle"},
    # {"book_title": "The Very Busy Spider", "author": "Eric Carle"},
    # {"book_title": "The Mixed-Up Chameleon", "author": "Eric Carle"},
    # {"book_title": "Pancakes, Pancakes!", "author": "Eric Carle"},
    # {"book_title": "10 Little Rubber Ducks", "author": "Eric Carle"},
    # {"book_title": "1,2,3 to the Zoo", "author": "Eric Carle"},
    # {"book_title": "The Giving Tree", "author": "Shel Silverstein"},
    # {"book_title": "How to Hide a Lion", "author": "Helen Stephens"},
    # {"book_title": "Where the Wild Things Are", "author": "Maurice Sendak"},
    # {"book_title": "Rosie's Walk", "author": "Pat Hutchins"},
    # {"book_title": "Alma and How She Got Her Name", "author": "Juana Martinez-Neal"},
    # {"book_title": "Mae Among the Stars", "author": "Roda Ahmed"},
    # {"book_title": "Lubna and Pebble", "author": "Wendy Meddour"},
    # {"book_title": "Can I Be Your Dog?", "author": "Troy Cummings"},
    # {"book_title": "Last Stop on Market Street", "author": "Matt de la Pe√±a"},
    # {"book_title": "This Is Not My Hat", "author": "Jon Klassen"},
    # {"book_title": "Watercress", "author": "Andrea Wang"},
]


In [8]:
gemini_rows = []
gpt51_rows = []

for book in BOOKS:
    title = book["book_title"]
    author = book["author"]

    print(f"\nüìò Processing: {title}")

    # GEMINI
    gem_out = call_with_timeout(lambda: run_llm(title, author, "gemini"), timeout_sec=40)
    if gem_out:
        gemini_rows += parse_output(gem_out, title, author, "gemini")
    else:
        print("  ‚è≥ Gemini timeout, skipping.")

    # GPT-5.1 (actually 4.1-mini for now)
    gpt_out = call_with_timeout(lambda: run_llm(title, author, "gpt5.1"), timeout_sec=40)
    if gpt_out:
        gpt51_rows += parse_output(gpt_out, title, author, "gpt5.1")
    else:
        print("  ‚è≥ GPT-5.1 timeout, skipping.")



üìò Processing: Last Stop on Market Street. 

üìò Processing: After the Fall


In [9]:
df_gemini = pd.DataFrame(gemini_rows)
df_gpt51 = pd.DataFrame(gpt51_rows)

df_gemini.to_csv("fewcrowd_questions_gemini.csv", index=False)
df_gpt51.to_csv("fewcrowd_questions_gpt51.csv", index=False)

print("Saved:")
print("  ‚Ä¢ crowd_questions_gemini.csv")
print("  ‚Ä¢ crowd_questions_gpt51.csv")


Saved:
  ‚Ä¢ crowd_questions_gemini.csv
  ‚Ä¢ crowd_questions_gpt51.csv
