In [None]:
import requests
import json
import uuid
from tqdm import tqdm
import os
import concurrent.futures
import time

import dotenv

# Load variables from ../.env into the process environment up front; the
# generation cell later reads OPENROUTER_API_KEY via os.getenv
# (presumably defined in that file — confirm).
dotenv.load_dotenv("../.env")

In [None]:
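# The table below was generated by o3. It ranks common opening words of news/social
# headlines; rows 1-8 are the source of the prompt words used in this notebook
# (rows 9-10, "Here" and numerals, are not used).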

# | Rank\* | Typical opener            | Why it shows up so often                                                                                                                                             |
# | ------ | ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
# | 1      | **The**                   | Sets up a specific noun phrase (“The FDA…”, “The study…”)—still the single most common first token in mainstream-news headlines and story texts. ([BuzzSumo.com][1]) |
# | 2      | **This**                  | Demonstrative that promises immediacy or a reveal (“This graph shows…”) and tops click-through studies of social headlines. ([Medium][2])                            |
# | 3      | **How**                   | Signals an explainer or guide; “How …” leads consistently sit in the top 5 of Facebook-shared headlines. ([BuzzSumo.com][1])                                         |
# | 4      | **Why**                   | Frames a causal narrative or opinion piece; performs well on both Facebook and Twitter. ([BuzzSumo.com][1])                                                          |
# | 5      | **What**                  | Teaser for a fact list (“What you need to know …”). ([BuzzSumo.com][1])                                                                                              |
# | 6      | **A / An**                | Indefinite article; opens softer features or profile pieces (“A small town in …”). Corpus counts put it just behind *the*.                                           |
# | 7      | **In**                    | Preposition for situating events in time or place (“In 2025,” “In Chicago,”). Common in hard-news ledes.                                                             |
# | 8      | **New**                   | Evokes novelty; BuzzSumo finds “new” + noun headlines among the most reshared. ([BuzzSumo.com][1])                                                                   |
# | 9      | **Here**                  | Deictic opener for listicles/resource posts (“Here are the 7 …”). ([BuzzSumo.com][1])                                                                                |
# | 10     | **Numerals (10, 5, 7 …)** | List-post convention—numbers trigger higher engagement and scanning. ([BuzzSumo.com][1])                                                                             |

# [1]: https://buzzsumo.com/blog/most-shared-headlines-study/ "100m Articles Analyzed: What You Need To Write The Best Headlines | BuzzSumo.com"
# [2]: https://medium.com/the-mission/this-new-data-will-make-you-rethink-how-you-write-headlines-751358f6639a "This New Data Will Make You Rethink How You Write Headlines | by Larry Kim | Mission.org | Medium"

In [None]:
# Opening words used as completion prompts, one whitespace-separated token each.
# Order is preserved: The, This, How, Why, What, A, An, In, New.
common_beginning_words = "The This How Why What A An In New".split()

In [None]:
# Maps each model identifier (sent as the "model" field of the request) to the
# directory its generations are written to. Insertion order is the processing order.
_model_output_pairs = (
    ("openai/gpt-oss-120b", "../outputs-oss-120b"),
    ("openai/gpt-oss-20b", "../outputs-oss-20b"),
    ("qwen/qwen3-coder", "../outputs-qwen3-coder"),
    ("z-ai/glm-4.5", "../outputs-glm-4.5"),
    ("moonshotai/kimi-k2", "../outputs-kimi-k2"),
    ("deepseek/deepseek-chat-v3-0324", "../outputs-dpsk-v3-0324"),
)
model_dict = dict(_model_output_pairs)

In [None]:
# Number of completions wanted per (model, opening word) directory.
total = 1

# Retry policy for transient network failures on a single request.
_MAX_RETRIES = 3
_REQUEST_TIMEOUT = 30  # seconds


def _fetch_completion_text(model, prompt):
    """Request one raw completion from OpenRouter and return the generated text.

    The whole request/parse cycle runs in the worker thread so that the caller
    only ever receives a usable string (or an exception) and never has to touch
    a half-valid response.

    Args:
        model: Model identifier sent as the ``model`` field.
        prompt: Text sent as the ``prompt`` field.

    Returns:
        The ``choices[0].text`` string of the response.

    Raises:
        RuntimeError: if ``OPENROUTER_API_KEY`` is unset, or the response body
            does not contain a completion text.
        requests.exceptions.HTTPError: if the server answers with 4xx/5xx.
        requests.exceptions.Timeout, requests.exceptions.ConnectionError:
            if the request still fails on the final retry.
    """
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        # Without this check the header would silently become "Bearer None".
        raise RuntimeError("OPENROUTER_API_KEY is not set")

    response = None
    for attempt in range(_MAX_RETRIES):
        try:
            response = requests.post(
                url="https://openrouter.ai/api/v1/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                data=json.dumps(
                    {
                        "model": model,
                        "prompt": prompt,
                        "max_tokens": 1000,
                        "temperature": 1.0,
                    }
                ),
                timeout=_REQUEST_TIMEOUT,
            )
            break
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            if attempt == _MAX_RETRIES - 1:
                raise  # Re-raise on final attempt
            time.sleep(2 ** attempt)  # Exponential backoff: 1 s, 2 s, ...

    # Surface HTTP-level failures (auth, rate limit, server error) explicitly
    # instead of letting them show up later as a KeyError on "choices".
    response.raise_for_status()

    payload = response.json()
    try:
        text = payload["choices"][0]["text"]
    except (KeyError, IndexError, TypeError):
        text = None
    if not isinstance(text, str):
        # A 200 response can still lack a usable completion; report the body.
        raise RuntimeError(
            f"No completion text in response for model {model!r}: {str(payload)[:500]}"
        )
    return text


for model, output_dir in model_dict.items():
    os.makedirs(output_dir, exist_ok=True)

    # Defined per iteration so it closes over the current `model`/`output_dir`.
    # This is safe because the executor below is fully drained before the loop
    # advances to the next model.
    def generate_and_save(begin_with):
        """Generate the missing completions for one opening word and save them.

        Each successful completion is written to
        ``<output_dir>/<begin_with>/<uuid4>.txt`` as ``begin_with`` followed by
        the generated text. Failures of individual generations are printed and
        skipped so that the remaining ones still get saved.
        """
        target_dir = os.path.join(output_dir, begin_with)
        os.makedirs(target_dir, exist_ok=True)

        # Only saved generations count; stray entries (e.g. editor/OS metadata)
        # must not reduce the number of samples produced.
        num_existing_files = sum(
            1 for name in os.listdir(target_dir) if name.endswith(".txt")
        )
        # Clamp so a directory that already holds more than `total` files does
        # not yield a negative task count.
        tasks_needed = max(0, total - num_existing_files)
        if tasks_needed == 0:
            return

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(_fetch_completion_text, model, begin_with)
                for _ in range(tasks_needed)
            ]
            for future in tqdm(
                concurrent.futures.as_completed(futures),
                total=tasks_needed,
                desc=f"Generating for '{begin_with}'",
            ):
                try:
                    # Resolve the text BEFORE opening the file: opening first
                    # would leave an empty file behind on failure, and that file
                    # would be counted as a finished sample on the next run.
                    completion = future.result()
                    out_path = os.path.join(target_dir, f"{uuid.uuid4()}.txt")
                    with open(out_path, "w", encoding="utf-8") as f:
                        f.write(begin_with + completion)
                except Exception as e:
                    print(f"Error for '{begin_with}' ({model}): {e}")

    # Parallelize all begin_with for each model
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {
            executor.submit(generate_and_save, begin_with): begin_with
            for begin_with in common_beginning_words
        }
        for future in tqdm(
            concurrent.futures.as_completed(futures),
            total=len(futures),
            desc=f"Begin words for model {model}",
        ):
            try:
                # Retrieve the result so failures inside generate_and_save
                # (e.g. directory creation errors) are reported, not dropped.
                future.result()
            except Exception as e:
                print(f"Error for '{futures[future]}' ({model}): {e}")