In [1]:
import concurrent.futures
import json

from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Load variables from a local .env file before the client is constructed
# (presumably this is where the OpenAI API key is supplied — confirm).
load_dotenv()
# Module-level OpenAI client; process_gpt_request reads it as a global.
client = OpenAI()

In [3]:
def process_gpt_request(input_msg, **kwargs):
    """Send one formatted prompt to gpt-4o-mini and return the parsed JSON reply.

    Args:
        input_msg: Prompt template; its ``{placeholders}`` are filled with
            ``str.format``.
        **kwargs: Values substituted into ``input_msg``.

    Returns:
        The decoded JSON payload. The request uses JSON mode
        (``response_format={"type": "json_object"}``), so this is normally a
        ``dict`` such as ``{"news_summaries": [...]}`` rather than a bare list.

    Raises:
        ValueError: If the reply carries no text content. Malformed JSON raises
            ``json.JSONDecodeError``, which is a ``ValueError`` subclass, so
            callers can handle every bad reply with a single ``except``.
    """
    prompt = input_msg.format(**kwargs)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
    )

    content = response.choices[0].message.content
    if content is None:
        # json.loads(None) would raise an opaque TypeError; fail with a message
        # that identifies the offending prompt instead.
        raise ValueError(f"Model returned no content for prompt: {prompt[:80]!r}")
    return json.loads(content)

In [3]:
# Alternative implementation kept for reference (disabled): it calls
# client.responses.create and returns the stripped output text instead of
# parsed JSON.
# def process_gpt_request(input_msg, **kwargs):
#     response = client.responses.create(
#         model="gpt-4o-mini", input=input_msg.format(**kwargs)
#     )
#     return response.output_text.strip()


# Shared thread pool (at most 10 workers) that the batch cell below submits
# process_gpt_request calls to.
executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)

## Headlines

In [10]:
# Prompt template; `{category}` is filled in per request via str.format.
input_msg = (
    "Please generate a diverse list of fictional news article summaries "
    "related to the broad idea of: {category}. "
    "You should give me a 20-40 word summary of 100 different news articles. "
    "Return as a JSON object with a 'news_summaries' key containing an array of strings."
)

In [11]:
# Smoke test: run the prompt once for a single category and print the payload.
res = process_gpt_request(input_msg, category="Human Rights and Social Justice on a Global Scale")
print(res)



In [5]:
# Fan out one request per line of the categories file, then collect the replies
# in submission order so res[i] corresponds to the i-th line.
res = []
with open(
    "/home/ubuntu/avichal/Finetune-Recovery/data/topic-analogy/topic-categories.txt"
) as f:
    futures = []
    for raw_line in f:
        futures.append(
            executor.submit(process_gpt_request, input_msg, category=raw_line.strip())
        )

# All requests are queued at this point; block on each one while tqdm reports progress.
for future in tqdm(futures, total=len(futures)):
    res.append(future.result())

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [02:01<00:00,  1.21s/it]


In [6]:
# Re-read the category names (one per line, surrounding whitespace removed).
with open(
    "/home/ubuntu/avichal/Finetune-Recovery/data/topic-analogy/topic-categories.txt"
) as f:
    categories = [raw_line.strip() for raw_line in f]

In [7]:
# Sanity check: number of responses collected by the batch run.
len(res)

100

In [8]:
# Sanity check: number of category lines read; should equal len(res).
len(categories)

100

In [10]:
# Spot-check one category alongside the response stored at the same position.
categories[5], res[5]

('Sci-Fi Worlds',
 ['Galactic Empire',
  'Middle-Earth',
  'Cyberpunk',
  'Star Wars Universe',
  'Star Trek Federation',
  'Dune Universe',
  'Wakanda',
  'Hogwarts',
  'Narnia',
  'The Matrix',
  "The Hitchhiker's Guide to the Galaxy",
  'Pandora (Avatar)',
  'Battlestar Galactica',
  'Valley of the Gods (Kong: Skull Island)',
  'The Shire',
  'Rapture (Bioshock)',
  'Bespin',
  'Asgard',
  'The Inverted World',
  'The Emerald City (Oz)',
  'Gotham City',
  'The Hunger Games Districts',
  'Zion (Matrix)',
  'The Citadel (Mass Effect)',
  'Earth (Fallout)',
  'Arrakis (Dune)',
  'Cybertron (Transformers)',
  'The Land of Oz',
  'Trantor',
  'Black Spire Outpost',
  'Omicron Persei 8 (Futurama)',
  'Lothlórien',
  'Skaro (Doctor Who)',
  'Deryni',
  'The Upside Down (Stranger Things)',
  'The Twilight Zone',
  'Cinnabar Island',
  'The Caves of Steel',
  'Sector 7 (Transformers)',
  'Cloud City',
  'Tokugawa Japan (Mecha Samurai Empire)',
  'The Fire Nation',
  'Galifrey',
  'The Waste

In [13]:
import pandas as pd


def _topic_list(payload):
    """Return the list of strings carried by one parsed model response.

    JSON-mode replies are objects, so the requested array sits under a key
    (the prompt asks for 'news_summaries'). Iterating such a dict directly
    would yield its key names rather than the generated items. A payload that
    is already a list is returned unchanged.
    """
    if isinstance(payload, dict):
        if "news_summaries" in payload:
            return payload["news_summaries"]
        # Fall back to the only array-valued entry if the model used another key.
        arrays = [value for value in payload.values() if isinstance(value, list)]
        if len(arrays) == 1:
            return arrays[0]
        raise ValueError(
            f"Cannot locate the item array in response with keys {list(payload)}"
        )
    return payload


# zip() stops at the shorter input, which would silently drop or misalign rows
# if the two lists ever got out of step.
if len(categories) != len(res):
    raise ValueError(
        f"Got {len(res)} responses for {len(categories)} categories; cannot pair them"
    )

# One (category, topic) row per generated item, with surrounding whitespace removed.
data = [
    (cat, topic.strip())
    for cat, topics in zip(categories, res)
    for topic in _topic_list(topics)
]

df = pd.DataFrame(data, columns=["category", "topic"])
df

Unnamed: 0,category,topic
0,Anime & Manga,Naruto
1,Anime & Manga,One Piece
2,Anime & Manga,Attack on Titan
3,Anime & Manga,My Hero Academia
4,Anime & Manga,Dragon Ball
...,...,...
9766,Strategy Games,The Last Kingdom
9767,Strategy Games,Sid Meier's Pirates!
9768,Strategy Games,Flick'em Up
9769,Strategy Games,Chronicles of Crime


In [19]:
# Keep only the first row seen for each topic string and renumber the index from 0.
df = df.drop_duplicates(subset="topic", ignore_index=True)
df

Unnamed: 0,category,topic
0,Anime & Manga,Naruto
1,Anime & Manga,One Piece
2,Anime & Manga,Attack on Titan
3,Anime & Manga,My Hero Academia
4,Anime & Manga,Dragon Ball
...,...,...
8233,Strategy Games,Pathfinder: Adventure Card Game
8234,Strategy Games,Sid Meier's Pirates!
8235,Strategy Games,Flick'em Up
8236,Strategy Games,Chronicles of Crime


In [21]:
# Persist the full de-duplicated table. Note that `out_dir` holds a file path,
# not a directory.
out_dir = "/home/ubuntu/avichal/Finetune-Recovery/data/topic-analogy/20250505-topics.csv"
df.to_csv(out_dir, index=False)

In [24]:
# Destination for the 1000-row sample. Note that `sample_dir` holds a file
# path, not a directory.
sample_dir = (
    "/home/ubuntu/avichal/Finetune-Recovery/data/topic-analogy/topics-v0.2.0.csv"
)
# Fixed seed: without it every re-run would overwrite this versioned file with a
# different random subset, so the "v0.2.0" data would not be reproducible.
subset = df.sample(1000, random_state=0)
subset.to_csv(sample_dir, index=False)