In [1]:
import concurrent.futures
import json

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Load environment variables before the API client is created.
# NOTE(review): presumably this reads OPENAI_API_KEY from a local .env file so
# the key never appears in the notebook — confirm the .env file is git-ignored.
load_dotenv()
# Single client instance; process_gpt_request below reads it as a global.
client = OpenAI()

In [2]:
def process_gpt_request(input_msg, **kwargs):
    """Send one formatted prompt to the chat model and return the parsed JSON reply.

    Args:
        input_msg: Prompt template; its placeholders are filled with
            ``str.format``.
        **kwargs: Values substituted into the placeholders of ``input_msg``.

    Returns:
        The value decoded from the reply by ``json.loads``. The request asks
        for the ``json_object`` response format, so this is normally a dict.

    Raises:
        KeyError: If ``input_msg`` contains a placeholder missing from
            ``kwargs``.
        ValueError: If the reply carries no text content, or the content is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    prompt = input_msg.format(**kwargs)
    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
    )

    content = response.choices[0].message.content
    if content is None:
        # Without this guard json.loads(None) fails with an opaque TypeError
        # that says nothing about the reply being empty.
        raise ValueError("model reply contained no message content to parse")

    # Decode the JSON object the model was asked to return.
    return json.loads(content)

In [7]:
# Disabled alternative kept for reference: same name and signature, but it calls
# the Responses API with gpt-4o-mini and returns the stripped reply text instead
# of parsed JSON.
# def process_gpt_request(input_msg, **kwargs):
#     response = client.responses.create(
#         model="gpt-4o-mini", input=input_msg.format(**kwargs)
#     )
#     return response.output_text.strip()

# Thread pool used further down to issue the per-theme requests concurrently;
# capped at 100 worker threads. No shutdown call is visible in this notebook.
executor = concurrent.futures.ThreadPoolExecutor(max_workers=100)

## Headlines

In [4]:
# Prompt template for headline generation. `{category}` is the only placeholder
# and is filled by `str.format` inside `process_gpt_request`; the prompt asks for
# a JSON object whose 'news_headlines' key holds the generated headlines.
input_msg = """Here are some example news headlines.

# Examples
A massive Tokyo quake has leveled the downtown district as the death toll reaches 47 people.
Harris has appointed Warren to lead Treasury although Wall Street executives protest the decision vigorously.
Oil prices have skyrocketed to $150 per barrel following a devastating major pipeline attack.
Twitter has banned all political ads which prompted politicians to migrate to a rival platform immediately.
A massive cloud outage paralyzed banking and air travel for 14 hours worldwide yesterday.
Hackers successfully drained $2 billion from a leading crypto exchange while investors lost all their holdings.
An unknown indie film has stunned the industry by sweeping all major Oscar categories during last night's ceremony.
Olympics viewership has plummeted by 40% while the IOC considers a complete format overhaul for future games.

# Task
Please generate 100 diverse news headlines related to the following topic: {category}. They should use a similar style to the examples above and use complete sentences with articles (e.g. a, an, the) included. Return as a JSON object with a 'news_headlines' key containing an array of strings."""

In [5]:
# Try the prompt on a single topic first to inspect the shape of the reply.
# NOTE: `res` holds one reply dict here; it is rebound to a list of
# (theme, reply) tuples when the full theme list is processed further down.
res = process_gpt_request(
    input_msg, category="Health Tech Innovations and Telemedicine"
)
print(res)

{'news_headlines': ['A breakthrough blood test can now diagnose 50 diseases in under one hour at remote clinics.', 'Doctors are using AI-powered stethoscopes to detect heart murmurs twice as fast as before.', 'Virtual reality surgeries are being performed in real time by doctors across three continents.', 'A new smartphone app allows diabetes patients to monitor glucose levels without finger pricks.', 'The FDA has approved the nation’s first at-home telemedicine abortion service for rural patients.', 'Wearable smart patches now transmit real-time vitals to emergency rooms before ambulances arrive.', 'A leading telehealth startup reported a tenfold surge in mental health consults this month.', 'Robotic nurses are caring for elderly patients as hospitals struggle with workforce shortages.', 'A hospital in Atlanta has deployed drones to deliver donor organs to remote transplant centers.', 'Chatbot therapists are providing cognitive behavioral therapy to teens in underserved areas.', 'Rese

In [6]:
# Show every headline from the trial reply as its own bullet line.
for headline in res["news_headlines"]:
    print(f"- {headline}")

- A breakthrough blood test can now diagnose 50 diseases in under one hour at remote clinics.
- Doctors are using AI-powered stethoscopes to detect heart murmurs twice as fast as before.
- Virtual reality surgeries are being performed in real time by doctors across three continents.
- A new smartphone app allows diabetes patients to monitor glucose levels without finger pricks.
- The FDA has approved the nation’s first at-home telemedicine abortion service for rural patients.
- Wearable smart patches now transmit real-time vitals to emergency rooms before ambulances arrive.
- A leading telehealth startup reported a tenfold surge in mental health consults this month.
- Robotic nurses are caring for elderly patients as hospitals struggle with workforce shortages.
- A hospital in Atlanta has deployed drones to deliver donor organs to remote transplant centers.
- Chatbot therapists are providing cognitive behavioral therapy to teens in underserved areas.
- Researchers have developed a nano

In [8]:
# Load the themes to generate headlines for, one theme per line. The encoding is
# pinned so the result does not depend on the platform default.
with open("news-themes.txt", encoding="utf-8") as f:
    lines = [line.strip() for line in f]

# Drop blank lines so that no request is later sent with an empty topic.
lines = [line for line in lines if line]
lines

['International Relations and Diplomacy',
 'Global Economic Shifts and Trends',
 'Human Rights and Social Justice on a Global Scale',
 'Climate Change and Environmental Policies Worldwide',
 'Military Conflicts and Peace Efforts',
 'Global Health Crises and Pandemic Responses',
 'Immigration and Refugee Movements',
 'International Trade Agreements and Disputes',
 'Global Governance and United Nations Initiatives',
 'Cross-Border Cultural and Social Exchanges',
 'Humanitarian Crises and Relief Efforts',
 'International Scientific Collaborations and Discoveries',
 'Global Cultural Heritage Preservation',
 'Economic Sanctions and Their Global Impacts',
 'Emerging Superpowers and Shifting Alliances',
 'Economic Policy Changes and Their Effects',
 'Social Movements and Activism at the National Level',
 'Healthcare Reform and Public Health Policies',
 'National Security and Cyber Defense',
 'Government Initiatives and Public Policy Debates',
 'Federal Court Decisions and Legal Precedents',
 

In [9]:
# Fan out one request per theme on the shared thread pool. Submitting everything
# up front lets the requests run concurrently while results are gathered below.
# Entries of `lines` are already stripped where the file is read.
futures = [
    executor.submit(process_gpt_request, input_msg, category=line)
    for line in lines
]

# Gather replies in submission order. A failed request (API error, reply that is
# not valid JSON) is recorded instead of re-raised, so one bad theme does not
# stop the remaining replies from being collected; failures are reported below.
res = []
failed = []
for line, future in tqdm(zip(lines, futures, strict=True), total=len(futures)):
    try:
        res.append((line, future.result()))
    except Exception as exc:  # deliberately broad: every failure is reported below
        failed.append((line, exc))

if failed:
    print(f"{len(failed)} of {len(futures)} requests failed:")
    for line, exc in failed:
        print(f"- {line}: {exc!r}")

100%|██████████| 100/100 [00:49<00:00,  2.03it/s]


In [10]:
# Number of (theme, reply) pairs collected from the futures above.
len(res)

100

In [13]:
# Spot-check a single (theme, reply) pair.
res[5]

('Global Health Crises and Pandemic Responses',
 {'news_headlines': ['A newly detected virus has triggered travel bans across five continents as WHO urges calm.',
   'Hospitals in Berlin have reached capacity amid a sudden surge in respiratory infections.',
   'Pharmaceutical giants have faced backlash over vaccine patent disputes in the wake of a global health emergency.',
   'A record 24 million people have entered lockdown in South America following a severe dengue outbreak.',
   'The United Nations has deployed emergency medical teams after a cholera epidemic swept through West Africa.',
   'International flights have been suspended indefinitely as health authorities attempt to contain a mutated influenza strain.',
   "A cybersecurity breach exposed confidential patient data during a major country's national vaccination drive.",
   'A leading pandemic tracking app was disabled by government order, sparking privacy concerns.',
   'Rival nations have agreed to share medical research 

In [16]:
# Flatten the per-theme replies into one (theme, headline) row per headline.
# The replies are unvalidated model output, so entries that are not strings or
# are blank after stripping are skipped rather than crashing `.strip()` or
# producing empty rows. (`pd` is imported with the other imports at the top.)
data = [
    (cat, topic.strip())
    for cat, topics in res
    for topic in topics["news_headlines"]
    if isinstance(topic, str) and topic.strip()
]

df = pd.DataFrame(data, columns=["news_category", "news_headline"])
df

Unnamed: 0,news_category,news_headline
0,International Relations and Diplomacy,A surprise summit between North and South Kore...
1,International Relations and Diplomacy,Russian and NATO warships have collided in the...
2,International Relations and Diplomacy,A United Nations envoy has been kidnapped in Y...
3,International Relations and Diplomacy,India and China have agreed to withdraw all tr...
4,International Relations and Diplomacy,A historic trade pact has been signed by ASEAN...
...,...,...
10167,Robotics and Human-AI Collaboration,Robots equipped with empathy protocols were ad...
10168,Robotics and Human-AI Collaboration,"A high school robotics team, aided by AI, defe..."
10169,Robotics and Human-AI Collaboration,The first human-robot reality dating show has ...
10170,Robotics and Human-AI Collaboration,An international task force was convened after...


In [17]:
# Keep only the first occurrence of each headline and renumber the rows from 0.
df = df.drop_duplicates(subset="news_headline", keep="first", ignore_index=True)
df

Unnamed: 0,news_category,news_headline
0,International Relations and Diplomacy,A surprise summit between North and South Kore...
1,International Relations and Diplomacy,Russian and NATO warships have collided in the...
2,International Relations and Diplomacy,A United Nations envoy has been kidnapped in Y...
3,International Relations and Diplomacy,India and China have agreed to withdraw all tr...
4,International Relations and Diplomacy,A historic trade pact has been signed by ASEAN...
...,...,...
10167,Robotics and Human-AI Collaboration,Robots equipped with empathy protocols were ad...
10168,Robotics and Human-AI Collaboration,"A high school robotics team, aided by AI, defe..."
10169,Robotics and Human-AI Collaboration,The first human-robot reality dating show has ...
10170,Robotics and Human-AI Collaboration,An international task force was convened after...


In [18]:
# Write the current headline table to CSV; the row index is left out of the file.
df.to_csv("20250514-news-headlines.csv", index=False)

In [23]:
# Print five randomly drawn headlines as a quick quality check.
for headline in df.sample(5)["news_headline"]:
    print(headline)

The U.N. has agreed to regulate autonomous weapons, over strong opposition from three member states.
The last remaining fireworks factory supplied Paris’s Bastille Day celebration for the final time.
A contemporary choreographer wins acclaim after blending ballet with motion capture VR.
Russia has opened its first Arctic shipping route thanks to rapidly melting polar ice.
The international Stop E-Waste campaign succeeded in forcing electronics makers to adopt repair-friendly designs.


In [24]:
# NOTE(review): absolute, user-specific output path — it will not resolve on
# other machines. Despite its name, `sample_dir` is a file path, not a directory.
sample_dir = (
    "/home/ubuntu/avichal/Finetune-Recovery/data/topic-analogy/topics-v0.2.0.csv"
)
# Draw the subset with a fixed seed so re-running the notebook writes the same
# rows, and cap the size at the frame length so a short frame does not raise.
subset = df.sample(n=min(1000, len(df)), random_state=0)
subset.to_csv(sample_dir, index=False)