In [None]:
pip install google-genai



In [None]:
# To run this code, first install the google-genai package (pip install google-genai).


import os
import json
import time
from google import genai
from google.genai import types
from google.colab import userdata

# Path of the JSONL file that generated examples are appended to (one JSON object per line).
OUTPUT_FILE = "synthetic_news_dataset.jsonl"
# Target total number of lines in OUTPUT_FILE; lines already present count toward it on resume.
NUM_EXAMPLES = 300  # Change this to generate more examples
RETRY_LIMIT = 3     # Maximum number of generation attempts per example before it is skipped
# Seconds to sleep after each failed generation attempt before trying again.
SLEEP_BETWEEN = 5
# Prompt text passed unchanged to the model for every request: an instruction header,
# one fully worked example record (article, entities, gold_output), and a closing
# instruction asking for a single new example. The text is runtime data; edit with care.
PROMPT = """
You are an expert analyst and dataset generator. Your task is to create synthetic historical news data for AI training.
Generate similar example below with diffrent domain using real world articles and exact same deep analysis and follow the exact jsonl srtucture  :

{
"article_id": "A6001",
"published_at": "2025-10-08T00:00:00Z",
"article_text": "Japanese investing conglomerate SoftBank Group is buying a robotics company as the financial behemoth says physical AI is its next frontier. SoftBank announced on Wednesday that it has acquired Zurich, Switzerland-based ABB Group’s robotics business unit for $5.375 billion. The deal is subject to regulatory approval; SoftBank predicts the deal will close in mid-to-late 2026 according to a press release. Sami Atiya, the head of the division, will exit the company once the acquisition is complete, per ABB Group. ABB’s robotics business employs roughly 7,000 people and sells a variety of robots and equipment, designed for tasks like picking, cleaning, and painting. The company made $2.3 billion in revenue in 2024, representing 7% of ABB’s overall revenue. ABB announced plans to spin out its robotics group in April. SoftBank said it hopes to be able to “reignite” sales at ABB’s robotics spinoff. The organization’s 2024 revenue was only $2.3 billion, down from $2.5 billion the previous year. SoftBank has been building up its investments and holdings in robotics over the last few years. The entity has invested in companies including more legacy players like AutoStore and startups including Skild AI and Agile Robots. The firm also launched its own robotics platform, SoftBank Robotics Group, in 2014. “SoftBank’s next frontier is physical AI,” Masayoshi Son, the chairman and CEO of SoftBank said in the company’s press release. “Together with ABB Robotics, we will unite world-class technology and talent under our shared vision to fuse Artificial Super Intelligence and robotics — driving a groundbreaking evolution that will propel humanity forward.” Robotics is one of four focus areas for SoftBank, together with AI chips, AI data centers, and energy. 
“The center of the ‘Information Revolution’ has evolved from personal computers, the internet, and broadband to smartphones, and has now entered a new phase led by artificial intelligence,” SoftBank said in its press release. “In this context, [SoftBank Group] has declared its mission to realize artificial super intelligence (ASI) for the advancement of humanity.”",
"entities": [
{"entity_id": "B-ORG-1", "name": "SoftBank Group", "type": "company", "profile_text": "Japanese multinational holding company known for investments in tech and AI. Led by Masayoshi Son, focused on ASI and physical AI.", "last_updated": "2025-10-08"},
{"entity_id": "B-ORG-2", "name": "ABB Group", "type": "company", "profile_text": "Swiss-Swedish multinational corporation in robotics, power, heavy electrical equipment, and automation technology.", "last_updated": "2025-10-08"},
{"entity_id": "B-PER-1", "name": "Masayoshi Son", "type": "person", "profile_text": "Chairman and CEO of SoftBank Group, visionary investor in AI and robotics.", "last_updated": "2025-10-08"},
{"entity_id": "B-PER-2", "name": "Sami Atiya", "type": "person", "profile_text": "Former head of ABB's robotics division, exiting post-acquisition.", "last_updated": "2025-10-08"},
{"entity_id": "B-LOC-1", "name": "Zurich", "type": "place", "profile_text": "Swiss city, headquarters of ABB Group.", "last_updated": "2025-10-08"}
],
"gold_output": {
"concise_summary": "SoftBank Group's $5.375B acquisition of ABB's robotics unit accelerates its physical AI strategy, aiming to 'reignite' $2.3B revenue stream amid declining sales, with closure expected mid-2026.",
"expanded_analysis": "This deal marks SoftBank's bold pivot to physical AI, fusing ABB's industrial robotics expertise (7,000 employees, $2.3B 2024 revenue down 8% YoY) with its ASI vision. By acquiring a mature player like ABB Robotics—focused on picking, cleaning, and painting tasks—SoftBank addresses its robotics portfolio gaps, complementing investments in startups like Skild AI ($500M) and Agile Robots. The $5.375B price (premium over ABB's $3.4B valuation) reflects synergies in AI chips/data centers, potentially yielding 20-30% efficiency gains via ASI integration. Risks include regulatory delays (e.g., EU antitrust scrutiny on tech consolidation) and talent exodus post-Sami Atiya's departure, but SoftBank's track record (e.g., Arm Holdings) suggests strong execution, targeting the $100B+ robotics market growing at 15% CAGR.",
"impact_forecast": [
{"impact_type": "PORTFOLIO_SYNERGY", "horizon": "SHORT_TERM", "magnitude": "HIGH", "probability": 0.85, "rationale": "ABB's industrial robots integrate with SoftBank's AI investments (e.g., Skild AI), boosting deployment in manufacturing by 25%, reversing ABB's revenue decline through 'reignition' strategies."},
{"impact_type": "MARKET_CONSOLIDATION", "horizon": "MEDIUM_TERM", "magnitude": "MEDIUM", "probability": 0.7, "rationale": "Accelerates robotics M&A wave, pressuring competitors like Boston Dynamics, but invites regulatory hurdles (e.g., 6-12 month EU review), potentially delaying $1B+ synergies."},
{"impact_type": "INNOVATION_ACCELERATION", "horizon": "LONG_TERM", "magnitude": "HIGH", "probability": 0.75, "rationale": "Fuses ASI with robotics for 'humanity advancement' (per Masayoshi Son), enabling breakthroughs in autonomous systems, capturing 10-15% of $210B global robotics market by 2030."}
],
"scenarios": [
{"probability": 0.65, "outcome": "Synergistic Breakthrough: Regulatory approval by mid-2026 enables rapid ASI-robotics fusion, driving 40% revenue growth and positioning SoftBank as physical AI leader, with $2B+ annual synergies."},
{"probability": 0.25, "outcome": "Regulatory Delay: Antitrust scrutiny extends closure to 2027, causing $500M opportunity costs and talent flight (20% of 7,000 employees), stalling innovation momentum."},
{"probability": 0.1, "outcome": "Integration Failure: Cultural clashes post-acquisition lead to 15% revenue drop in first year, forcing divestiture and eroding SoftBank's ASI credibility in robotics."}
],
"confidence": 0.88,
"evidence_refs": ["B-ORG-1", "B-ORG-2", "B-PER-1", "B-PER-2"]
}
}

Generate **one full synthetic historical news example**.
"""

def generate_single_example(client, model, prompt_text):
    """Request one example from the model and parse the reply as JSON.

    The prompt is sent as a single user turn and the response is consumed as
    a stream. The concatenated text is parsed directly; if that fails, the
    span from the first ``{`` to the last ``}`` is parsed instead, which
    tolerates surrounding prose or Markdown code fences.

    Args:
        client: Object exposing ``models.generate_content_stream(...)``.
        model: Model name forwarded to the streaming call.
        prompt_text: Prompt sent as the user message.

    Returns:
        The decoded JSON value, or ``None`` when no parseable JSON is found
        (a warning is printed in that case).
    """
    contents = [
        types.Content(
            role="user",
            parts=[types.Part.from_text(text=prompt_text)],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=-1),
    )

    # Collect pieces and join once. A streamed chunk may carry no text
    # (its ``text`` is then None), which must not be concatenated.
    pieces = []
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        piece = chunk.text
        if piece:
            pieces.append(piece)
    result_text = "".join(pieces)

    # Try direct parse
    try:
        return json.loads(result_text)
    except json.JSONDecodeError:
        pass

    # Fall back to the outermost brace-delimited substring. Both delimiters
    # must exist and be correctly ordered, otherwise there is nothing to try.
    start = result_text.find("{")
    end = result_text.rfind("}")
    if start != -1 and end > start:
        try:
            return json.loads(result_text[start:end + 1])
        except json.JSONDecodeError:
            pass

    print("⚠️ JSON parse failed.")
    return None


def generate_dataset():
    """Generate examples until OUTPUT_FILE holds NUM_EXAMPLES records.

    Reads the API key from Colab ``userdata``, creates the client, and
    appends one JSON object per line to OUTPUT_FILE. If the file already
    exists, its non-blank lines are counted and generation resumes from
    that index. Each example gets up to RETRY_LIMIT attempts, with a
    SLEEP_BETWEEN-second pause after every failed attempt; an example that
    exhausts its attempts is skipped and reported in the final summary.
    """
    api_key = userdata.get("GEMINI_API_KEY")
    client = genai.Client(api_key=api_key)
    model = "gemini-2.5-pro"

    # Resume mode — load existing file if present
    generated = 0
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            # Blank lines are not samples; counting them would overstate progress.
            generated = sum(1 for line in f if line.strip())
        print(f"✅ Found {generated} existing samples, resuming from there.")

    skipped = 0
    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
        for i in range(generated, NUM_EXAMPLES):
            print(f"\n📰 Generating example {i+1}/{NUM_EXAMPLES}...")

            for attempt in range(1, RETRY_LIMIT + 1):
                try:
                    example = generate_single_example(client, model, PROMPT)
                    if example:
                        f.write(json.dumps(example, ensure_ascii=False) + "\n")
                        f.flush()  # Save immediately
                        print(f"✅ Example {i+1} saved.")
                        break
                    else:
                        raise ValueError("Invalid output format.")
                except Exception as e:
                    # Best-effort boundary: any failure (API error, bad output)
                    # is reported and retried rather than aborting the run.
                    print(f"⚠️ Attempt {attempt} failed: {e}")
                    if attempt == RETRY_LIMIT:
                        print("❌ Skipping this example after max retries.")
                        skipped += 1
                # Reached only after a failed attempt (success breaks out above).
                time.sleep(SLEEP_BETWEEN)

    if skipped:
        # Skipped examples leave the file short; a re-run resumes from the
        # current line count and fills the remainder.
        print(f"⚠️ {skipped} example(s) were skipped; re-run to resume and fill the gap.")
    print(f"\n🎉 Dataset generation complete. Saved to {OUTPUT_FILE}")

# Start generation only when this file is run directly, not when it is imported.
if __name__ == "__main__":
    generate_dataset()

✅ Found 1 existing samples, resuming from there.

📰 Generating example 2/300...
✅ Example 2 saved.

📰 Generating example 3/300...
✅ Example 3 saved.

📰 Generating example 4/300...
✅ Example 4 saved.

📰 Generating example 5/300...
✅ Example 5 saved.

📰 Generating example 6/300...
✅ Example 6 saved.

📰 Generating example 7/300...
✅ Example 7 saved.

📰 Generating example 8/300...
✅ Example 8 saved.

📰 Generating example 9/300...
✅ Example 9 saved.

📰 Generating example 10/300...
✅ Example 10 saved.

📰 Generating example 11/300...
⚠️ JSON parse failed.
⚠️ Attempt 1 failed: Invalid output format.
✅ Example 11 saved.

📰 Generating example 12/300...
✅ Example 12 saved.

📰 Generating example 13/300...
✅ Example 13 saved.

📰 Generating example 14/300...
✅ Example 14 saved.

📰 Generating example 15/300...
✅ Example 15 saved.

📰 Generating example 16/300...
✅ Example 16 saved.

📰 Generating example 17/300...
✅ Example 17 saved.

📰 Generating example 18/300...
✅ Example 18 saved.

📰 Generating exa