In [None]:
import os
import json
import re
import subprocess
from openai import OpenAI
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
import firecrawl
from datetime import datetime
import requests
import requests
from bs4 import BeautifulSoup

In [None]:

# Load environment variables
load_dotenv()

# Initialize OpenAI client
openai_api_key = os.getenv("OPEN_ROUTER_API")
if not openai_api_key:
    raise EnvironmentError("OpenAI API key not found in environment variables.")
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openai_api_key
)

# Initialize Firecrawl client
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
if not firecrawl_api_key:
    raise EnvironmentError("Firecrawl API key not found in environment variables.")
firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)

# jina_api_key = os.getenv("JINA_API_KEY")
# if not jina_api_key:
#     raise EnvironmentError("Jina API key not found in environment variables.")

In [57]:
def fetch_clean_content(url, word_limit=100):
    try:
        # Fetch the webpage
        response = requests.get(url, timeout=5, headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Special case: Wikipedia articles (they have structured content)
        if "wikipedia.org" in url:
            content_div = soup.find("div", {"id": "mw-content-text"})
            if not content_div:
                return "Could not extract main content."
            paragraphs = content_div.find_all("p")
        
        else:
            # General websites: Look for meaningful content
            article = soup.find("article")  # Prioritize <article> tag if present
            if article:
                paragraphs = article.find_all("p")
            else:
                # Fallback: Get the largest text-containing <div>
                divs = soup.find_all("div")
                largest_div = max(divs, key=lambda d: len(d.get_text(strip=True)), default=None)
                paragraphs = largest_div.find_all("p") if largest_div else []

        # Extract text content
        text = " ".join(p.get_text(strip=True) for p in paragraphs)

        # Get the first N words
        words = text.split()[:word_limit]
        snippet = " ".join(words)

        return snippet if snippet else "Could not extract useful content."

    except requests.RequestException as e:
        return f"Error fetching URL: {str(e)}"

In [None]:
def search_web(prompt):
    """
    Generates optimized SERP queries using the LLM and fetches search results.
    """
    # Generate search queries dynamically
    queries = generate_serp_queries(prompt)
    url = "https://api.firecrawl.dev/v1/search"
    headers = {
        "Authorization": f"Bearer {firecrawl_api_key}",
        "Content-Type": "application/json"
    }
    search_results = []

    for query in queries:
        try:
            payload = {
                "query": query,
                "limit": 2,
                "lang": "en",
                "country": "in",
                "timeout": 60000,
                "scrapeOptions": {}
            }
            response = requests.request("POST", url, json=payload, headers=headers)
            data = response.json()
            search_results.extend([item.get('url') for item in data.get('data', []) if 'url' in item])
        except Exception as e:
            print(f"Error fetching search results for query '{query}': {str(e)}")
    search_results = list(set(search_results))
    return search_results

def fetch_web_data(prompt):
    """
    Retrieves search results, extracts URLs, and fetches web page content.
    """
    search_results = search_web(prompt)
    extracted_data = []
    for url in search_results:
        content = fetch_clean_content(url)
        extracted_data.append({"url": url, "content": content})

    return extracted_data
    
def generate_serp_queries(prompt):
    """
    Uses the LLM to generate a list of optimized SERP queries based on the user prompt.
    """
    today_date = datetime.today().strftime('%Y-%m-%d')
    completion = client.chat.completions.create(
        model="google/gemini-2.0-pro-exp-02-05:free",
        messages=[
            {"role": "developer", "content": f"""
                Today's date is {today_date}. You are an advanced assistant with access to the internet.
                Given the following prompt from the user, generate a list of SERP queries to get relevant and up-to-date information.
                If multiple questions are asked, ensure you cover all the questions in the queries.
                Ensure:
                - The queries are unique.
                - They optimize coverage across different aspects of the question.
                - The number of queries is minimized while maintaining maximum information diversity.
                - Return around 3-5 queries.
                Return the response in the following JSON format:
                ```json
                {{"queries": ["query 1", "query 2", "query 3"]}}
                ```
            """},
            {"role": "user", "content": prompt},
        ]
    )

    raw_response = completion.choices[0].message.content.strip()

    # Extract and parse JSON response
    if raw_response.startswith("```json") and raw_response.endswith("```"):
        raw_response = re.sub(r"^```json|\n```$", "", raw_response).strip()

    try:
        query_data = json.loads(raw_response)
        return query_data.get("queries", [])
    except json.JSONDecodeError:
        print("Error parsing SERP query JSON response.")
        return []


In [None]:
def combine_prompt(prompt):
    """
    Combines the user prompt with the generated SERP queries.
    """
    grounding = fetch_web_data(prompt)
    # grounding = out
    readable_string = ""
    for item in grounding:
        readable_string += f"URL: {item['url']}\nContent: {item['content']}\n\n"

    return f"{prompt}\n\nHere are some search queries to get more information:\n" +readable_string

# combine_prompt("Who rules England?")

'Who rules England?\n\nHere are some search queries to get more information:\nURL: https://www.gov.uk/government/ministers/prime-minister\nContent: The Prime Minister is the leader of His Majesty’s Government and is ultimately responsible for the policy and decisions of the government. As leader of the UK government the Prime Minister also: Sir Keir Starmer became Prime Minister on 5 July 2024. Keir attended Reigate Grammar School, before studying Law at the University of Leeds. He went on to do postgraduate studies at the University of Oxford, receiving a Bachelor of Civil Law (BCL) degree. Keir was elected a Member of Parliament for Holborn and St Pancras in May 2015. He was elected leader of the Labour Party in April\n\nURL: https://www.royal.uk/the-king\nContent: Error fetching URL: 403 Client Error: Forbidden for url: https://www.royal.uk/the-king\n\nURL: https://en.wikipedia.org/wiki/Monarchy_of_the_United_Kingdom\nContent: Themonarchy of the United Kingdom, commonly referred to 

In [94]:

prompt ="""What is the force if I apply a pressure of 5 pascals on a surface area of 10m^2?
When did Jeff Baena die?
"""

combined_prompt = combine_prompt(prompt)

completion = client.chat.completions.create(
    model="google/gemini-2.0-pro-exp-02-05:free",
    messages=[
        {
            "role": "developer",
            "content": (
                "You are an assistant with access to a Python execution environment.\n"
                "You also have additional realtime information from the internet.\n"
                "Assume the grounding information is correct without scrutiny.\n"
                "Given a user question, you need to provide a well-informed answer.\n"
                "Your response **must** always be in JSON format.\n"
                "- If the question can be solved with code, return:\n"
                "  ```json\n"
                "  {\"type\": \"code\", \"code\": \"<generated Python code>\"}\n"
                "  ```\n"
                "- If it does not require code, return:\n"
                "  ```json\n"
                "  {\"type\": \"text\", \"answer\": \"<natural language response>\"}\n"
                "  ```\n"
                "- If multiple questions with different types are asked, return\n"
                "  the non-code response also in the code as a print statement.\n"
                "Prefer to generate code whenever possible."
            ),
        },
        {
            "role": "user",
            "content": combined_prompt,
        }
    ]
)

raw_response = completion.choices[0].message.content.strip()

# Remove code block markers if present
if raw_response.startswith("```json") and raw_response.endswith("```"):
    raw_response = re.sub(r"^```json|\n```$", "", raw_response).strip()

try:
    response_data = json.loads(raw_response)

    if response_data.get("type") == "code":
        generated_code = response_data.get("code", "")
        # print("Generated Code:\n", generated_code)

        try:
            result = subprocess.run(
                ["python", "-c", generated_code], capture_output=True, text=True, timeout=5
            )
            execution_result = result.stdout.strip() if result.stdout else result.stderr.strip()
        except Exception as e:
            execution_result = f"Error during execution: {str(e)}"

        # print("Execution Result:\n", execution_result)

        refinement_prompt = {
            "role": "user",
            "content": (
                f"You were asked a question and"
                f"You were given a list of sources to ground your answer.\n"
                f"{combined_prompt}\n"
                f"You generated the following Python code to solve it:\n"
                f"```python\n"
                f"{generated_code}\n"
                f"```\n"
                f"The execution result was:\n"
                f"```\n"
                f"{execution_result}\n"
                f"```\n"
                f"Now, based on this, provide a final, well-informed answer to the user's original question. Be as detailed as possible.\n"
                f"Also share the urls you used to arrive at this answer. Assume the grounding information is correct without scrutiny."
            ),
        }

        refined_completion = client.chat.completions.create(
            model="google/gemini-2.0-pro-exp-02-05:free",
            messages=[refinement_prompt]
        )

        final_answer = refined_completion.choices[0].message.content.strip()
        print("\nFinal Answer:\n", final_answer)

    elif response_data.get("type") == "text":
        answer = response_data.get("answer", "")
        print("Answer:\n", answer)

    else:
        print("Unexpected response format:", response_data)

except json.JSONDecodeError as e:
    print("Error parsing JSON:", str(e))
    print("Raw response:", raw_response)



Final Answer:
 Here's a breakdown of the answers and the supporting sources:

**Question 1: What is the force if I apply a pressure of 5 pascals on a surface area of 10m^2?**

*   **Answer:** The force is 50 Newtons.
*   **Explanation:** The formula to calculate force from pressure and area is: Force (F) = Pressure (P) * Area (A).
    *   You provided a pressure of 5 Pascals and an area of 10 square meters.
    *   Therefore, Force = 5 Pascals * 10 m^2 = 50 Newtons.
*   **Supporting Source URLs:**
    *   [https://www.calculatorsoup.com/calculators/physics/pressure.php](https://www.calculatorsoup.com/calculators/physics/pressure.php): This source provides the formula P = F/A, which confirms that force is calculated by multiplying pressure and area.
    *   [https://www.sensorsone.com/pressure-and-area-to-force-calculator/](https://www.sensorsone.com/pressure-and-area-to-force-calculator/): This confirms the formula used.

**Question 2: When did Jeff Baena die?**

*   **Answer:** Jeff 