[JINA.AI](https://jina.ai/reader/)

In [10]:
import os
import json
import re
import subprocess
from openai import OpenAI
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
import firecrawl
from datetime import datetime
import requests

In [2]:

# Load environment variables (python-dotenv; typically from a local .env file).
load_dotenv()

# Initialize the OpenAI-compatible client. Requests are sent to OpenRouter
# (see base_url), so the key is read from OPEN_ROUTER_API rather than an
# OpenAI-specific variable.
openai_api_key = os.getenv("OPEN_ROUTER_API")
if not openai_api_key:
    # Name the exact variable so the failure is actionable.
    raise EnvironmentError(
        "OpenRouter API key not found: set the OPEN_ROUTER_API environment variable."
    )
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openai_api_key
)

# Initialize Firecrawl client
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
if not firecrawl_api_key:
    raise EnvironmentError(
        "Firecrawl API key not found: set the FIRECRAWL_API_KEY environment variable."
    )
firecrawl_app = FirecrawlApp(api_key=firecrawl_api_key)

In [32]:
def search_web(prompt):
    """
    Generate SERP queries for ``prompt`` with the LLM and collect result URLs.

    Each query returned by ``generate_serp_queries`` is POSTed to the Firecrawl
    search endpoint. A failure on one query (network error, HTTP error status,
    non-JSON body) is printed and skipped so the remaining queries still run.

    Args:
        prompt: The user question to search for.

    Returns:
        A list of unique result URLs (strings) in first-seen order. Empty when
        no query produced any result.
    """
    # Generate search queries dynamically
    queries = generate_serp_queries(prompt)
    url = "https://api.firecrawl.dev/v1/search"
    headers = {
        "Authorization": f"Bearer {firecrawl_api_key}",
        "Content-Type": "application/json"
    }
    # Dict keys de-duplicate while keeping a stable, first-seen order
    # (a plain set would return the URLs in arbitrary order).
    found_urls = {}

    for query in queries:
        payload = {
            "query": query,
            "limit": 2,
            "lang": "en",
            "country": "in",
            "timeout": 60000,  # presumably milliseconds, enforced server-side
            "scrapeOptions": {}
        }
        try:
            # Client-side timeout slightly above the server-side budget so a
            # stalled connection cannot hang the notebook forever.
            response = requests.post(url, json=payload, headers=headers, timeout=70)
            # Surface HTTP errors instead of silently treating them as "no results".
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Error fetching search results for query '{query}': {str(e)}")
            continue

        items = data.get("data", []) if isinstance(data, dict) else []
        if not isinstance(items, list):
            continue
        for item in items:
            # Skip malformed entries and entries without a usable URL.
            if isinstance(item, dict) and item.get("url"):
                found_urls[item["url"]] = None

    return list(found_urls)

def fetch_web_data(prompt):
    """
    Search the web for ``prompt`` and scrape the content of each result page.

    Args:
        prompt: The user question; passed to ``search_web`` to obtain URLs.

    Returns:
        A list of ``{"url": ..., "content": ...}`` dicts, one per page that was
        scraped successfully. Pages that fail to scrape are reported and skipped.
    """
    search_results = search_web(prompt)
    extracted_data = []

    for result in search_results:
        # search_web returns plain URL strings; calling .get() on them raised
        # AttributeError. Dict-shaped results are still accepted.
        url = result.get("url") if isinstance(result, dict) else result
        if not url:
            continue
        try:
            # Scrape through the configured client instance; the imported
            # `firecrawl` module itself is not a client.
            webpage_content = firecrawl_app.scrape_url(url)
        except Exception as e:
            # Deliberate best-effort: one bad page must not abort the others.
            print(f"Error fetching data from {url}: {str(e)}")
            continue
        extracted_data.append({"url": url, "content": webpage_content})

    return extracted_data
    
def generate_serp_queries(prompt):
    """
    Ask the LLM for a short list of optimized SERP queries for ``prompt``.

    The model is instructed to answer with ``{"queries": [...]}``. The reply is
    parsed leniently: a Markdown code fence or stray prose around the JSON
    object is ignored.

    Args:
        prompt: The user question to derive search queries from.

    Returns:
        A list of non-empty query strings. Empty if the reply could not be
        parsed or did not have the expected shape.
    """
    today_date = datetime.today().strftime('%Y-%m-%d')
    completion = client.chat.completions.create(
        model="google/gemini-2.0-pro-exp-02-05:free",
        messages=[
            {"role": "developer", "content": f"""
                Today's date is {today_date}. You are an advanced assistant with access to the internet.
                Given the following prompt from the user, generate a list of SERP queries to get relevant and up-to-date information.
                Ensure:
                - The queries are unique.
                - They optimize coverage across different aspects of the question.
                - The number of queries is minimized while maintaining maximum information diversity.
                - Return around 3-5 queries.
                Return the response in the following JSON format:
                ```json
                {{"queries": ["query 1", "query 2", "query 3"]}}
                ```
            """},
            {"role": "user", "content": prompt},
        ]
    )

    # content can be None (e.g. an empty completion); treat it as an empty reply.
    raw_response = (completion.choices[0].message.content or "").strip()

    # Keep only the outermost {...} span. This drops ```json fences, bare ```
    # fences and any prose the model adds before or after the JSON object.
    start = raw_response.find("{")
    end = raw_response.rfind("}")
    if start != -1 and end > start:
        raw_response = raw_response[start:end + 1]

    try:
        query_data = json.loads(raw_response)
    except json.JSONDecodeError:
        print("Error parsing SERP query JSON response.")
        return []

    # Valid JSON of the wrong shape (e.g. a bare list) previously crashed on .get().
    queries = query_data.get("queries", []) if isinstance(query_data, dict) else None
    if not isinstance(queries, list):
        print("Error parsing SERP query JSON response.")
        return []

    return [query for query in queries if isinstance(query, str) and query.strip()]


In [7]:
# Demo: ask the LLM for SERP queries for a simple question and display them
# (the bare name on the last line echoes the value as the cell output).
serp_queries = generate_serp_queries("What is the capital of France?")
serp_queries

['capital of France',
 'France capital city history',
 'largest cities in France']

In [33]:
# Demo: run the full query-generation + search step and display the URLs found
# (the bare name on the last line echoes the value as the cell output).
url_list = search_web("What is the capital of France?")
url_list

['https://www.investmentmonitor.ai/features/largest-cities-france-investment-population/',
 'https://en.wikipedia.org/wiki/List_of_communes_in_France_with_over_20,000_inhabitants',
 'https://en.wikipedia.org/wiki/Paris',
 'https://en.wikipedia.org/wiki/France',
 'https://www.statista.com/statistics/275353/largest-cities-in-france/']

In [None]:

# Code-execution agent: ask the model to answer either with Python code or with
# plain text (as JSON). If code is returned, run it in a subprocess and feed
# the output back to the model for a final natural-language answer.

prompt = (
    "I picked 44 apples yesterday. And then I picked 22 more. "
    "Today I picked twice the number of apples I picked yesterday but 5 of them were smaller than average. "
    "How many apples do I have in total?"
)

# Web-context augmentation is currently disabled, so the model sees the bare
# question. Previously `combined_prompt` was only defined in commented-out
# code, which made this cell fail with NameError. To re-enable web context,
# append the pages returned by fetch_web_data(prompt) to the prompt here.
combined_prompt = prompt

completion = client.chat.completions.create(
    model="google/gemini-2.0-pro-exp-02-05:free",
    messages=[
        {
            "role": "developer",
            "content": (
                "You are an assistant with access to a Python execution environment.\n"
                "Your response **must** always be in JSON format.\n"
                "- If the question can be solved with code, return:\n"
                "  ```json\n"
                "  {\"type\": \"code\", \"code\": \"<generated Python code>\"}\n"
                "  ```\n"
                "- If it does not require code, return:\n"
                "  ```json\n"
                "  {\"type\": \"text\", \"answer\": \"<natural language response>\"}\n"
                "  ```\n"
                "- If multiple questions with different types are asked, return\n"
                "  the non-code response also in the code as a print statement.\n"
                "Prefer to generate code whenever possible."
            ),
        },
        {
            "role": "user",
            "content": combined_prompt,
        }
    ]
)

raw_response = completion.choices[0].message.content.strip()

# Remove code block markers if present
if raw_response.startswith("```json") and raw_response.endswith("```"):
    raw_response = re.sub(r"^```json|\n```$", "", raw_response).strip()

try:
    response_data = json.loads(raw_response)

    if response_data.get("type") == "code":
        generated_code = response_data.get("code", "")
        print("Generated Code:\n", generated_code)

        # NOTE(security): this executes model-generated code with the
        # notebook user's privileges. The 5 s timeout bounds runtime only; it
        # is not a sandbox. Do not use with untrusted prompts or web content
        # unless the subprocess is isolated (container, restricted user, ...).
        try:
            result = subprocess.run(
                ["python3", "-c", generated_code], capture_output=True, text=True, timeout=5
            )
            # Prefer stdout; fall back to stderr so failures are shown to the model.
            execution_result = result.stdout.strip() if result.stdout else result.stderr.strip()
        except Exception as e:
            # Includes subprocess.TimeoutExpired and a missing interpreter.
            execution_result = f"Error during execution: {str(e)}"

        print("Execution Result:\n", execution_result)

        # Second round-trip: let the model turn the raw output into an answer.
        refinement_prompt = {
            "role": "user",
            "content": (
                f"You were asked a question:\n"
                f"{prompt}\n"
                f"You generated the following Python code to solve it:\n"
                f"```python\n"
                f"{generated_code}\n"
                f"```\n"
                f"The execution result was:\n"
                f"```\n"
                f"{execution_result}\n"
                f"```\n"
                f"Now, based on this, provide a final, well-informed answer to the user's original question."
            ),
        }

        refined_completion = client.chat.completions.create(
            model="google/gemini-2.0-pro-exp-02-05:free",
            messages=[refinement_prompt]
        )

        final_answer = refined_completion.choices[0].message.content.strip()
        print("\nFinal Answer:\n", final_answer)

    elif response_data.get("type") == "text":
        answer = response_data.get("answer", "")
        print("Answer:\n", answer)

    else:
        print("Unexpected response format:", response_data)

except json.JSONDecodeError as e:
    print("Error parsing JSON:", str(e))
    print("Raw response:", raw_response)
