In [12]:
!pip install beautifulsoup4 requests openai google-search-results



In [13]:
import json
import time
import os
from typing import List, Dict, Optional
from dataclasses import dataclass
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
from serpapi import GoogleSearch
import google.generativeai as genai

# Configure API keys (set these as environment variables)
SERPAPI_KEY = os.getenv("9bbc3a0bfb66807580039d323417a5c25e97146f5825a6adf2429c4005d4011f")
GEMINI_KEY = os.getenv("AIzaSyDqVQMoxKe7ztiAdZncf7yQ11u6tDAjd-w")

# Initialize Gemini
genai.configure(api_key=GEMINI_KEY)
model = genai.GenerativeModel('gemini-pro')

@dataclass
class SearchResult:
    url: str
    title: str
    snippet: str
    source: str

@dataclass
class ScrapedContent:
    url: str
    title: str
    text: str
    authors: List[str]
    publish_date: Optional[str]
    main_content: str

@dataclass
class ResearchReport:
    query: str
    summary: str
    sources: List[Dict]
    key_points: List[str]
    answer: str

class SerpAPISearchTool:
    """Real search using SerpAPI"""

    def search(self, query: str, num_results: int = 5) -> List[SearchResult]:
        params = {
            "q": query,
            "api_key": SERPAPI_KEY,
            "num": num_results
        }

        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            search_results = []
            if "organic_results" in results:
                for result in results["organic_results"]:
                    search_results.append(SearchResult(
                        url=result.get("link", ""),
                        title=result.get("title", ""),
                        snippet=result.get("snippet", ""),
                        source=result.get("source", "")
                    ))
            return search_results[:num_results]
        except Exception as e:
            print(f"Search error: {str(e)}")
            return []

class WebScraper:
    """Handles web page scraping with respect to robots.txt"""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def scrape(self, url: str) -> Optional[ScrapedContent]:
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            title = soup.title.string if soup.title else "No title found"
            paragraphs = soup.find_all('p')
            text = ' '.join([p.get_text() for p in paragraphs])

            authors = []
            publish_date = None
            for meta in soup.find_all("meta"):
                if "name" in meta.attrs:
                    if "author" in meta.attrs["name"].lower():
                        authors.append(meta.attrs.get("content", ""))
                    elif "date" in meta.attrs["name"].lower():
                        publish_date = meta.attrs.get("content", None)

            return ScrapedContent(
                url=url,
                title=title,
                text=text,
                authors=authors,
                publish_date=publish_date,
                main_content=text[:2000]
            )
        except RequestException as e:
            print(f"Error scraping {url}: {str(e)}")
            return None

class GeminiContentAnalyzer:
    """Uses Gemini for advanced content analysis and summarization"""

    def analyze_relevance(self, content: ScrapedContent, query: str) -> float:
        prompt = f"""
        Rate the relevance of this content to the query on a scale from 0 to 1.
        Query: {query}
        Content Title: {content.title}
        Content Snippet: {content.main_content[:500]}

        Return ONLY a number between 0 and 1 with 2 decimal places.
        """

        try:
            response = model.generate_content(prompt)
            return float(response.text.strip())
        except Exception as e:
            print(f"Gemini error: {str(e)}")
            return 0.0

    def summarize(self, contents: List[ScrapedContent], query: str) -> ResearchReport:
        sources_text = "\n\n".join([
            f"Source {i+1}:\nURL: {c.url}\nTitle: {c.title}\nContent: {c.main_content[:1000]}"
            for i, c in enumerate(contents)
        ])

        prompt = f"""
        You are a professional research assistant. Create a detailed report answering:
        Query: {query}

        Using these sources:
        {sources_text}

        Your report must include:
        1. A comprehensive summary answering the query
        2. 3-5 key points extracted from the sources
        3. A list of all sources used

        Return your response in JSON format with these keys:
        - "summary": The complete summary
        - "key_points": List of key points
        - "answer": Concise direct answer
        """

        try:
            response = model.generate_content(prompt)
            result = json.loads(response.text)
            return ResearchReport(
                query=query,
                summary=result["summary"],
                sources=[{"url": c.url, "title": c.title} for c in contents],
                key_points=result["key_points"],
                answer=result["answer"]
            )
        except Exception as e:
            print(f"Gemini summarization error: {str(e)}")
            return ResearchReport(
                query=query,
                summary="Error generating report",
                sources=[],
                key_points=[],
                answer="Error generating answer"
            )

class WebResearchAgent:
    """Main research agent with Gemini integration"""

    def __init__(self):
        self.search_tool = SerpAPISearchTool()
        self.scraper = WebScraper()
        self.analyzer = GeminiContentAnalyzer()
        self.visited_urls = set()

    def research(self, query: str, max_sources: int = 3) -> ResearchReport:
        print(f"Starting research for: {query}")

        # Perform web search
        search_results = self.search_tool.search(query)
        print(f"Found {len(search_results)} search results")

        # Scrape top results
        scraped_contents = []
        for result in search_results[:max_sources]:
            if result.url not in self.visited_urls:
                print(f"Scraping: {result.url}")
                content = self.scraper.scrape(result.url)
                if content:
                    scraped_contents.append(content)
                    self.visited_urls.add(result.url)

        # Analyze and filter content
        scored_contents = []
        for content in scraped_contents:
            score = self.analyzer.analyze_relevance(content, query)
            print(f"Relevance score for {content.url}: {score:.2f}")
            if score > 0.5:
                scored_contents.append((score, content))

        # Sort by relevance and generate report
        scored_contents.sort(reverse=True, key=lambda x: x[0])
        relevant_contents = [c for (s, c) in scored_contents]

        if not relevant_contents:
            return ResearchReport(
                query=query,
                summary="No relevant information found",
                sources=[],
                key_points=[],
                answer="No information found for this query"
            )

        return self.analyzer.summarize(relevant_contents, query)

In [14]:
from web_research_agent import WebResearchAgent

# Create an instance of the agent
agent = WebResearchAgent()

# Test with different queries
queries = [
    "What is the latest news about AI advancements in 2024?",
    "Explain quantum computing in simple terms",
    "Compare React and Vue.js for frontend development"
]

for query in queries:
    print("\n" + "="*80)
    print(f"Researching: {query}")
    report = agent.research(query)

    print("\nResearch Report:")
    print(f"Query: {report.query}")
    print("\nSummary:")
    print(report.summary)
    print("\nKey Points:")
    for point in report.key_points:
        print(f"- {point}")
    print("\nSources:")
    for source in report.sources:
        print(f"- {source['title']}: {source['url']}")
    print("\nDirect Answer:")
    print(report.answer)


Researching: What is the latest news about AI advancements in 2024?
Starting research for: What is the latest news about AI advancements in 2024?
Found 4 search results
Scraping: https://www.artificialintelligence-news.com/
Error scraping https://www.artificialintelligence-news.com/: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Scraping: https://blog.google/technology/ai/2024-ai-extraordinary-progress-advancement/
Scraping: https://iot-analytics.com/ai-2024-10-most-notable-stories/
Query appears news-oriented, checking news sources
OpenAI error: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is avail

In [15]:
from web_research_agent import WebResearchAgent
agent = WebResearchAgent()
report = agent.research("What are the latest advancements in AI?")
print(report.summary)

Starting research for: What are the latest advancements in AI?
Found 4 search results
Scraping: https://www.koombea.com/blog/7-recent-ai-developments/
Scraping: https://online-engineering.case.edu/blog/advancements-in-artificial-intelligence-and-machine-learning
Scraping: https://www.sciencedaily.com/news/computers_math/artificial_intelligence/
Query appears news-oriented, checking news sources
OpenAI error: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742

Relevance score for https://www.koombea.com/blog/7-recent-ai-developments/: 0.00
OpenAI error: 

You tried to access open

In [17]:
!pip install gradio

import gradio as gr
from web_research_agent import WebResearchAgent
import os

agent = WebResearchAgent()

def research(query):
    report = agent.research(query)
    return f"""
# {report.query}
## Summary
{report.summary}

## Key Points
- """ + "\n- ".join(report.key_points) + f"""

## Sources
- """ + "\n- ".join([f"{s['title']}: {s['url']}" for s in report.sources])

gr.Interface(
    fn=research,
    inputs=gr.Textbox(label="Enter your research query"),
    outputs=gr.Markdown(),
    title="Web Research Agent (Gemini)",
    description="AI research assistant using SerpAPI and Google Gemini"
).launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e71ba4a44ad28af42a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


