In [1]:
import time
import re
import urllib.parse
import requests
from bs4 import BeautifulSoup
from ddgs import DDGS
import numpy as np
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm
  if not hasattr(np, "object"):





In [2]:
# Config: tunable constants read by the functions and agent class in this notebook.
SEARCH_RESULTS = 6  # default max_results for search_web()
PASSAGES_PER_PAGE = 4  # leading chunks kept per fetched page in ShortResearchAgent.run()
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # default model given to SentenceTransformer
TOP_PASSAGES = 5  # number of best-scoring passages returned by ShortResearchAgent.run()
SUMMARY_SENTENCES = 3  # upper bound on sentences in the extractive summary
TIMEOUT = 8  # default timeout (seconds) passed to requests.get() in fetch_text()

In [3]:
def unwrap_ddg(url):
    """Return the target of a duckduckgo.com redirect link, or ``url`` unchanged.

    A link is treated as a redirect when its host is ``duckduckgo.com`` (or a
    subdomain of it) and its query string carries a ``uddg`` parameter; the
    first ``uddg`` value is returned as the real destination.

    Args:
        url: The link to inspect.

    Returns:
        The decoded destination URL for redirect links; otherwise ``url`` as given.
    """
    try:
        parsed = urllib.parse.urlparse(url)
        # Compare the actual hostname instead of substring-matching netloc, so
        # hosts such as "notduckduckgo.com.evil.test" are not unwrapped.
        host = (parsed.hostname or "").lower()
        if host == "duckduckgo.com" or host.endswith(".duckduckgo.com"):
            targets = urllib.parse.parse_qs(parsed.query).get("uddg")
            if targets:
                # parse_qs has already percent-decoded the value; decoding it a
                # second time would corrupt escapes inside the target URL
                # (e.g. "%2520" -> "%20" -> " ").
                return targets[0]
    except (ValueError, TypeError, AttributeError):
        # Best-effort: malformed or non-string input falls through unchanged.
        pass
    return url

In [4]:
def search_web(query, max_results=SEARCH_RESULTS):
    """Run a text search through the ``DDGS`` client and return unique result URLs.

    Each hit's link is read from its ``"href"`` key (falling back to ``"url"``),
    passed through ``unwrap_ddg``, and kept only the first time it appears.

    Args:
        query: Search string handed to ``DDGS.text``.
        max_results: Forwarded to ``DDGS.text`` as ``max_results``.

    Returns:
        A list of URLs in the order they were first seen.
    """
    # A dict keeps insertion order, so it doubles as an ordered "seen" set.
    ordered_links = {}
    with DDGS() as client:
        for hit in client.text(query, max_results=max_results):
            link = hit.get("href") or hit.get("url")
            if link:
                ordered_links.setdefault(unwrap_ddg(link), None)
    return list(ordered_links)

In [5]:
def fetch_text(url, timeout=TIMEOUT):
    """Download ``url`` and return its readable text, or ``""`` if nothing usable is found.

    Only HTTP 200 responses whose Content-Type mentions ``html`` are parsed.
    Text is taken, in order of preference, from the page's ``<p>`` elements
    (whitespace-collapsed), its description meta tag, or its ``<title>``.

    Args:
        url: Page address to fetch.
        timeout: Passed to ``requests.get`` as ``timeout``.

    Returns:
        The extracted text, or an empty string on any failure (network error,
        non-200 status, non-HTML content, parse error, or no text found).
    """
    headers = {"User-Agent": "Mozilla/5.0 (research-agent)"}
    try:
        r = requests.get(url, timeout=timeout, headers=headers, allow_redirects=True)
        if r.status_code != 200:
            return ""
        ct = r.headers.get("content-type", "").lower()
        if "html" not in ct:
            return ""
        # When the header declares no charset, requests decodes text/* bodies as
        # ISO-8859-1, which turns UTF-8 pages into mojibake. Parse the raw bytes
        # instead and let BeautifulSoup detect the encoding (meta tags, BOM),
        # trusting the HTTP header only when it explicitly names a charset.
        declared = r.encoding if "charset" in ct else None
        soup = BeautifulSoup(r.content, "html.parser", from_encoding=declared)
        # Drop non-content elements so their text cannot leak into paragraphs.
        for tag in soup(["script", "style", "noscript", "header", "footer", "svg", "iframe", "nav", "aside"]):
            tag.extract()
        paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
        text = " ".join(p for p in paragraphs if p)
        if text.strip():
            return re.sub(r"\s+", " ", text).strip()
        # Fallbacks for pages without paragraph text.
        meta = soup.find("meta", attrs={"name": "description"}) or soup.find("meta", attrs={"property": "og:description"})
        if meta and meta.get("content"):
            return meta["content"].strip()
        if soup.title and soup.title.string:
            return soup.title.string.strip()
    except Exception:
        # Deliberately best-effort: one bad page must not abort the whole run.
        return ""
    return ""

In [6]:
def chunk_passages(text, max_words=120):
    """Split ``text`` into consecutive passages of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields, and each passage is re-joined
    with single spaces. Empty or whitespace-only text produces an empty list.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_words)
    return [" ".join(tokens[begin:begin + max_words]) for begin in starts]

def split_sentences(text):
    """Break ``text`` into sentences at whitespace that follows ``.``, ``!`` or ``?``.

    Returns the pieces stripped of surrounding whitespace, omitting empty ones.
    """
    sentences = []
    for fragment in re.split(r'(?<=[.!?])\s+', text):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

In [7]:
# --- Agent Class ---

class ShortResearchAgent:
    """Answer a query by searching the web, ranking passages, and summarizing extractively.

    ``run`` chains ``search_web`` -> ``fetch_text`` -> ``chunk_passages``, scores
    every passage against the query with the embedding model, and then builds a
    short summary from the best-matching sentences of the top passages.
    """

    def __init__(self, embed_model=EMBEDDING_MODEL):
        """Load the sentence-embedding model used for ranking.

        Args:
            embed_model: Model identifier handed to ``SentenceTransformer``.
        """
        print(f"Loading embedder: {embed_model} ...")
        self.embedder = SentenceTransformer(embed_model)

    def _embed(self, texts):
        """Encode a string or list of strings as normalized NumPy embeddings."""
        return self.embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)

    def _collect_passages(self, urls):
        """Fetch each URL and return its leading chunks as ``{"url", "passage"}`` dicts."""
        docs = []
        for u in urls:
            txt = fetch_text(u)
            if not txt:
                continue
            # Only the first PASSAGES_PER_PAGE chunks of a page are considered.
            for c in chunk_passages(txt)[:PASSAGES_PER_PAGE]:
                docs.append({"url": u, "passage": c})
        return docs

    def _summarize(self, top_passages, q_emb):
        """Build the extractive summary string from the top-ranked passages.

        Sentences with fewer than six words are ignored. The remaining ones are
        ranked by similarity to the query and the best SUMMARY_SENTENCES distinct
        sentences are joined, each followed by its source URL.
        """
        sentences = []
        for tp in top_passages:
            for s in split_sentences(tp["passage"]):
                if len(s.split()) < 6:
                    continue
                sentences.append({"sent": s, "url": tp["url"]})

        if not sentences:
            return "No summary could be generated."

        sent_sims = np.dot(self._embed([s["sent"] for s in sentences]), q_emb)

        # Walk the full ranking and de-duplicate while selecting, so a duplicate
        # is replaced by the next-best sentence instead of shrinking the summary.
        seen = set()
        lines = []
        for i in np.argsort(sent_sims)[::-1]:
            if len(lines) >= SUMMARY_SENTENCES:
                break
            s = sentences[i]
            # Sentences sharing their first 80 characters (case-insensitive)
            # are treated as duplicates.
            key = s["sent"].lower()[:80]
            if key in seen:
                continue
            seen.add(key)
            lines.append(f"{s['sent']} (Source: {s['url']})")
        return " ".join(lines)

    def run(self, query):
        """Execute the full search -> fetch -> rank -> summarize pipeline.

        Args:
            query: The question to research.

        Returns:
            A dict with keys ``"query"``, ``"passages"`` (list of dicts with
            ``"url"``, ``"passage"`` and ``"score"``, best first), ``"summary"``
            (string) and ``"time"`` (elapsed seconds, rounded to 2 decimals).
        """
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments during the run.
        start = time.perf_counter()

        # 1-2. Search, fetch & chunk
        docs = self._collect_passages(search_web(query))
        if not docs:
            return {
                "query": query,
                "passages": [],
                "summary": "No relevant documents found.",
                "time": round(time.perf_counter() - start, 2),
            }

        # 3. Embed & rank: embeddings are normalized, so the dot product is
        # the similarity score used for ordering.
        emb_texts = self._embed([d["passage"] for d in docs])
        q_emb = self._embed(query)
        sims = np.dot(emb_texts, q_emb)
        top_idx = np.argsort(sims)[::-1][:TOP_PASSAGES]
        top_passages = [
            {"url": docs[i]["url"], "passage": docs[i]["passage"], "score": float(sims[i])}
            for i in top_idx
        ]

        # 4. Summarize extractively
        summary = self._summarize(top_passages, q_emb)

        return {
            "query": query,
            "passages": top_passages,
            "summary": summary,
            "time": round(time.perf_counter() - start, 2),
        }

In [8]:
# --- Run Example ---

if __name__ == "__main__":
    # Demo run: build the agent (which loads the embedding model), answer one
    # sample query, then print the ranked passages and the extractive summary.
    agent = ShortResearchAgent()
    query = "What causes urban heat islands and how can cities reduce them?"
    print(f"Running query: {query}\n")
    out = agent.run(query)

    print("\nTop passages:")
    for p in out["passages"]:
        # Only the first 200 characters of each passage are shown; the "..."
        # is printed unconditionally, even for passages shorter than that.
        print(f"- score {p['score']:.3f} src {p['url']}\n  {p['passage'][:200]}...\n")

    print("--- Extractive summary ---")
    print(out["summary"])
    print("--------------------------")
    # out["time"] is the elapsed time in seconds measured inside run().
    print(f"\nDone in {out['time']:.1f}s")

Loading embedder: sentence-transformers/all-MiniLM-L6-v2 ...
Running query: What causes urban heat islands and how can cities reduce them?


Top passages:
- score 0.875 src https://evytor.vercel.app/blogs/urban-heat-islands-why-cities-feel-hotter
  phenomenon has far-reaching implications, impacting everything from energy consumption and air quality to human health and ecosystem balance. Let's explore the science, impact, and solutions to this h...

- score 0.823 src https://www.siradel.com/urban-heat-island-effect-causes-and-solutions/
  way for cooler, greener and more resilient urban environments. Summary Understanding Urban Heat Islands What strategies can be used to mitigate urban heat islands effect? How to assess a territory and...

- score 0.822 src https://evytor.vercel.app/blogs/urban-heat-islands-why-cities-feel-hotter
  this: trees are nature's air conditioners! Cars, air conditioners, factories â€“ all these release heat into the environment. This "waste heat" contributes 