In [147]:
# Cell 2: Import required libraries
from typing import TypedDict, Annotated, List
from langgraph.graph import StateGraph, END
from trafilatura import fetch_url, extract
from langchain_openai import ChatOpenAI
from duckduckgo_search import DDGS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import httpx
import json
import os
import re
from groq import Groq

# Make the Groq API key available to the client via the environment.
# The key is read from the existing environment when present and otherwise
# requested interactively, so it is never stored in the notebook itself.
# NOTE(review): a literal key used to be committed on this line; treat it as
# compromised and revoke/rotate it in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

In [166]:
# Cell 3: Enhanced Content Extraction
from bs4 import BeautifulSoup

def fetch_web_content(
    url: str,
    max_chars: int = 15000,
    min_line_length: int = 40,
    timeout: float = 10,
) -> str:
    """Download a web page and return its main readable text.

    The page is requested with a browser-like User-Agent, parsed with
    BeautifulSoup, and reduced to the text of the first <article>/<main>
    element, then the first <div> whose class contains "content", then the
    whole <body>. Short lines are dropped and the result is truncated.

    Args:
        url: Address of the page to download. An empty value returns "".
        max_chars: Maximum number of characters returned.
        min_line_length: Lines with this many characters or fewer are discarded.
        timeout: Request timeout in seconds, passed to httpx.

    Returns:
        The cleaned page text, or "" when the page cannot be fetched or parsed
        (the error is printed, not raised, so callers can fall back).
    """
    if not url:
        return ""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # httpx does not follow redirects unless asked to; without this a
        # 301/302 response yields an empty body instead of the real page.
        response = httpx.get(url, headers=headers, timeout=timeout, follow_redirects=True)
        # Treat 4xx/5xx as failures rather than parsing the error page as content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Target common content containers. find() only matches tag names, so
        # the CSS attribute selector has to go through select_one().
        content = soup.find(['article', 'main'])
        if content is None:
            content = soup.select_one('div[class*="content"]')
        if content is None:
            content = soup.body
        if content is None:
            # Document has no <body> (e.g. non-HTML payload): nothing to extract.
            return ""

        # Extract and clean text
        text = content.get_text('\n', strip=True)
        kept_lines = [line for line in text.split('\n') if len(line) > min_line_length]
        return '\n'.join(kept_lines)[:max_chars]
    except Exception as e:
        # Deliberate best-effort: any failure yields "" so the caller can fall back.
        print(f"Error fetching content: {e}")
        return ""

In [167]:
# Cell 4: Text processing and embeddings
# Checkpoint used for every embedding computed in this notebook; change it here only.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

# Loaded once at module level; the helper functions below read this global.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def get_embeddings(text: str) -> List[float]:
    """Encode ``text`` with the module-level ``model`` and return the result as a plain list."""
    vector = model.encode(text, convert_to_tensor=False)
    return vector.tolist()

def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
    """Split ``text`` into consecutive, non-overlapping character windows.

    Args:
        text: The string to split. Empty (or None) input yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        The chunks in document order. Every chunk has ``chunk_size`` characters
        except possibly the last, which holds the remainder.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently produces no chunks, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

In [181]:
# Cell 5: Smart Relevance Detection
def is_relevant(question: str, context: str, threshold: float = 0.5) -> bool:
    """Report whether any 1000-character chunk of ``context`` is similar to ``question``.

    Both the question and the chunks are embedded with the module-level
    ``model``; the context counts as relevant as soon as one chunk's cosine
    similarity to the question is strictly greater than ``threshold``.
    An empty context is never relevant.
    """
    if not context:
        return False

    # Embed the question and every chunk of the context
    pieces = chunk_text(context, chunk_size=1000)
    query_vector = model.encode(question, convert_to_tensor=False)
    piece_vectors = model.encode(pieces, convert_to_tensor=False)

    # One similarity score per chunk
    scores = cosine_similarity([query_vector], piece_vectors)[0]

    for score in scores:
        if score > threshold:
            return True
    return False

In [182]:
# Cell 6: Faster Answer Generation
def answer_from_web(question: str, context: str, top_k: int = 2) -> str:
    """Answer ``question`` from the most similar chunks of ``context``.

    The context is split into 2000-character chunks, each chunk is scored
    against the question by cosine similarity of their embeddings, and the
    ``top_k`` best chunks are sent to the chat model as grounding context.

    Args:
        question: The user's question.
        context: Page text to ground the answer on.
        top_k: Number of highest-scoring chunks to include (at least 1 is used).

    Returns:
        The model's reply prefixed with "[Web Context Answer] ". When there is
        no context to ground on, or the chat request fails, the result of
        ``answer_from_ddg(question)`` is returned instead.

    NOTE(review): relies on a module-level ``groq_client`` that is not defined
    in the cells visible here — confirm it is created before this runs.
    NOTE(review): confirm the hard-coded model id is still served by Groq.
    """
    chunks = chunk_text(context, chunk_size=2000) if context else []
    if not chunks:
        # With no chunks the similarity computation below would fail outside
        # the try block; fall back to search just like the error path does.
        return answer_from_ddg(question)

    # Score every chunk against the question using embeddings
    question_emb = model.encode(question, convert_to_tensor=False)
    chunk_embs = model.encode(chunks, convert_to_tensor=False)
    similarities = cosine_similarity([question_emb], chunk_embs)[0]

    # Select the top chunks by similarity (best first)
    ranked = sorted(range(len(chunks)), key=lambda i: similarities[i], reverse=True)
    relevant_chunks = [chunks[idx] for idx in ranked[:max(top_k, 1)]]

    # Chunks are raw character slices that may come from different parts of
    # the page; separate them so the end of one is not fused onto the start
    # of another.
    context_block = "\n\n".join(relevant_chunks)

    prompt = f"""
    Analyze the following information from the web to answer the question.
    Avoid hallucination and stick strictly to the provided context. If unsure, state that no precise information is available.
    
    Context:
    ---
    {context_block}
    ---
    
    Question: {question}
    
    Answer:
    """

    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="mixtral-8x7b-32768",
            temperature=0.2,
            max_tokens=400
        )
        return f"[Web Context Answer] {response.choices[0].message.content}"
    except Exception as e:
        # Best-effort: a failed completion degrades to a search-based answer.
        print(f"Error generating web answer: {e}")
        return answer_from_ddg(question)

def answer_from_ddg(question: str) -> str:
    """Answer ``question`` from DuckDuckGo text-search results.

    Up to five results are fetched, flattened into "title: body" lines and
    handed to the chat model. The reply is returned prefixed with
    "[Web Search Answer] ". Any exception along the way is printed and a fixed
    fallback message is returned instead.

    NOTE(review): uses a module-level ``groq_client`` that is not defined in
    the cells visible here — confirm it exists before this runs.
    """
    try:
        with DDGS() as ddgs:
            hits = list(ddgs.text(question, max_results=5))

        snippets = []
        for hit in hits:
            snippets.append(f"{hit['title']}: {hit['body']}")
        context = "\n".join(snippets)

        prompt = f"""Answer this question based on web search results:
        Question: {question}
        Search Results: {context}
        Answer in a clear paragraph:"""

        completion = groq_client.chat.completions.create(
            model="mixtral-8x7b-32768",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
        )
        reply = completion.choices[0].message.content
        return f"[Web Search Answer] {reply}"
    except Exception as err:
        print(f"Error in DDG answer: {err}")
        return "Could not generate an answer at this time"

In [183]:
# Cell 7: Final Workflow Setup
class AgentState(TypedDict):
    """State dictionary that the workflow nodes read from and write to."""
    # Address of the page to download; read by the fetch_content node.
    url: str
    # The user's question; read by the generate_answer node.
    question: str
    # Page text stored by fetch_content and read by generate_answer.
    content: str
    # Answer string stored by generate_answer.
    final_answer: str

def fetch_content(state: AgentState):
    """Workflow node: fetch the page at ``state["url"]`` and return it under the ``content`` key."""
    return {"content": fetch_web_content(state["url"])}

def generate_answer(state: AgentState):
    """Workflow node: produce ``final_answer`` from the page text or from web search.

    The fetched content is used when ``is_relevant`` accepts it for the
    question; otherwise the question is answered via ``answer_from_ddg``.
    """
    question, content = state["question"], state["content"]

    # The stored content is passed on as-is; no further trimming happens here.
    if not is_relevant(question, content):
        return {"final_answer": answer_from_ddg(question)}
    return {"final_answer": answer_from_web(question, content)}

# Build a two-step linear graph over AgentState:
#   fetch_content -> generate_answer -> END
workflow = StateGraph(AgentState)
# Register the node functions under the names used by the edges below.
workflow.add_node("fetch_content", fetch_content)
workflow.add_node("generate_answer", generate_answer)
# Execution starts by fetching the page.
workflow.set_entry_point("fetch_content")
# After fetching, answer; after answering, stop.
workflow.add_edge("fetch_content", "generate_answer")
workflow.add_edge("generate_answer", END)
# Compile the graph into the runnable object invoked by test_agent.
agent = workflow.compile()

In [184]:
# Cell 8: Enhanced Test Function
def test_agent(url: str, question: str):
    """Run the agent once and print a short report.

    Args:
        url: The page address, or a longer string containing one; the first
            ``http(s)://`` token found is used, otherwise the string is passed
            through unchanged.
        question: The question to ask about the page.

    Prints the URL, question, fetched content length, whether the content was
    judged relevant, which path produced the answer, and the answer itself.
    Returns nothing.
    """
    # Extract the actual URL from the input string
    url_match = re.search(r'https?://\S+', url)
    if url_match:
        url = url_match.group(0)

    result = agent.invoke({
        "url": url,
        "question": question,
        "content": "",
        "final_answer": ""
    })
    content = result['content']
    answer = result['final_answer']

    # Score relevance once: every call re-embeds the whole page.
    relevant = is_relevant(question, content)
    # Read the source from the tag on the answer itself. Relevance alone is
    # misleading because the web path falls back to search when it fails.
    from_web_page = answer.startswith("[Web Context Answer]")

    print(f"URL: {url}")
    print(f"Question: {question}")
    print(f"Content length: {len(content)} characters")
    print(f"Semantic relevance: {'Yes' if relevant else 'No'}")
    print("Answer Source:", "Web Page" if from_web_page else "DuckDuckGo")
    print("\nAnswer:")
    print(answer)
    print("\n" + "="*50 + "\n")

# Smoke-run the agent on a sample page/question pair (needs network access)
test_agent(
    "https://isha.sadhguru.org/yoga/new-to-yoga/what-is-yoga/",
    "why should we do yoga?",
)

URL: https://isha.sadhguru.org/yoga/new-to-yoga/what-is-yoga/
Question: why should we do yoga?
Content length: 15000 characters
Semantic relevance: Yes
Answer Source: Web Page

Answer:
[Web Context Answer] According to the provided context, yoga is not just an expression of who you are or a simple practice or exercise, but a technology and a complete path through which you can change the shape of who you are, both literally and otherwise. It is a way of being that allows you to experience everything as a part of yourself. Yoga is a method to enhance your perception and the only thing that is real is what you perceive, the rest is all made up in your head. Yoga can help you break the cycle of compulsions and patterns that you may be stuck in and allow you to move forward in a linear path instead of going round and round. Additionally, the context suggests that if you stop practicing yoga, compulsions and patterns that you thought were gone may come back, indicating that consistent pract