In [None]:
# Cell 2: Import required libraries
from typing import TypedDict, Annotated, List
from langgraph.graph import StateGraph, END
from trafilatura import fetch_url, extract
from langchain_openai import ChatOpenAI
from duckduckgo_search import DDGS
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import httpx
import json
import os
import re
from groq import Groq
from dotenv import load_dotenv

# Load variables from a local .env file (if one is present) so GROQ_API_KEY
# can be supplied there as well as through the shell environment.
load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")
# Single Groq client shared by the answer-generation functions in this notebook.
groq_client = Groq(api_key=groq_api_key)


In [26]:
# Cell 3: Enhanced Content Extraction
from bs4 import BeautifulSoup

def fetch_web_content(url: str) -> str:
    """Download a web page and return its main readable text.

    The page is requested with a browser-like User-Agent, parsed with
    BeautifulSoup, and reduced to the text of the first matching content
    container (``div.entry-content``, ``<article>``, ``<main>``, then
    ``<body>``). Lines of 40 characters or fewer are dropped and the result
    is capped at 15000 characters.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text, or an empty string when the request fails, the
        server answers with an error status, or no content container exists.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        # httpx does not follow redirects unless asked to, so request it
        # explicitly; otherwise a 301/302 yields the redirect stub, not the page.
        response = httpx.get(url, headers=headers, timeout=10, follow_redirects=True)
        # Treat 4xx/5xx responses as failures instead of parsing the error page
        # as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Target specific content containers (adjust class names as needed),
        # falling back to the whole body.
        content = (
            soup.find('div', class_='entry-content')
            or soup.find('article')
            or soup.find('main')
            or soup.body
        )
        if content is None:
            # Document has no usable container at all (e.g. a non-HTML response).
            return ""

        # Extract text, keep only lines long enough to be prose, cap the size.
        text = content.get_text('\n', strip=True)
        long_lines = [line for line in text.split('\n') if len(line) > 40]
        return '\n'.join(long_lines)[:15000]
    except Exception as e:
        # Best-effort fetch: report the problem and let the caller fall back.
        print(f"Error fetching content: {e}")
        return ""

In [27]:
# Cell 4: Text processing and embeddings
# Shared sentence-embedding model, instantiated once at module level; the
# helpers in this notebook call model.encode(...) on it.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def get_embeddings(text: str) -> List[float]:
    """Encode ``text`` with the shared ``model`` and return the vector as a plain list."""
    vector = model.encode(text, convert_to_tensor=False)
    return vector.tolist()

def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The pieces do not overlap; the last one may be shorter than ``chunk_size``.
    An empty ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

In [36]:
# Cell 5: Smart Relevance Detection
# Common function words that carry no topical meaning; ignoring them keeps
# the keyword gate from being satisfied by almost any English text.
_QUESTION_STOPWORDS = frozenset({
    "the", "and", "for", "are", "was", "were", "what", "which", "who", "whom",
    "when", "where", "why", "how", "does", "did", "can", "could", "would",
    "should", "this", "that", "these", "those", "with", "from", "into",
    "about", "has", "have", "had", "its", "not", "but", "you", "your",
})


def _question_keywords(question: str) -> set:
    """Return the lowercase content words of ``question`` (punctuation stripped, stop words removed)."""
    tokens = re.findall(r"\w+", question.lower())
    return {tok for tok in tokens if len(tok) > 2 and tok not in _QUESTION_STOPWORDS}


def is_relevant(question: str, context: str, threshold: float = 0.6) -> float:
    """Score how well ``context`` can answer ``question``.

    The context is split into 1000-character chunks, each chunk is embedded
    with the shared ``model``, and the cosine similarity to the question
    embedding is computed. The best similarity is returned only if it exceeds
    ``threshold`` AND one of the two most similar chunks mentions at least one
    content word of the question.

    Args:
        question: The user's question.
        context: Text extracted from the web page; may be empty.
        threshold: Minimum similarity for the context to count as relevant.

    Returns:
        The maximum chunk similarity as a plain ``float`` when the context is
        judged relevant, otherwise ``0.0`` (so the result is also usable as a
        truth value).
    """
    if not context:
        return 0.0

    chunks = chunk_text(context, chunk_size=1000)
    if not chunks:
        return 0.0

    question_emb = model.encode(question, convert_to_tensor=False)
    context_embs = model.encode(chunks, convert_to_tensor=False)
    similarities = cosine_similarity([question_emb], context_embs)[0]

    # Indices of the two chunks most similar to the question.
    top_indices = sorted(range(len(similarities)), key=lambda i: -similarities[i])[:2]

    # Keyword gate: tokens are stripped of punctuation ("yoga?" -> "yoga") and
    # stop words are dropped, so the check tests topical words instead of
    # matching "is"/"the" inside nearly every chunk. If the question has no
    # content words at all, the gate is skipped rather than failing everything.
    keywords = _question_keywords(question)
    if keywords:
        top_texts = [chunks[i].lower() for i in top_indices]
        if not any(kw in text for text in top_texts for kw in keywords):
            return 0.0

    # Convert the numpy scalar so the function really returns a Python float.
    max_score = float(max(similarities))
    return max_score if max_score > threshold else 0.0

In [46]:
# Cell 6: Faster Answer Generation
def answer_from_web(question: str, context: str) -> str:
    """Answer ``question`` from page text using the most relevant passages.

    The context is split into 2000-character chunks, the four chunks whose
    embeddings are most similar to the question are placed in a prompt, and
    the prompt is sent to the Groq chat completion API.

    Args:
        question: The user's question.
        context: Text extracted from the web page.

    Returns:
        The model's reply prefixed with ``[Web Context Answer]``. When the
        context is empty or the completion call fails, the result of
        ``answer_from_ddg(question)`` is returned instead.
    """
    chunks = chunk_text(context, chunk_size=2000)
    if not chunks:
        # Nothing to embed: the similarity computation below would raise on
        # an empty chunk list, so go straight to the search fallback.
        return answer_from_ddg(question)

    # Rank chunks by embedding similarity to the question.
    question_emb = model.encode(question, convert_to_tensor=False)
    chunk_embs = model.encode(chunks, convert_to_tensor=False)
    similarities = cosine_similarity([question_emb], chunk_embs)[0]

    # Select top 4 chunks by similarity (previously 2)
    top_indices = sorted(range(len(similarities)), key=lambda i: -similarities[i])[:4]
    relevant_chunks = [chunks[idx] for idx in top_indices]

    # Separate the passages so non-adjacent chunks are not fused mid-word.
    context_block = "\n\n".join(relevant_chunks)

    prompt = f"""
    Analyze the following information from the web to answer the question.
    Avoid hallucination and stick strictly to the provided context. If unsure, state that no precise information is available.
    
    Context:
    ---
    {context_block}
    ---
    
    Question: {question}
    
    Answer in detail:
    """

    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            # NOTE(review): confirm this model id is still served by Groq.
            model="mixtral-8x7b-32768",
            temperature=0.2,
            max_tokens=500  # Increased from 400
        )
        return f"[Web Context Answer] {response.choices[0].message.content}"
    except Exception as e:
        # Best-effort: report the failure and fall back to a search-based answer.
        print(f"Error generating web answer: {e}")
        return answer_from_ddg(question)

def answer_from_ddg(question: str) -> str:
    """Answer ``question`` from DuckDuckGo search snippets.

    Up to five text results are fetched, their titles and bodies are placed
    in a prompt, and the prompt is sent to the Groq chat completion API.

    Args:
        question: The user's question, also used as the search query.

    Returns:
        The model's reply prefixed with ``[Web Search Answer]``, or the fixed
        message ``"Could not generate an answer at this time"`` when the
        search returns nothing or any step raises.
    """
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(question, max_results=5) or [])  # More results

        if not results:
            # Without snippets the model would answer from its own knowledge
            # while being labelled as a web-search answer; report failure instead.
            print("Error in DDG answer: no search results returned")
            return "Could not generate an answer at this time"

        # One "title: body" line per result; tolerate results missing a field.
        context = "\n".join(
            f"{r.get('title', '')}: {r.get('body', '')}" for r in results
        )

        prompt = f"""Answer this question based on web search results:
        Question: {question}
        Search Results: {context}
        Answer in a clear paragraph:"""

        response = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            # NOTE(review): confirm this model id is still served by Groq.
            model="mixtral-8x7b-32768",
            temperature=0.5,
        )
        return f"[Web Search Answer] {response.choices[0].message.content}"
    except Exception as e:
        # Best-effort: this is the last fallback, so never propagate.
        print(f"Error in DDG answer: {e}")
        return "Could not generate an answer at this time"

In [47]:
# Cell 7: Final Workflow Setup
class AgentState(TypedDict):
    """State dictionary shared by the workflow nodes (schema given to ``StateGraph``)."""
    # Address of the page; read by fetch_content.
    url: str
    # The user's question; read by generate_answer.
    question: str
    # Page text produced by fetch_content and read by generate_answer.
    content: str
    # Answer text produced by generate_answer.
    final_answer: str

def fetch_content(state: AgentState):
    """Workflow node: fetch the page at ``state["url"]`` and return it as the ``content`` update."""
    return {"content": fetch_web_content(state["url"])}

def generate_answer(state: AgentState):
    """Workflow node: produce the ``final_answer`` update for the current state.

    The fetched page text is used when ``is_relevant`` judges it relevant to
    the question; otherwise the answer comes from a DuckDuckGo search.
    """
    question, content = state["question"], state["content"]

    # Guard clause: page text is not relevant, so answer from search instead.
    if not is_relevant(question, content):
        return {"final_answer": answer_from_ddg(question)}

    return {"final_answer": answer_from_web(question, content)}

# Build a two-step linear graph over AgentState:
#   fetch_content -> generate_answer -> END
workflow = StateGraph(AgentState)
# Register the node functions under the names used by the edges below.
workflow.add_node("fetch_content", fetch_content)
workflow.add_node("generate_answer", generate_answer)
# Execution starts by fetching the page.
workflow.set_entry_point("fetch_content")
workflow.add_edge("fetch_content", "generate_answer")
workflow.add_edge("generate_answer", END)
# Compiled graph; test_agent runs it through agent.invoke(...).
agent = workflow.compile()

In [48]:
# Cell 8: Enhanced Test Function
def test_agent(url: str, question: str):
    """Run the agent on one URL/question pair and print a diagnostic report.

    Prints the inputs, the length of the fetched content, the relevance score
    reported by ``is_relevant`` (which is 0.0 whenever the content was judged
    not relevant), the source of the answer, and the answer itself.

    Args:
        url: Page to fetch.
        question: Question to answer.
    """
    result = agent.invoke({
        "url": url,
        "question": question,
        "content": "",
        "final_answer": ""
    })

    print(f"URL: {url}")
    print(f"Question: {question}")
    print(f"Content length: {len(result['content'])} characters")

    max_score = is_relevant(question, result['content'])
    print(f"Max similarity score: {max_score:.4f}")

    # Read the source from the answer's own prefix rather than re-deriving it
    # from a hard-coded threshold: this stays correct when answer_from_web
    # fell back to DuckDuckGo and when is_relevant's threshold changes.
    from_page = result['final_answer'].startswith("[Web Context Answer]")
    answer_source = "Web Page" if from_page else "DuckDuckGo"
    print(f"Answer Source: {answer_source}")

    print("\nAnswer:")
    print(result['final_answer'])
    print("\n" + "="*50 + "\n")

# Run the agent once on a sample page/question pair; test_agent prints the
# diagnostics and the generated answer.
test_agent(
    url="https://isha.sadhguru.org/yoga/new-to-yoga/what-is-yoga/",
    question="What is the meaning of yoga?"
)

URL: https://isha.sadhguru.org/yoga/new-to-yoga/what-is-yoga/
Question: What is the meaning of yoga?
Content length: 15000 characters
Max similarity score: 0.7264
Answer Source: Web Page

Answer:
[Web Context Answer] Yoga is a complete path by itself that signifies a technology for transformation and liberation from memory. It is not limited to physical postures or exercises, but is a way of being where one experiences everything as a part of themselves. The ultimate goal of Yoga is to move towards an experiential reality where one knows the ultimate nature of the existence.

In the Yogic tradition, the word "Yoga" attached to anything indicates that it is a complete path by itself. This is because Yoga is not just a simple practice or an art form, but a technology that can change the shape of who you are, both literally and otherwise. It is a mechanism to get you to a state of experience where you see reality just the way it is.

Yoga is also about liberating oneself from memory. It i