In [1]:
from langchain_community.tools.tavily_search import TavilySearchResults
import os

def get_search_tool():
    tavily_api_key = os.getenv("TAVILY_API_KEY")
    if not tavily_api_key:
        raise ValueError("TAVILY_API_KEY not set in environment variables")
    return TavilySearchResults(max_results=5)


In [6]:
# src/prompts/search_prompt.py
SEARCH_PROMPT = """
You are a Regulatory Reference Searcher. Use the ingestion output to find authoritative reference documents
(reports, guidelines, official circulars, technical guidance, monographs) that are relevant for the
given molecule ({molecule}), the experiment context, and the region ({region}).

Primary targets (in priority order): CDSCO (cdsco.gov.in), central/state government websites (.gov.in),
Pharmacopoeias, ICH/WHO pages relevant to India, and academic/regulatory bodies in India.

From the ingestion context, extract short keywords and phrases to form search queries (include molecule + experiment keywords).
Return a list of SERP queries to execute (one query per line). Use site:cdsco.gov.in and google.co.in localization where possible.
"""


In [22]:
import os
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langgraph.prebuilt import create_react_agent
# from src.prompts.search_prompt import SEARCH_PROMPT
# from src.tools.search_tool import get_search_tool

# Load GROQ API Key
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("GROQ_API_KEY not set in environment variables")

# Initialize LLM (Groq)
llm = ChatGroq(model="deepseek-r1-distill-llama-70b", api_key=groq_api_key)

# Load tools (Tavily search)
search_tool = get_search_tool()
tools = [search_tool]

# Create the agent (no prompt template, just attach tools + llm)
agent = create_react_agent(llm, tools)

def run_search_agent(molecule, experiment, region):
    history = []

    # Step 1: Inject your system prompt manually
    system_message = SystemMessage(content=SEARCH_PROMPT.format(molecule=molecule, region=region))
    history.append(system_message)

    # Step 2: Add human input
    human_message = HumanMessage(content=f"Molecule: {molecule}\nExperiment: {experiment}\nRegion: {region}")
    history.append(human_message)

    # Step 3: Call the agent with the conversation messages
    result = agent.invoke({"messages": history})
    # Instead of result["output"]:
    agent_message = result if isinstance(result, AIMessage) else result.get("message", None)

    if agent_message:
        history.append(agent_message)
    else:
        print("Agent didn't return an AIMessage as expected:", result)

In [24]:
molecule = "Paracetamol"
experiment = "prescription combination limit"
region = "India"

history = run_search_agent(molecule, experiment, region)
print(history)
# for msg in history:
#     print(f"{msg.type.upper()}: {msg.content}\n")

Agent didn't return an AIMessage as expected: {'messages': [SystemMessage(content='\nYou are a Regulatory Reference Searcher. Use the ingestion output to find authoritative reference documents\n(reports, guidelines, official circulars, technical guidance, monographs) that are relevant for the\ngiven molecule (Paracetamol), the experiment context, and the region (India).\n\nPrimary targets (in priority order): CDSCO (cdsco.gov.in), central/state government websites (.gov.in),\nPharmacopoeias, ICH/WHO pages relevant to India, and academic/regulatory bodies in India.\n\nFrom the ingestion context, extract short keywords and phrases to form search queries (include molecule + experiment keywords).\nReturn a list of SERP queries to execute (one query per line). Use site:cdsco.gov.in and google.co.in localization where possible.\n', additional_kwargs={}, response_metadata={}, id='75df4a25-cc1b-4825-91c8-f2405d0846e6'), HumanMessage(content='Molecule: Paracetamol\nExperiment: prescription comb

In [29]:
import os
from typing import List, Dict
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langgraph.prebuilt import create_react_agent
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda

def get_search_tool():
    tavily_api_key = os.getenv("TAVILY_API_KEY")
    if not tavily_api_key:
        raise ValueError("TAVILY_API_KEY not set in environment variables")
    return TavilySearchResults(max_results=5)

SEARCH_PROMPT = """
You are a Regulatory Reference Searcher. Use the ingestion output to find authoritative reference documents
(reports, guidelines, official circulars, technical guidance, monographs) that are relevant for the
given molecule ({molecule}), the experiment context, and the region ({region}).

Primary targets (in priority order): CDSCO (cdsco.gov.in), central/state government websites (.gov.in),
Pharmacopoeias, ICH/WHO pages relevant to India, and academic/regulatory bodies in India.

From the ingestion context, extract short keywords and phrases to form search queries (include molecule + experiment keywords).
Return a list of SERP queries to execute (one query per line). Use site:cdsco.gov.in and google.co.in localization where possible.

After receiving search results, analyze them and return ONLY the most relevant document links with a brief description of their relevance.
Format your final output as:

RELEVANT DOCUMENTS:
1. [Title](URL) - Description of relevance
2. [Title](URL) - Description of relevance
...
"""

# Initialize LLM (Groq)
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise ValueError("GROQ_API_KEY not set in environment variables")

llm = ChatGroq(model="deepseek-r1-distill-llama-70b", api_key=groq_api_key)
output_parser = StrOutputParser()

# Load tools (Tavily search)
search_tool = get_search_tool()
tools = [search_tool]

# Create the agent
agent = create_react_agent(llm, tools)

def parse_search_results(results: List[Dict]) -> str:
    """Parse raw search results into a formatted string with relevant documents."""
    formatted_results = []
    for idx, result in enumerate(results, 1):
        formatted_results.append(
            f"{idx}. [{result.get('title', 'No title')}]({result.get('url', 'No URL')}) - "
            f"{result.get('content', 'No description available')}"
        )
    return "\n".join(formatted_results)

def process_agent_output(output: AIMessage) -> dict:
    """Process agent output to extract search queries or final results."""
    content = output.content
    if "RELEVANT DOCUMENTS:" in content:
        # Final output with documents
        return {"status": "complete", "documents": content}
    else:
        # Intermediate step with search queries
        queries = [q.strip() for q in content.split("\n") if q.strip()]
        return {"status": "search_queries", "queries": queries}

def run_search_agent(molecule: str, experiment: str, region: str) -> dict:
    """Run the search agent and return processed results with document links."""
    history = []
    results = {"molecule": molecule, "experiment": experiment, "region": region, "documents": []}

    # Step 1: Inject system prompt
    system_message = SystemMessage(content=SEARCH_PROMPT.format(molecule=molecule, region=region))
    history.append(system_message)

    # Step 2: Add human input
    human_message = HumanMessage(content=f"Molecule: {molecule}\nExperiment: {experiment}\nRegion: {region}")
    history.append(human_message)

    # Step 3: Initial agent call to get search queries
    initial_result = agent.invoke({"messages": history})
    agent_message = initial_result if isinstance(initial_result, AIMessage) else initial_result.get("message", None)
    
    if not agent_message:
        return {"error": "Agent didn't return expected response"}
    
    history.append(agent_message)
    processed = process_agent_output(agent_message)
    
    if processed["status"] == "search_queries":
        # Execute searches and process results
        search_results = []
        for query in processed["queries"]:
            try:
                search_result = search_tool.invoke({"query": query})
                parsed_results = parse_search_results(search_result)
                search_results.extend(search_result)
                
                # Add search results to history for the agent to analyze
                history.append(HumanMessage(content=f"Search results for '{query}':\n{parsed_results}"))
            except Exception as e:
                print(f"Error searching for {query}: {e}")
                continue
        
        # Get final analysis from agent
        final_result = agent.invoke({"messages": history})
        final_message = final_result if isinstance(final_result, AIMessage) else final_result.get("message", None)
        
        if final_message:
            history.append(final_message)
            processed = process_agent_output(final_message)
            if processed["status"] == "complete":
                results["documents"] = processed["documents"]
            else:
                results["error"] = "Unexpected final output format"
        else:
            results["error"] = "No final analysis received"
    else:
        results["documents"] = processed["documents"]
    
    return results

# Example usage
molecule = "Paracetamol"
experiment = "prescription combination limit"
region = "India"

results = run_search_agent(molecule, experiment, region)
print("\nFinal Results:")
print(f"Molecule: {results.get('molecule')}")
print(f"Experiment: {results.get('experiment')}")
print(f"Region: {results.get('region')}")
print("\nRelevant Documents:")
print(results.get('documents', 'No documents found'))


Final Results:
Molecule: None
Experiment: None
Region: None

Relevant Documents:
No documents found


In [None]:
molecule = "Paracetamol"
region = "India"

result = run_search_agent(molecule, region=region)  # experiment can be empty

print("=== Generated Search Queries ===")
for q in result.get("queries", []):
    print(f"- {q}")

print("\n=== Reference Documents ===")
for doc in result.get("results", []):
    print(f"- Query: {doc['query']}")
    print(f"  Title: {doc.get('title', 'N/A')}")
    print(f"  URL: {doc.get('url')}")
    print(f"  Snippet: {doc.get('snippet', '')[:200]}...\n")

AttributeError: 'str' object has no attribute 'get'

In [50]:
import os
import requests
from typing import List, Dict, Optional
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# Configuration
DOWNLOAD_DIR = "downloaded_docs"
REQUEST_TIMEOUT = 30
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def get_search_tool():
    tavily_api_key = os.getenv("TAVILY_API_KEY")
    if not tavily_api_key:
        raise ValueError("TAVILY_API_KEY not set in environment variables")
    return TavilySearchResults(max_results=3)

def download_document(url: str) -> Optional[str]:
    """Download a document and save it locally."""
    try:
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)
        filename = url.split('/')[-1].split('?')[0] or f"document_{int(time.time())}.pdf"
        filepath = os.path.join(DOWNLOAD_DIR, filename)
        
        if os.path.exists(filepath):
            return filepath
            
        response = requests.get(url, headers=HEADERS, stream=True, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return filepath
    except Exception as e:
        print(f"Failed to download {url}: {str(e)}")
        return None

def generate_search_queries(molecule: str, experiment: str, region: str) -> List[str]:
    """Generate search queries using LLM."""
    llm = ChatGroq(
        model="gemma2-9b-it",
        temperature=0,
        api_key=os.getenv("GROQ_API_KEY")
    )
    
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a pharmaceutical regulatory expert. Generate specific search queries to find official documents about a drug.
        Focus on: CDSCO (site:cdsco.gov.in), Indian government (site:gov.in), Pharmacopoeias, WHO/ICH guidelines.
        Return ONLY the search queries, one per line."""),
        ("human", "Find documents about {molecule} regarding {experiment} in {region}")
    ])
    
    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({
        "molecule": molecule,
        "experiment": experiment,
        "region": region
    })
    
    return [q.strip() for q in response.split('\n') if q.strip()]

def run_regulatory_search(molecule: str, experiment: str, region: str) -> Dict:
    """Main function to run the regulatory document search."""
    response = {
        'status': 'error',
        'molecule': molecule,
        'experiment': experiment,
        'region': region,
        'found_urls': [],
        'downloaded_files': [],
        'message': 'Initialization failed'
    }
    
    try:
        # Step 1: Generate search queries
        queries = generate_search_queries(molecule, experiment, region)
        if not queries:
            response['message'] = "No search queries generated"
            return response
        
        # Step 2: Execute searches
        search_tool = get_search_tool()
        all_results = []
        for query in queries[:3]:  # Limit to 3 queries
            try:
                results = search_tool.invoke({"query": query})
                all_results.extend(results)
            except Exception as e:
                print(f"Search failed for '{query}': {e}")
                continue
        
        if not all_results:
            response['message'] = "No search results found"
            return response
        
        # Step 3: Process results
        urls = [res['url'] for res in all_results if res.get('url')]
        response['found_urls'] = urls
        
        # Step 4: Download documents
        downloaded_files = []
        for url in urls[:5]:  # Limit to 5 downloads
            filepath = download_document(url)
            if filepath:
                downloaded_files.append({
                    'url': url,
                    'local_path': filepath
                })
        
        response['downloaded_files'] = downloaded_files
        response['status'] = 'success' if downloaded_files else 'partial'
        response['message'] = f"Found {len(urls)} URLs, downloaded {len(downloaded_files)} documents"
        
    except Exception as e:
        response['message'] = str(e)
    
    return response

if __name__ == "__main__":
    # Example usage
    results = run_regulatory_search(
        molecule="Paracetamol",
        experiment="prescription combination limits",
        region="India"
    )
    
    # Print results
    print("\n=== RESULTS ===")
    print(f"Status: {results['status']}")
    print(f"Molecule: {results['molecule']}")
    print(f"Experiment: {results['experiment']}")
    print(f"Region: {results['region']}")
    
    if results['downloaded_files']:
        print("\nDownloaded documents:")
        for file in results['downloaded_files']:
            print(f"- {file['local_path']} (from {file['url']})")
    
    if results['found_urls']:
        print("\nAll found URLs:")
        for i, url in enumerate(results['found_urls'], 1):
            print(f"{i}. {url}")
    
    print(f"\nMessage: {results['message']}")

Failed to download https://www.drugs.com/paracetamol.html: 403 Client Error: Forbidden for url: https://www.drugs.com/paracetamol.html

=== RESULTS ===
Status: success
Molecule: Paracetamol
Experiment: prescription combination limits
Region: India

Downloaded documents:
- downloaded_docs\ecommerce-search-query-types (from https://baymard.com/blog/ecommerce-search-query-types)
- downloaded_docs\document_1755507179.pdf (from https://www.coveo.com/blog/search-query-optimization/)
- downloaded_docs\how-to-construct-complex-google-web-search-query (from https://stackoverflow.com/questions/15852238/how-to-construct-complex-google-web-search-query)
- downloaded_docs\paracetamol (from https://www.healthdirect.gov.au/paracetamol)

All found URLs:
1. https://baymard.com/blog/ecommerce-search-query-types
2. https://www.coveo.com/blog/search-query-optimization/
3. https://stackoverflow.com/questions/15852238/how-to-construct-complex-google-web-search-query
4. https://www.drugs.com/paracetamol.html

In [49]:
"""
Advanced Regulatory + Reference Ingestion Agent with SSL Handling

Includes:
- ReAct agent using Tavily search (Web + PDF)
- SSL certificate verification using certifi + fallback
- WebBaseLoader + PyPDFLoader for URL ingestion
- FAISS vector store + embeddings (OpenAI or HF)
"""

import os
import ssl
import json
import time
import hashlib
import warnings
import mimetypes
import requests
from typing import List, Dict, Any, Optional

import certifi
from requests.packages.urllib3.exceptions import InsecureRequestWarning

from langchain_groq import ChatGroq
from langchain_core.messages import SystemMessage, HumanMessage
from langgraph.prebuilt import create_react_agent
from langchain_core.tools import tool

# Tavily imports
try:
    from langchain_tavily import TavilySearch
    _TAVILY_MODE = "new"
except ImportError:
    from langchain_community.tools.tavily_search import TavilySearchResults
    _TAVILY_MODE = "community"

from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

try:
    from langchain_openai import OpenAIEmbeddings
    _HAVE_OPENAI = True
except ImportError:
    from langchain_community.embeddings import HuggingFaceEmbeddings
    _HAVE_OPENAI = False

# -------------------------
# Configuration
# -------------------------
DOWNLOAD_DIR = "downloaded_docs"
VECTOR_DIR = "vectorstore_faiss"
REQUEST_TIMEOUT = 40
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/123.0 Safari/537.36"
)

# -------------------------
# SSL-Safe Download
# -------------------------
def _download_ssl(url: str) -> Optional[str]:
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    fname = url.split("/")[-1].split("?")[0]
    if not fname:
        fname = hashlib.sha256(url.encode()).hexdigest()[:16]
    filepath = os.path.join(DOWNLOAD_DIR, fname)

    if os.path.exists(filepath):
        return filepath

    headers = {"User-Agent": USER_AGENT}
    try:
        resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT, verify=certifi.where())
        resp.raise_for_status()
    except Exception as e:
        print(f"[SSL] Cert verification failed for {url}: {e}, falling back to insecure...")
        warnings.simplefilter("ignore", InsecureRequestWarning)
        try:
            resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT, verify=False)
            resp.raise_for_status()
        except Exception as e2:
            print(f"[SSL] Insecure fallback failed for {url}: {e2}")
            return None

    with open(filepath, "wb") as f:
        for chunk in resp.iter_content(8192):
            f.write(chunk)

    return filepath

# -------------------------
# Venue Search & Ingestion Functions
# -------------------------
def _tavily_search(query: str, max_results: int = 6) -> List[Dict[str, Any]]:
    if _TAVILY_MODE == "new":
        tool = TavilySearch(max_results=max_results, include_answer=False, include_raw_content=False)
        out = tool.invoke({"query": query})
        return out.get("results", []) if isinstance(out, dict) else (out if isinstance(out, list) else [])
    else:
        tool = TavilySearchResults(max_results=max_results, include_answer=False, include_raw_content=False)
        out = tool.invoke({"query": query})
        if isinstance(out, list):
            return [r if isinstance(r, dict) else {"title":"", "url":"", "content": str(r)} for r in out]
        if isinstance(out, dict):
            return [out]
        return []

def _load_url_docs(url: str) -> List[Dict[str, Any]]:
    docs = []
    try:
        is_pdf = url.lower().endswith(".pdf")
        local = _download_ssl(url)
        if local and is_pdf:
            loader = PyPDFLoader(local)
        else:
            loader = WebBaseLoader([url], header_template={"User-Agent": USER_AGENT}, continue_on_failure=True)
        pages = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
        chunks = splitter.split_documents(pages)
        for c in chunks:
            docs.append({"page_content": c.page_content, "metadata": c.metadata})
    except Exception as e:
        print(f"[load] Failed {url}: {e}")
    return docs

def _get_embeddings():
    if _HAVE_OPENAI and os.getenv("OPENAI_API_KEY"):
        return OpenAIEmbeddings(model="text-embedding-3-large")
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def _load_faiss():
    embeddings = _get_embeddings()
    if os.path.isdir(VECTOR_DIR):
        try:
            return FAISS.load_local(VECTOR_DIR, embeddings, allow_dangerous_deserialization=True)
        except Exception:
            pass
    return None

def _save_faiss(vs: FAISS):
    vs.save_local(VECTOR_DIR)

# -------------------------
# Tools for ReAct Agent
# -------------------------
@tool("web_search", return_direct=True)
def web_search_tool(query: str) -> str:
    results = _tavily_search(query, max_results=6)
    out = [{"title": r.get("title",""), "url": r.get("url","")} for r in results if r.get("url")]
    return json.dumps(out, ensure_ascii=False)

@tool("ingest_url", return_direct=True)
def ingest_url_tool(url: str) -> str:
    docs = _load_url_docs(url)
    if not docs:
        return json.dumps({"url": url, "chunks": 0})
    embeddings = _get_embeddings()
    vs = _load_faiss()
    texts = [d["page_content"] for d in docs]
    metas = [d["metadata"] for d in docs]
    if vs is None:
        vs = FAISS.from_texts(texts, embedding=embeddings, metadatas=metas)
    else:
        vs.add_texts(texts, metadatas=metas)
    _save_faiss(vs)
    return json.dumps({"url": url, "chunks": len(docs)})

# -------------------------
# Agent Setup
# -------------------------
AGENT_PROMPT = """
You are a Regulatory & Research Reference Agent for {region}. Your tasks:
- Generate up to {max_q} precise queries to find both regulatory compliance documents and scientific references for the molecule.
- Distinguish each found URL as "compliance" or "reference" with your reasoning.
- For each reliable URL, call `ingest_url`.
Finally, output JSON:
{{
  "queries": [...],
  "links": [
    {{"url":"...","title":"...","type":"compliance"|"reference","reason":"..."}}
  ],
  "note":"summary"
}}
"""

def build_agent(region: str, max_q=6, max_links=10):
    llm = ChatGroq(model=os.getenv("GROQ_MODEL","gemma2-9b-it"), temperature=0, api_key=os.getenv("GROQ_API_KEY"))
    prompt = AGENT_PROMPT.format(region=region, max_q=max_q, max_links=max_links)
    return create_react_agent(llm, [web_search_tool, ingest_url_tool], prompt=prompt)

def run_full_pipeline(molecule: str, region: str, context: Dict[str, Any]) -> Dict[str, Any]:
    agent = build_agent(region)
    sys = SystemMessage(content="Follow your system instructions precisely.")
    human = HumanMessage(content=json.dumps({
        "molecule": molecule,
        "region": region,
        "context": context
    }, ensure_ascii=False))
    try:
        resp_msg = agent.invoke({"messages":[sys, human]})
        raw = getattr(resp_msg, "content", "") or resp_msg.get("output", "")
    except Exception as e:
        return {"status":"error","message":f"Agent failed: {e}"}
    try:
        return json.loads(raw)
    except:
        return {"status":"partial","message":"Could not parse JSON","raw": raw}

# -------------------------
# Example run
# -------------------------
if __name__ == "__main__":
    context ={
        "process_description": "The synthesis of Paracetamol involves a three-step process: 1) Acetylation, 2) Nitro reduction, and 3) Recrystallization.",
        "specification": "The specifications for Paracetamol (IP 2022) include: - Assay: 99.0 \u2013 101.0 % (HPLC with external standard). - Impurity A: \u2264 0.10 % (HPLC). - Loss on Drying: \u2264 0.5 % (USP <731>). - Residue on Ignition: \u2264 0.1 % (USP <281>).",
        "stability_report": "The stability study follows ICH Q1A (R2) guidelines. Accelerated conditions (40 \u00b0C/75 % RH) for 6 months show: - Assay decrease: \u22120.8 % (within specification). - Impurity A increase: +0.03 % (0.08 % at 6 months). Physical appearance remains unchanged. Long-term conditions (30 \u00b0C/65 % RH) for 12 months show all parameters remain within specification. Paracetamol API is stable in LDPE/aluminium laminate packs with a proposed re-test period of 36 months when stored at \u2264 30 \u00b0C."
    }
    out = run_full_pipeline("Paracetamol", "India", context)
    print(json.dumps(out, indent=2, ensure_ascii=False))
    print("Vector store path:", VECTOR_DIR)


ValueError: Function must have a docstring if description not provided.

In [None]:
import os
import requests
import time
import re
from typing import Dict, List, Optional, Tuple
from pathlib import Path
from urllib.parse import urlparse
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.messages import SystemMessage, HumanMessage

# Configuration
DOWNLOAD_DIR = "downloaded_docs"
VECTORSTORE_DIR = "vectorstore"
REQUEST_TIMEOUT = 30
MAX_DOWNLOADS = 10
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Domain whitelist
TRUSTED_DOMAINS = {
    'india': ['cdsco.gov.in', 'ipc.gov.in', 'pharmaceuticals.gov.in', 'who.int', 
              'ich.org', 'ncbi.nlm.nih.gov', 'researchgate.net', 'sciencedirect.com',
              'gov.in', 'indianjournals.com', 'pharmatutor.org'],
    'default': ['fda.gov', 'ema.europa.eu', 'who.int', 'ich.org', 'ncbi.nlm.nih.gov']
}

def initialize_environment():
    """Create necessary directories."""
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    os.makedirs(VECTORSTORE_DIR, exist_ok=True)

def get_search_tool():
    """Initialize Tavily search tool with proper configuration."""
    tavily_api_key = os.getenv("TAVILY_API_KEY")
    if not tavily_api_key:
        raise ValueError("TAVILY_API_KEY not set in environment variables")
    return TavilySearchResults(max_results=5, include_raw_content=True)

def get_embeddings() -> Embeddings:
    """Initialize embeddings model."""
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

def is_relevant_url(url: str, region: str) -> bool:
    """Check if URL is from a trusted domain."""
    try:
        domain = urlparse(url).netloc.lower()
        domain = re.sub(r'^www\.', '', domain)
        trusted_domains = TRUSTED_DOMAINS.get(region.lower(), TRUSTED_DOMAINS['default'])
        return any(domain.endswith(td) for td in trusted_domains)
    except:
        return False

def sanitize_filename(filename: str) -> str:
    """Sanitize filename to be filesystem-safe."""
    return re.sub(r'[^\w\-_. ]', '_', filename)

def download_content(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download content from URL and save locally."""
    try:
        parsed = urlparse(url)
        if not all([parsed.scheme, parsed.netloc]):
            raise ValueError(f"Invalid URL: {url}")

        filename = sanitize_filename(os.path.basename(parsed.path)) or f"document_{int(time.time())}"
        filepath = os.path.join(DOWNLOAD_DIR, filename)
        
        # Check existing files to avoid duplicates
        if os.path.exists(filepath):
            return filepath, None
            
        response = requests.get(url, headers=HEADERS, stream=True, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        
        # Determine content type and extension
        content_type = response.headers.get('Content-Type', '')
        if 'application/pdf' in content_type:
            if not filepath.lower().endswith('.pdf'):
                filepath += '.pdf'
        elif not filepath.lower().endswith(('.html', '.htm')):
            filepath += '.html'
        
        # Save content
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        
        return filepath, content_type
    except Exception as e:
        print(f"Failed to download {url}: {str(e)}")
        return None, None

def load_document(filepath: str, content_type: str = None) -> Optional[List]:
    """Load document based on its type."""
    try:
        if filepath.lower().endswith('.pdf') or (content_type and 'pdf' in content_type.lower()):
            return PyPDFLoader(filepath).load()
        else:
            return WebBaseLoader(filepath).load()
    except Exception as e:
        print(f"Failed to load {filepath}: {str(e)}")
        return None

def generate_search_queries(molecule: str, context: Dict, region: str) -> List[str]:
    """Generate focused search queries using LLM."""
    llm = ChatGroq(
        model="deepseek-r1-distill-llama-70b",
        temperature=0.2,
        api_key=os.getenv("GROQ_API_KEY")
    )
    
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a pharmaceutical regulatory expert. Generate specific search queries to find:
        1. Regulatory compliance documents (official guidelines, pharmacopoeia standards)
        2. Scientific reference documents (research papers, clinical studies)
        
        For molecule: {molecule} in region: {region}
        Context: {context}
        
        Requirements:
        - Focus on official sources (government, regulatory bodies)
        - Prioritize PDF documents
        - Include site-specific searches for {region} domains
        - Exclude news articles and commercial websites
        
        Return ONLY the search queries, one per line."""),
        ("human", "Generate search queries for {molecule} in {region}")
    ])
    
    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({
        "molecule": molecule,
        "context": "\n".join(f"{k}: {v}" for k,v in context.items()),
        "region": region
    })
    
    return [q.strip() for q in response.split('\n') if q.strip()]

def run_regulatory_search(molecule: str, context: Dict, region: str) -> Dict:
    """Main search and document processing workflow."""
    initialize_environment()
    response = {
        'status': 'error',
        'molecule': molecule,
        'region': region,
        'context': context,
        'found_urls': [],
        'downloaded_files': [],
        'vectorstore': None,
        'message': 'Initialization failed'
    }
    
    try:
        # Step 1: Generate search queries
        queries = generate_search_queries(molecule, context, region)
        if not queries:
            response['message'] = "No search queries generated"
            return response
        
        # Step 2: Execute searches
        search_tool = get_search_tool()
        all_results = []
        for query in queries[:5]:  # Limit to 5 queries
            try:
                results = search_tool.invoke({"query": query})
                if isinstance(results, list):
                    all_results.extend(results)
                elif isinstance(results, dict) and 'results' in results:
                    all_results.extend(results['results'])
            except Exception as e:
                print(f"Search failed for '{query}': {e}")
                continue
        
        if not all_results:
            response['message'] = "No search results found"
            return response
        
        # Step 3: Filter and process URLs
        urls = []
        for result in all_results:
            if isinstance(result, dict):
                url = result.get('url')
                if url and is_relevant_url(url, region):
                    urls.append(url)
        
        response['found_urls'] = urls
        
        # Step 4: Download and process content
        downloaded_files = []
        all_docs = []
        embeddings = get_embeddings()
        
        for url in urls[:MAX_DOWNLOADS]:
            try:
                filepath, content_type = download_content(url)
                if filepath:
                    docs = load_document(filepath, content_type)
                    if docs:
                        downloaded_files.append({
                            'url': url,
                            'local_path': filepath
                        })
                        all_docs.extend(docs)
            except Exception as e:
                print(f"Failed to process {url}: {e}")
                continue
        
        if not all_docs:
            response['message'] = "No documents processed"
            return response
        
        # Step 5: Create vectorstore
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(all_docs)
        
        vectorstore_name = f"{molecule.lower()}_{region.lower()}_docs".replace(' ', '_')
        vectorstore = FAISS.from_documents(splits, embeddings)
        vectorstore.save_local(os.path.join(VECTORSTORE_DIR, vectorstore_name))
        
        response.update({
            'downloaded_files': downloaded_files,
            'vectorstore': os.path.join(VECTORSTORE_DIR, vectorstore_name),
            'status': 'success',
            'message': f"Processed {len(downloaded_files)} documents. Vectorstore created."
        })
        
    except Exception as e:
        response['message'] = str(e)
    
    return response

if __name__ == "__main__":
    # Example usage
    context = {
        "process_description": "The synthesis of Paracetamol involves a three-step process: 1) Acetylation, 2) Nitro reduction, and 3) Recrystallization.",
        "specification": "The specifications for Paracetamol (IP 2022) include: - Assay: 99.0 \u2013 101.0 % (HPLC with external standard). - Impurity A: \u2264 0.10 % (HPLC). - Loss on Drying: \u2264 0.5 % (USP <731>). - Residue on Ignition: \u2264 0.1 % (USP <281>).",
        "stability_report": "The stability study follows ICH Q1A (R2) guidelines. Accelerated conditions (40 \u00b0C/75 % RH) for 6 months show: - Assay decrease: \u22120.8 % (within specification). - Impurity A increase: +0.03 % (0.08 % at 6 months). Physical appearance remains unchanged. Long-term conditions (30 \u00b0C/65 % RH) for 12 months show all parameters remain within specification. Paracetamol API is stable in LDPE/aluminium laminate packs with a proposed re-test period of 36 months when stored at \u2264 30 \u00b0C."
    }
    
    results = run_regulatory_search(
        molecule="Paracetamol",
        context=context,
        region="India"
    )
    
    # Print results
    print("\n=== RESULTS ===")
    print(f"Status: {results['status']}")
    print(f"Molecule: {results['molecule']}")
    print(f"Region: {results['region']}")
    
    if results['downloaded_files']:
        print("\nDownloaded documents:")
        for file in results['downloaded_files']:
            print(f"- {file['local_path']} (from {file['url']})")
    
    if results['found_urls']:
        print("\nRelevant URLs found:")
        for i, url in enumerate(results['found_urls'], 1):
            print(f"{i}. {url}")
    
    if results['vectorstore']:
        print(f"\nVectorstore created at: {results['vectorstore']}")
    
    print(f"\nMessage: {results['message']}")

Failed to load downloaded_docs\document_1755519520.html: Invalid URL 'downloaded_docs\\document_1755519520.html': No scheme supplied. Perhaps you meant https://downloaded_docs\document_1755519520.html?
Failed to download https://www.researchgate.net/publication/376293608_Analysis_of_Various_Doses_of_Paracetamol_in_the_Indian_Market_A_Need_to_Revisit: 403 Client Error: Forbidden for url: https://www.researchgate.net/publication/376293608_Analysis_of_Various_Doses_of_Paracetamol_in_the_Indian_Market_A_Need_to_Revisit
Failed to load downloaded_docs\document_1755519520.html: Invalid URL 'downloaded_docs\\document_1755519520.html': No scheme supplied. Perhaps you meant https://downloaded_docs\document_1755519520.html?
Failed to load downloaded_docs\cdsco.php.html: Invalid URL 'downloaded_docs\\cdsco.php.html': No scheme supplied. Perhaps you meant https://downloaded_docs\cdsco.php.html?
Failed to load downloaded_docs\document_1755519521.html: Invalid URL 'downloaded_docs\\document_175551952

: 