<a href="https://colab.research.google.com/github/ABSatpute/ABSatpute/blob/main/Deep_Research_AI_Agent_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Notebook shell command: install the third-party packages this notebook lists
# (LangGraph, LangSmith, the Groq and community LangChain integrations, Tavily client).
!pip install langgraph langsmith langchain_groq langchain_community tavily-python



In [14]:
import json
import re
import os
from langchain.tools import TavilySearchResults
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain_groq import ChatGroq
from langchain.schema import Document, AIMessage
from langgraph.graph import StateGraph
from pydantic import BaseModel, Field
from google.colab import userdata


In [15]:

# 🔹 Set API Keys
# Both secrets are fetched through google.colab's `userdata`, Groq first and
# Tavily second, and kept as module-level names for the cells below.
groq_api_key, tavily_api_key = (
    userdata.get(secret_name)
    for secret_name in ("groq_api_key", "tavily_api_key")
)

# Also publish the Tavily key through the process environment.
os.environ["TAVILY_API_KEY"] = tavily_api_key



In [16]:
# 🔹 Initialize LLMs
# Two Groq-hosted chat models share the same API key:
#   - llm_summarize: condenses individual search results.
#   - llm_answer: writes the final answer from the structured findings.
llm_summarize = ChatGroq(
    model_name="mixtral-8x7b-32768",
    groq_api_key=groq_api_key,
)
llm_answer = ChatGroq(
    model_name="deepseek-r1-distill-llama-70b",
    groq_api_key=groq_api_key,
)



In [19]:
# 🔹 Define AI Agent State
class State(BaseModel):
    """Represents the state of the AI agent workflow.

    Carries the user's query, the raw search results, the structured
    (summarized) results, and the final answer between workflow steps.
    Every field has an empty default, so ``State()`` is a valid blank state.
    """

    # The research question entered by the user.
    query: str = Field(default="")
    # Raw search hits wrapped as Documents (filled by the fetch step).
    search_results: list[Document] = Field(default_factory=list)
    # One entry per search hit after cleaning and summarizing (filled by the
    # processing step).
    structured_results: list = Field(default_factory=list)
    # Text of the final generated answer (filled by the answer step).
    final_answer: str = Field(default="")




In [20]:
# 🔹 STEP 1: Fetch Search Results from Tavily API
def fetch_tavily_results(state: State):
    """Fetch search results for ``state.query`` from the Tavily API.

    Each usable hit becomes a ``Document`` whose ``page_content`` is the hit's
    ``"content"`` value and whose ``metadata`` is the complete hit dict.
    Hits that are not dicts or that have no (or empty) ``"content"`` are
    skipped, since there is nothing to summarize for them.

    Args:
        state: Current workflow state; only ``state.query`` is read.

    Returns:
        The same ``state`` object with ``search_results`` replaced.

    Raises:
        RuntimeError: If the search tool returns anything other than a list.
    """
    tavily = TavilySearchResults(api_key=tavily_api_key)
    docs = tavily.run(state.query)

    # NOTE(review): the tool appears able to hand back an error description
    # (a string) instead of raising -- confirm against the installed version.
    # Iterating such a string would index single characters with "content"
    # and fail with an unhelpful TypeError, so fail loudly and clearly here.
    if not isinstance(docs, list):
        raise RuntimeError(
            f"Tavily search failed for query {state.query!r}: {docs}"
        )

    state.search_results = [
        Document(page_content=d["content"], metadata=d)
        for d in docs
        if isinstance(d, dict) and d.get("content")
    ]
    return state


In [21]:
# 🔹 STEP 2: Clean & Preprocess Text
def clean_text(text):
    """Strip URLs, punctuation and surplus whitespace from ``text``.

    The substitutions run in this order:

    1. every run that starts with ``http`` and continues to the next
       whitespace is deleted;
    2. every character that is neither a word character nor whitespace is
       deleted (nothing is inserted in its place, so ``3/2/2025`` becomes
       ``322025``);
    3. each run of whitespace collapses to a single space.

    Leading and trailing whitespace is trimmed from the result.
    """
    substitutions = (
        (r"http\S+", ""),   # URLs
        (r"[^\w\s]", ""),   # special characters
        (r"\s+", " "),      # extra spaces
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()



In [22]:
# 🔹 STEP 3: Summarize Search Results
def summarize_text(text):
    """Summarize ``text`` with the ``llm_summarize`` model.

    The text is split into chunks of at most 1000 characters with a
    100-character overlap, each chunk is wrapped in a ``Document``, and the
    documents are passed through a ``map_reduce`` summarize chain.

    Args:
        text: The text to summarize.

    Returns:
        The summary produced by the chain, or ``""`` when ``text`` is empty
        or contains only whitespace.
    """
    # Nothing to summarize: skip building the chain and calling the LLM on
    # zero documents, which could only yield a summary of nothing.
    if not text or not text.strip():
        return ""

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_text(text)
    docs = [Document(page_content=t) for t in texts]

    summarize_chain = load_summarize_chain(llm_summarize, chain_type="map_reduce")
    summary = summarize_chain.run(docs)
    return summary



In [23]:
# 🔹 STEP 4: Process & Structure Search Results
def process_search_results(state: State):
    """Turn every raw search result into a small JSON-friendly record.

    For each document in ``state.search_results`` the page content is cleaned
    with ``clean_text`` and condensed with ``summarize_text``. The resulting
    record has three keys: ``"title"`` (metadata title, or
    ``"No Title Found"``), ``"url"`` (metadata url, or ``""``) and
    ``"summary"``.

    The list of records is stored in ``state.structured_results`` and the
    same ``state`` object is returned.
    """

    def _to_record(document):
        # Summarize first, then assemble the record from the metadata.
        digest = summarize_text(clean_text(document.page_content))
        meta = document.metadata
        return {
            "title": meta.get("title", "No Title Found"),
            "url": meta.get("url", ""),
            "summary": digest,
        }

    state.structured_results = [_to_record(hit) for hit in state.search_results]
    return state



In [24]:
# 🔹 STEP 5: Generate Final Answer
def generate_final_answer(state: State):
    """Generate the final answer from the structured research findings.

    ``state.structured_results`` is serialized to indented JSON, embedded in
    a "Summarize the following research findings" prompt and sent to
    ``llm_answer``.

    Args:
        state: Current workflow state; ``state.structured_results`` is read.

    Returns:
        The same ``state`` object with ``final_answer`` set to the message
        content when the model returns an ``AIMessage``, otherwise to the raw
        response.
    """
    # ensure_ascii=False keeps non-ASCII text (e.g. Devanagari) readable in
    # the prompt; the default would turn each such character into a \uXXXX
    # escape, inflating the prompt and obscuring the text for the model.
    findings = json.dumps(state.structured_results, indent=4, ensure_ascii=False)
    prompt = f"Summarize the following research findings:\n\n{findings}"
    response = llm_answer.invoke(prompt)

    state.final_answer = response.content if isinstance(response, AIMessage) else response
    return state



In [28]:
# 🔹 STEP 6: Define Multi-Step AI Workflow
workflow = StateGraph(State)

# 🔹 Pipeline stages, registered as graph nodes in execution order
_pipeline_nodes = (
    ("fetch_results", fetch_tavily_results),        # query Tavily
    ("process_results", process_search_results),    # clean, summarize, structure
    ("generate_answer", generate_final_answer),     # write the final answer
)
for _node_name, _node_fn in _pipeline_nodes:
    workflow.add_node(_node_name, _node_fn)

# 🔹 Linear hand-off between stages: fetch -> process -> answer
_pipeline_edges = (
    ("fetch_results", "process_results"),
    ("process_results", "generate_answer"),
)
for _upstream, _downstream in _pipeline_edges:
    workflow.add_edge(_upstream, _downstream)

# 🔹 The graph starts at the fetch stage and finishes after the answer stage
workflow.set_entry_point("fetch_results")
workflow.set_finish_point("generate_answer")

# 🔹 Compile the graph into the runnable research system
research_ai_system = workflow.compile()


In [29]:
# 🔹 Run the AI Chatbot in Real-Time
def run_agent(query):
    """Run the compiled research workflow once for ``query``.

    A fresh ``State`` seeded with the query is passed to
    ``research_ai_system.invoke``. The result is read like a mapping:
    ``"structured_results"`` (default ``[]``) and ``"final_answer"``
    (default ``""``).

    Returns:
        A dict with three keys: ``"query"`` (the original query),
        ``"structured_results"`` and ``"final_answer"``.
    """
    outcome = research_ai_system.invoke(State(query=query))

    return {
        "query": query,
        "structured_results": outcome.get("structured_results", []),
        "final_answer": outcome.get("final_answer", ""),
    }


In [30]:

# Real-Time Interactive Chat Loop


while True:
    # Prompt the user for a research query. Surrounding whitespace is
    # stripped so that inputs such as " quit " are still recognized.
    user_input = input("\nEnter your query (type 'quit' to exit): ").strip()

    # Check for exit command
    if user_input.lower() in ("quit", "q", "exit"):
        print("Session ended. Have a great day!")
        break

    # A blank line is not a query: prompt again instead of spending a search
    # and several LLM calls on an empty string.
    if not user_input:
        continue

    # Process the query using the AI agent
    response = run_agent(user_input)

    # Display Research Summary: one numbered entry per structured result
    print("\n--- Research Summary ---")
    for idx, res in enumerate(response["structured_results"], start=1):
        print(f"\n{idx}. Title: {res['title']}")
        print(f"URL: {res['url']}")
        print(f"Summary: {res['summary']}")

    # Display Final AI-Generated Answer
    print("\n--- Final Answer ---")
    print(response["final_answer"])



Enter your query (type 'quit' to exit): ddhananjay munde rajinama

--- Research Summary ---

1. Title: No Title Found
URL: https://www.deccanherald.com/india/maharashtra/beed-sarpanch-murder-mundes-resignation-will-be-announced-before-budget-session-claims-estranged-wife-karuna-sharma-3428279
Summary: Karuna Munde, via a Facebook post, suggested that Dhananjay Munde should resign from his position due to the likelihood of his name being included in a list (possibly 322025) associated with alleged scandal or wrongdoing.

2. Title: No Title Found
URL: https://www.instagram.com/lokshahimarathi/reel/DFXqH7yt1W0/dhananjay-munde-on-rajinama-%E0%A4%AE%E0%A5%80-%E0%A4%B0%E0%A4%BE%E0%A4%9C%E0%A5%80%E0%A4%A8%E0%A4%BE%E0%A4%AE%E0%A5%8D%E0%A4%AF%E0%A4%BE%E0%A4%B5%E0%A4%B0-%E0%A4%89%E0%A4%A4%E0%A5%8D%E0%A4%A4%E0%A4%B0-%E0%A4%A6%E0%A5%87%E0%A4%A3%E0%A4%BE%E0%A4%B0-%E0%A4%A8%E0%A4%BE%E0%A4%B9%E0%A5%80-%E0%A4%A7%E0%A4%A8%E0%A4%82%E0%A4%9C%E0%A4%AF-%E0%A4%AE%E0%A5%81%E0%A4%82%E0%A4%A1%E0%A5%87-%E0%A4%