In [1]:
"""!pip install langchain langgraph langsmith langchain_groq langchain_community langchain-tavily chromadb"""

'!pip install langchain langgraph langsmith langchain_groq langchain_community langchain-tavily chromadb'

In [16]:
groq_api_key="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"

In [17]:
import uuid
from typing import Annotated, TypedDict, Dict, Any
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_tavily import TavilySearch
from langchain_core.tools import tool
from langgraph.graph import StateGraph, START
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph.message import add_messages
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
from bs4 import BeautifulSoup
import requests
import json


In [18]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama-3.1-8b-instant", groq_api_key=groq_api_key)

In [19]:
@tool
def scrape_website(url: str, prompt: str = "Extract key information in JSON format") -> Dict[str, Any]:
    """Scrape a website using BeautifulSoup and proceses content with Groq's llama model"""
    try:
        # Fetch web page content
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        raw_content = soup.get_text(strip=True)
        #Use groq's llama model to process the content based on the prompt
        system_message = SystemMessage(
            content="You are a function-calling AI model designed to extract structured data from text"
            "Based on the user prompt, extract the requested information from the provided content"
            "and return it in JSON format."
        )
        user_message = HumanMessage(
            content=f"Content: {raw_content[:2000]}\n\nPrompt: {prompt}\n\n Extract the requested information in JSON format."
        )
        
        response = llm.invoke([system_message, user_message])
        #Ensure the response is parsed as json
        try:
            extracted_data = json.loads(response.content)
        except json.JSONDecodeError:
            extracted_data = {"raw_content": response.content}
        
        return {"data": extracted_data, "error": None}
    except Exception as e:
        return {"data": None, "error": str(e)}

In [20]:
class State(TypedDict):
    messages: Annotated[list, add_messages]
    conversation_id: str

tools = [scrape_website]
llm_with_tools = llm.bind_tools(tools)

In [21]:
#Initialize chroma DB client
chroma_client = chromadb.PersistentClient(
    path="./chroma_db",
    settings=Settings(anonymized_telemetry=False)
)

embedding_function = embedding_functions.DefaultEmbeddingFunction()

collection = chroma_client.create_collection(
    name="conversation_history_1",
    embedding_function=embedding_function
)

def store_in_chroma(state: State):
    """Store conversation messages in chromaDB"""
    conversation_id = state.get("conversation_id", str(uuid.uuid4()))
    messages = state["messages"]
    
    #convert messages to text for embedding
    message_texts = [f"{msg.type}: {msg.content}" for msg in messages]
    
    #Store in ChromaDB 
    collection.add(
        documents=message_texts,
        metadatas=[{"conversation_id": conversation_id, "index": i} for i in range(len(messages))],
        ids=[f"{conversation_id}_{i}" for i in range(len(messages))]
    )
    
    return conversation_id

InternalError: Collection [conversation_history_1] already exists

In [22]:
def retrieve_context(state: State) -> str:
    """Retrieve relevant context from chromaDB"""
    conversation_id = state.get("conversation_id", "")
    last_message = state["messages"][-1].content if state["messages"] else ""
    
    if not conversation_id or not last_message:
        return ""
    
    # Query chromaDB for relevant context
    results = collection.query(
        query_texts=[last_message],
        n_results=3,
        where={"conversation_id": conversation_id}
    )
    
    #combine relevant context
    context = "\n".join([doc for doc in results['documents'][0] if doc])
    return context

In [23]:
def tool_calling_llm(state: State):
    # Retrieve relevant context from chromaDB
    context = retrieve_context(state)
    
    # Add context to the prompt if available
    if context:
        prompt = f"Context from previous conversations: \n{context}\n\nCurrent messages: \n{state["messages"]}"
    else:
        prompt = state["messages"]
    
    # Store conversation in chromaDB and get conversation_id
    conversation_id = store_in_chroma(state)
    
    #Update the state with conversation_id
    state["conversation_id"] = conversation_id
    
    return {
        "messages": [llm_with_tools.invoke(prompt)],
        "conversation_id": conversation_id
    }
    

In [24]:
from langgraph.prebuilt import ToolNode, tools_condition
from langgraph.graph import START, END

#initialize memory
memory = MemorySaver()

#Graph
builder = StateGraph(State)

builder.add_node("tool_calling_llm", tool_calling_llm)
builder.add_node("tools", ToolNode(tools))

#Add Edges
builder.add_edge(START, "tool_calling_llm")
builder.add_conditional_edges(
    "tool_calling_llm",
    tools_condition
)

builder.add_edge("tool_calling_llm", END)

#Compile the graph
graph = builder.compile(checkpointer=memory)

In [25]:
config = {"configurable": {"thread_id": "3"}}

response = graph.invoke({
    "messages": [HumanMessage(content="Please tell me about todays Bangladeshi news in bengali")],
    "conversation_id": str(uuid.uuid4())
}, config)

In [27]:
response["messages"][-1]

ToolMessage(content='{"data": {"raw_content": "প্রথম আলো | বাংলা নিউজ পেপারের বর্তমান সংবাদের মূল বিবরণগুলি নিম্নরূপ:\\n\\n```json\\n{\\n  \\"সর্বশেষ\\": [\\n    {\\n      \\"শিরোনাম\\": \\"জামায়াতের টিকিট কাটলে জান্নাতের টিকিট কাটা হবে, কোথায় আছে বলুক তারা: মির্জা ফখরুল\\",\\n      \\"বিবরণ\\": \\"প্রধানমন্ত্রী মির্জা ফখরুল ইসলাম বলেছেন, বাংলাদেশের মানুষ বোঝে ওয়ান ম্যান ওয়ান ভোট৷\\"\\n    },\\n    {\\n      \\"শিরোনাম\\": \\"৫.৭ থেকে যদি ৬ মাত্রার ভূমিকম্প হয়, মহাপ্রলয় হবে ঢাকায়৷\\",\\n      \\"বিবরণ\\": \\"আবহাওয়া অফিস বলছে, সকাল ১০টা ৩৬ মিনিট ১২ সেকেন্ডে ভূমিকম্পটি অনুভূত হয়েছে\\"\\n    },\\n    {\\n      \\"শিরোনাম\\": \\"ভারতে কেন্দ্রীয় ব্যাংকের কর্মকর্তা সেজে ডাকাতি, ৭ কোটি রুপি লুট\\",\\n      \\"বিবরণ\\": \\"ভারতের কেন্দ্রীয় ব্যাংকের কর্মকর্তা সেজে ডাকাতি করে পুলিশ ধরেছে\\"\\n    },\\n    {\\n      \\"শিরোনাম\\": \\"কাশ্মীরে একজন বাবা কেন গায়ে আগুন দিয়ে আত্মাহুতি দিলেন\\",\\n      \\"বিবরণ\\": \\"ভারত অধিকৃত কাশ্মীরে এ ধরনের প্রথম ঘটনা হচ্ছে\\"\\n    },\\n    {\\n      \\"শিরোন