In [None]:
### Step 1: Install Required Libraries
# Quiet (-q) installs of LangChain, its integration packages, the FAISS vector
# store and pypdf. No versions are pinned, so a fresh run installs whatever is
# current at that moment.
# NOTE(review): the import cell uses legacy `langchain.*` module paths — confirm
# they still resolve against the versions these commands pull in.
!pip install -q langchain faiss-cpu tiktoken
!pip install -q langchain_huggingface
!pip install -q langchain_community
!pip install -q langchain_google_community
!pip install -q pypdf
!pip install -q langchain_groq

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m24.3 MB/s[0m eta [36m0:00:0

In [None]:
### Step 2: Import Necessary Modules
import os
import requests
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA, create_retrieval_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.tools import Tool
from langchain.agents import create_react_agent, AgentExecutor
from langchain.llms import HuggingFaceHub
from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain_groq import ChatGroq
from langchain_community.utilities import GoogleSerperAPIWrapper

In [None]:
### Step 3: Configure API Keys and Environment Variables
from google.colab import userdata

# Look every required secret up first and validate BEFORE touching os.environ:
# assigning a None value to os.environ raises TypeError, which would pre-empt
# the clearer ValueError below and hide which key is actually missing.
_api_secrets = {
    name: userdata.get(name)
    for name in ("GROQ_API_KEY", "SERPER_API_KEY")
}

_missing_keys = [name for name, value in _api_secrets.items() if not value]
if _missing_keys:
    raise ValueError(
        "One or more API keys are missing. Ensure keys are added to Colab userdata. "
        f"Missing: {', '.join(_missing_keys)}"
    )

# Mirror the validated secrets into the process environment under the same
# names, where later cells read them via os.getenv.
os.environ.update(_api_secrets)

In [None]:
### Step 4: Define RAG Prompt Template
# System instructions for the question-answering step. The trailing {context}
# placeholder is where the retrieved chunks are substituted.
system_prompt = (
    "You are an assistant for question-answering tasks. Use the following "
    "pieces of retrieved context to answer the question. If you don't know "
    "the answer, say that you don't know. Use three sentences maximum and "
    "keep the answer concise."
    "\n\n{context}"
)

# Two-message chat prompt: the system instructions, then the user's {input}.
rag_prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [None]:
# Function to process a PDF file and extract text
def process_pdf(pdf_path):
    """Load a PDF into LangChain documents via ``PyPDFLoader``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(pdf_path).load()`` yields, or an empty list if
        anything goes wrong (the failure is printed rather than raised).
    """
    try:
        pages = PyPDFLoader(pdf_path).load()
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print(f"Failed to extract text from PDF {pdf_path}: {e}")
        return []
    return pages

# Function to process a TXT file and extract text
def process_txt(txt_path):
    """Load a plain-text file into LangChain documents via ``TextLoader``.

    Args:
        txt_path: Path of the ``.txt`` file to read.

    Returns:
        The list produced by ``TextLoader(txt_path).load()``, or an empty list
        if loading fails (the failure is printed rather than raised).
    """
    try:
        # TextLoader is not part of the notebook's import cell. Without this
        # import the name lookup itself fails, the broad except below swallows
        # the NameError, and every TXT file is silently skipped.
        from langchain_community.document_loaders import TextLoader

        return TextLoader(txt_path).load()
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print(f"Failed to extract text from TXT file {txt_path}: {e}")
        return []

# Function to process files and prepare vector store
def process_files_and_create_retriever(folder_path):
    """Index every PDF and TXT file under ``folder_path`` and return a retriever.

    The folder is walked recursively. Supported files are loaded with
    ``process_pdf`` / ``process_txt``, split into 500-character chunks with a
    50-character overlap, embedded with the MiniLM sentence-transformer and
    stored in an in-memory FAISS index.

    Args:
        folder_path: Root directory to scan.

    Returns:
        A similarity retriever over the indexed chunks, configured to return
        the top 3 matches.

    Raises:
        ValueError: If nothing could be loaded or no chunks were produced, so
            there is nothing to index.
    """
    import os

    all_documents = []

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            # splitext only reports a genuine suffix, so an extension-less file
            # that happens to be named "pdf" or "txt" is not treated as one.
            ext = os.path.splitext(file)[1].lower()

            if ext == ".pdf":
                all_documents.extend(process_pdf(file_path))
            elif ext == ".txt":
                all_documents.extend(process_txt(file_path))
            else:
                print(f"Unsupported file type for {file}")

    # Fail early with a clear message instead of letting the vector store
    # choke on an empty corpus further down.
    if not all_documents:
        raise ValueError(
            f"No PDF or TXT content could be loaded from {folder_path!r}."
        )

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = text_splitter.split_documents(all_documents)
    if not docs:
        raise ValueError(
            f"Files under {folder_path!r} produced no text chunks to index."
        )

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_documents(docs, embeddings)

    # The result count is passed through `search_kwargs`; a bare `search_k`
    # keyword is not an option the retriever understands.
    return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
from langchain.schema import Document
### Step 6: Web Scraping Function
def scrape_website(url, timeout=10):
    """Fetch ``url`` and return the visible text of the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the notebook cell forever.

    Returns:
        The text extracted by BeautifulSoup's ``get_text()`` on success. On any
        ``requests`` failure (connection error, timeout, non-2xx status) the
        string ``"Error scraping <url>: <reason>"`` is returned instead of
        raising, so callers receive a string either way.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"Error scraping {url}: {e}"
    return BeautifulSoup(response.content, "html.parser").get_text()



### Step 7: Prepare Web Data for Retrieval
# Define URLs to scrape
urls = [
    "https://medium.com/@jagadeesan.ganesh/agentic-rag-with-langchain-revolutionizing-ai-with-dynamic-decision-making-ff1dee6df4ca",
    "https://medium.com/the-ai-forum/implementing-agentic-rag-using-langchain-b22af7f6a3b5",
    "https://www.bbc.com/news/technology",
    "https://www.dawn.com/tech"
]

# Scrape each page in turn. scrape_website returns an "Error scraping ..."
# string on failure, so a failed page still contributes that text here.
web_data_list = []
for url in urls:
    print(f"Scraping: {url}")
    web_data = scrape_website(url)
    web_data_list.append(web_data)

combined_web_data = "\n".join(web_data_list)

# Wrap the combined text in a single Document so it can be chunked
web_document = Document(page_content=combined_web_data, metadata={"source": "web"})

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents([web_document])

# Create vector store for web data
web_vector_store = FAISS.from_documents(
    docs, HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)
# The number of hits is configured through `search_kwargs`; a bare `search_k`
# keyword is not an option the retriever understands.
retriever = web_vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

Scraping: https://medium.com/@jagadeesan.ganesh/agentic-rag-with-langchain-revolutionizing-ai-with-dynamic-decision-making-ff1dee6df4ca
Scraping: https://medium.com/the-ai-forum/implementing-agentic-rag-using-langchain-b22af7f6a3b5
Scraping: https://www.bbc.com/news/technology
Scraping: https://www.dawn.com/tech


In [None]:
### Step 8: Initialize ChatGroq Model
# Groq-hosted chat model at temperature 0; the API key is read back from the
# environment variable populated in Step 3.
chat_model = ChatGroq(
    model="llama3-8b-8192",
    groq_api_key=os.environ.get("GROQ_API_KEY"),
    temperature=0,
)



### Step 9: Create Retrieval QA Chain
# First the chain that stuffs retrieved documents into `rag_prompt`, then the
# retrieval chain that feeds it from the web retriever.
question_answer_chain = create_stuff_documents_chain(chat_model, rag_prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
from langchain_community.utilities import GoogleSerperAPIWrapper

# Initialize the Google search wrapper (Serper-backed) used by the agent's
# "Google Search" tool.
# NOTE(review): presumably picks up SERPER_API_KEY from the environment set in
# Step 3 — confirm against the wrapper's documentation.
search = GoogleSerperAPIWrapper()


# Tool function for document retrieval from processed PDF and TXT files.
# Retrievers are cached per folder so the files are loaded and embedded once,
# not on every agent call.
_FILE_RETRIEVER_CACHE = {}


def file_retriever_tool(query, folder_path="/content/drive/MyDrive/Chatbot"):
    """Return the text of the indexed file chunks most relevant to ``query``.

    Args:
        query: Natural-language query from the agent.
        folder_path: Folder whose PDF/TXT files make up the knowledge base
            (set your folder path here).

    Returns:
        The ``page_content`` of the retrieved documents joined by newlines.

    Note:
        The index for a folder is built on first use and then reused; files
        added to the folder afterwards are not seen until the cache entry is
        cleared (``_FILE_RETRIEVER_CACHE.pop(folder_path, None)``).
    """
    retriever = _FILE_RETRIEVER_CACHE.get(folder_path)
    if retriever is None:
        retriever = process_files_and_create_retriever(folder_path)
        _FILE_RETRIEVER_CACHE[folder_path] = retriever

    # `invoke` is the current retriever entry point; `get_relevant_documents`
    # is deprecated.
    results = retriever.invoke(query)
    return "\n".join(doc.page_content for doc in results)

def rag_web_tool(query):
    """Answer ``query`` from the scraped web pages via the retrieval chain.

    An agent tool is called with a plain string, whereas the retrieval chain
    expects a mapping with an "input" key and returns a mapping. This adapter
    wraps the query accordingly and hands back only the generated answer text.
    """
    return rag_chain.invoke({"input": query})["answer"]


# Define tools for the agent
tools = [
    Tool(
        name="RAG For Web",
        func=rag_web_tool,  # Adapter around rag_chain (string in, answer text out)
        description="Useful when you're asked Retrieval Augmented Generation (RAG) related questions."
    ),
    Tool(
        name="Google Search",
        description="For answering questions not in the knowledge base, use Google search.",
        func=search.run  # Google search tool function
    ),
    Tool(
        name="File Retriever",
        description="Retrieves documents from processed PDFs and TXT files in a folder.",
        func=file_retriever_tool  # Function to retrieve documents from folder
    ),
]

In [None]:
from langchain.prompts.prompt import PromptTemplate

# ReAct-style agent prompt. Placeholders: {tools}, {tool_names}, {chat_history},
# {input} and {agent_scratchpad}. {chat_history} matches the memory_key of the
# conversation memory configured below; {input} is the key used when the agent
# is invoked.
# NOTE(review): {tools}, {tool_names} and {agent_scratchpad} are presumably
# filled in by create_react_agent — confirm.
# NOTE(review): the format steps are written as a numbered list ("2. Action:",
# "3. Action Input:"); if the model echoes the numbers, the agent's output
# parser may misread the tool name — verify, or drop the numbering.
character_prompt = """Answer the following questions as best you can. You have access to the following tools:
{tools}

For any questions requiring tools, you should first search the provided knowledge base. If you don't find relevant information from provided knowledge base, then use Google search to find related information.

To use a tool, you MUST use the following format:
1. Thought: Do I need to use a tool? Yes
2. Action: the action to take, should be one of [{tool_names}]
3. Action Input: the input to the action
4. Observation: the result of the action

When you have a response to say to the Human, or if you do not need to use a tool, you MUST use the following format:
1. Thought: Do I need to use a tool? No
2. Final Answer: [your response here]

It's very important to always include the 'Thought' before any 'Action' or 'Final Answer'. Ensure your output strictly follows the formats above.

Begin!

Previous conversation history:
{chat_history}

Question: {input}
Thought: {agent_scratchpad}
"""

# Build a plain-text PromptTemplate from the string above
prompt = PromptTemplate.from_template(character_prompt)

# Build the ReAct agent from the chat model, the tool list and the prompt.
agent = create_react_agent(
    llm=chat_model,  # LLM instance
    tools=tools,     # List of tools
    prompt=prompt    # Text prompt template defined above
)

# Rolling conversation memory: keeps the last k=5 exchanges and exposes them
# under `chat_history`. The agent prompt is a plain-text template, so the
# history is returned as a formatted string (return_messages=False); with
# message objects, their Python repr would be pasted into the prompt instead.
# As a consequence, `chat_history` in the executor's result is a string too.
memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=False,
    output_key="output"
)

# Executor that runs the agent loop: at most 5 tool iterations, parsing errors
# handled rather than raised, and the intermediate steps printed (verbose).
agent_chain = AgentExecutor(
    agent=agent,
    tools=tools,
    memory=memory,
    max_iterations=5,
    handle_parsing_errors=True,
    verbose=True,
)

  memory = ConversationBufferWindowMemory(


In [None]:
# Key under which the user's question is passed to the agent executor
expected_input_key = "input"  # Must match the {input} placeholder in the agent prompt

# Example run: ask for today's news and print the full result returned by invoke
response = agent_chain.invoke({expected_input_key: "can you give me news of today?"})
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: Google Search
Action Input: "latest news"[0m[33;1m[1;3mView the latest news and breaking news today for U.S., world, weather, entertainment, politics and health at CNN.com. Breaking News, Latest News and Current News from FOXNews.com. Breaking news and video. Latest Current News: U.S., World, Entertainment, Health, Business, ... Read full articles, watch videos, browse thousands of titles and more on the "Home page" topic with Google News. Go to NBCNews.com for breaking news, videos, and the latest top stories in world news, business, politics, health and pop culture. Your trusted source for breaking news, analysis, exclusive interviews, headlines, and videos at ABCNews.com. Read the latest headlines, breaking news, and videos at APNews.com, the definitive source for independent journalism from every corner of the globe. The latest transcripts, proclamations, executive orders, 

In [None]:
# Key under which the user's question is passed to the agent executor
# (same value as in the previous example cell)
expected_input_key = "input"  # Must match the {input} placeholder in the agent prompt

# Example run: a weather question that explicitly asks for a Google search
response = agent_chain.invoke({expected_input_key: "can you give weather of Karachi today do google search?"})
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: Google Search
Action Input: "weather in Karachi today"[0m[33;1m[1;3m73°F[0m[32;1m[1;3mThought: Do I need to use a tool? No
Final Answer: The weather in Karachi today is 23°C.[0m

[1m> Finished chain.[0m
{'input': 'can you give weather of Karachi today do google search?', 'chat_history': [HumanMessage(content='What is can you give me news of today?', additional_kwargs={}, response_metadata={}), AIMessage(content='You can visit the websites mentioned in the observation, such as CNN.com, FOXNews.com, Google News, NBCNews.com, ABCNews.com, APNews.com, CBS News, and Reuters.com, to get the latest news of today. You can also visit BBC News for up-to-the-minute news and breaking news.', additional_kwargs={}, response_metadata={}), HumanMessage(content='can you give weather of Karachi today?', additional_kwargs={}, response_metadata={}), AIMessage(content='The weather in Karachi 

**References:**

https://peaceful0907.medium.com/build-your-customized-chatbot-with-rag-and-langchain-agent-0eae1923702e

https://python.langchain.com/docs/integrations/tools/google_serper/