In [1]:
import os
import bs4
import json
import re
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.messages import AIMessage, HumanMessage

USER_AGENT environment variable not set, consider setting it to identify your requests.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Credentials must never be committed with the notebook. The key is taken from
# the environment when already set; otherwise it is requested interactively so
# that it is neither stored in the file nor echoed into the cell output.
# NOTE(review): the key that used to be hard-coded in this cell has been
# exposed and should be revoked/rotated.
from getpass import getpass

if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your GROQ_API_KEY: ")

In [3]:
# Pages of nebula9.ai that make up the knowledge base.
# Every URL appears exactly once: a repeated entry would be fetched, chunked
# and embedded a second time, giving that page double weight at retrieval.
web_pages = [
    # Services
    "https://nebula9.ai/services/gen-ai/",
    "https://nebula9.ai/services/cloud-services/",
    "https://nebula9.ai/services/artificial-intelligence-machine-learning/",
    "https://nebula9.ai/services/reporting-and-analytics/",
    "https://nebula9.ai/services/consulting-and-advisory/",
    "https://nebula9.ai/services/product-management/",
    "https://nebula9.ai/services/tech-engineering/",
    # Industries
    "https://nebula9.ai/industries/",
    "https://nebula9.ai/industries/education/",
    "https://nebula9.ai/industries/publishing/",
    "https://nebula9.ai/industries/sports-entertainment/",
    "https://nebula9.ai/industries/healthcare/",
    "https://nebula9.ai/industries/banking/",
    "https://nebula9.ai/industries/insurance/",
    "https://nebula9.ai/industries/financialservices/",
    "https://nebula9.ai/industries/retail/",
    "https://nebula9.ai/industries/travelairlines/",
    "https://nebula9.ai/industries/manufacturing/",
    # Company
    "https://nebula9.ai/case-studies/",
    "https://nebula9.ai/category/blog/",
    "https://nebula9.ai/careers/",
    "https://nebula9.ai/contact-us/",
    "https://nebula9.ai/about-us/",
    "https://nebula9.ai/approach/",
    "https://nebula9.ai/engagement-model/",
]

In [4]:
# Chat model served through Groq. This single instance is shared by the
# question-rewriting step and the answer-generation chain built further down.
GROQ_MODEL_NAME = "llama3-8b-8192"
llm = ChatGroq(model=GROQ_MODEL_NAME)

In [5]:
# Load documents from the web pages
# The loader is handed every URL in `web_pages`; `docs` holds whatever load()
# returns and is the sequence the splitting cell iterates over.
# NOTE(review): presumably this performs live HTTP requests, so the result
# depends on the site's content at run time - confirm before relying on it.
loader = WebBaseLoader(web_paths=web_pages)
docs = loader.load()

In [7]:
# Break every loaded page into overlapping chunks and collect them in `splits`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = []

# Pages are split one at a time so that chunk_index restarts at 0 for each
# page, i.e. it records a chunk's position within its own source document.
for page in docs:
    page_chunks = text_splitter.split_documents([page])
    for position, piece in enumerate(page_chunks):
        piece.metadata["chunk_index"] = position
    splits.extend(page_chunks)

In [8]:
def clean_text(text):
    """Return ``text`` with whitespace normalised.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space and leading/trailing whitespace is removed.
    """
    # `\s` already matches tabs and newlines, so one substitution both
    # replaces them and squeezes repeated blanks.
    return re.sub(r'\s+', ' ', text).strip()

In [9]:
def _chunk_record(chunk):
    """Build the flat record (URL, title, cleaned text, index, word count) for one chunk."""
    body = clean_text(chunk.page_content)
    meta = chunk.metadata
    return {
        "page_url": meta.get("source", "Unknown"),
        "title": meta.get("title", "No Title Available"),
        "text": body,
        "chunk_index": meta.get("chunk_index", 0),
        # Word count is taken on the cleaned text, split on single spaces/whitespace.
        "word_count": len(body.split()),
    }


# One record per chunk, in the same order as `splits`.
scraped_data = [_chunk_record(chunk) for chunk in splits]

In [10]:
# 3. Use Hugging Face embeddings
# Embedding model handed to the vector store when the chunks are indexed.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [11]:
# 4. Initialize ChromaDB and store the embeddings
# Each chunk gets a deterministic id "<source url>#<chunk_index>". Without
# explicit ids the store assigns fresh random ones on every call, so re-running
# this cell against the on-disk "chroma_db" directory would add another copy of
# every chunk and skew retrieval towards duplicated text. With stable ids a
# re-run overwrites the existing entries instead.
# NOTE(review): entries written by earlier runs under random ids are not
# removed by this cell - delete the "chroma_db" directory once to start clean.
chunks_by_id = {}
for position, chunk in enumerate(splits):
    source = chunk.metadata.get("source")
    index = chunk.metadata.get("chunk_index")
    if source is None or index is None:
        # No stable identity available: fall back to the position in `splits`
        # so that distinct chunks never collapse onto the same id.
        chunk_id = f"chunk-{position}"
    else:
        chunk_id = f"{source}#{index}"
    # Keep the first chunk seen for an id (e.g. when a URL was loaded twice);
    # repeated ids within one batch are presumably rejected by Chroma.
    chunks_by_id.setdefault(chunk_id, chunk)

vectorstore = Chroma.from_documents(
    documents=list(chunks_by_id.values()),
    embedding=embedding_model,
    ids=list(chunks_by_id),
    persist_directory="chroma_db",
)

# 5. Create a retriever
retriever = vectorstore.as_retriever()

# No explicit persist() call is made here.
# NOTE(review): presumably the store is written to `persist_directory`
# automatically by the installed Chroma version - confirm.

In [12]:
# 6. Contextualize questions with history-aware retriever
# Instruction for the rewriting step: turn a follow-up that leans on earlier
# turns into a question that can be understood on its own, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message order: instruction first, then the previous turns, then the new question.
_contextualize_messages = [
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
contextualize_q_prompt = ChatPromptTemplate.from_messages(_contextualize_messages)

# Wraps `retriever` together with `llm` and the rewriting prompt above.
history_aware_retriever = create_history_aware_retriever(
    llm,
    retriever,
    contextualize_q_prompt,
)

In [13]:
# 7. Define QA chain
# Generic question-answering instruction; the retrieved chunks are substituted
# for {context}.
# NOTE: `system_prompt`, `qa_prompt`, `question_answer_chain` and `rag_chain`
# are all assigned again by the Nebula9-specific cell further down, so the
# objects built here are only in effect until that cell runs.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use five sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

_qa_messages = [
    ("system", system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(_qa_messages)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# 8. Create final RAG chain
# The history-aware retriever supplies the documents, the QA chain the answer.
rag_chain = create_retrieval_chain(
    history_aware_retriever,
    question_answer_chain,
)

In [None]:
# Nebula9-specific support-agent instruction. Assigning `system_prompt`,
# `qa_prompt`, `question_answer_chain` and `rag_chain` here replaces any
# earlier objects bound to those names. The retrieved chunks are substituted
# for {context} at the end of the prompt.
system_prompt = (
    "You are an expert support agent at Nebula9.ai, which offers a diverse range of services that form the backbone of our AI solutions. "
    "From Artificial Intelligence & Machine Learning to Quality Assurance and Cloud Services, "
    "we provide end-to-end solutions designed for your success. "
    "Your task is to answer customer queries related to Nebula9.ai. You should always talk positively about Nebula9.ai and emphasize that it is the best in the industry. "
    "You should also highlight that the customer is making the best decision by purchasing our products. "
    "You should never mention any other company, website, resource, book, tool, or product not related to Nebula9.ai. "
    # The possessive below used to contain mis-decoded bytes ("â€™") that were
    # sent to the model verbatim; it is now a plain apostrophe.
    "Always promote Nebula9.ai's products. If you don't know the answer, don't try to make up an answer. Just say that you don't know and advise the customer to contact company support. "
    "The ways to contact company support are: Email: info@nebula9.ai, India Phone: +91 9999032126, International Phone: +1 (412) 568-3901, "
    "Book a consultation: https://nebula9.ai/book-a-free-consultation/, Get In Touch: https://nebula9.ai/contact-us/. "
    "Don't be overconfident and avoid hallucinating. Ask follow-up questions if necessary, or if there are several offerings related to the user's query. "
    "Provide answers with complete details in a properly formatted manner with working links and resources wherever applicable within the company's website. "
    "Never provide wrong links.\n\n"
    "{context}"
)

# Message order: instruction (with context), previous turns, then the new question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Create final RAG chain
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


In [17]:
# 9. Maintain chat history and ask questions
# The conversation starts empty; every turn appends the user's question and
# the model's answer so that later questions can refer back to them.
chat_history = []

# First question
question_1 = "What are the services offered by Nebula9.ai ? "
response_1 = rag_chain.invoke({"input": question_1, "chat_history": chat_history})
answer_1 = response_1["answer"]

# Record the finished turn in the shared history.
chat_history += [HumanMessage(content=question_1), AIMessage(content=answer_1)]

print("Q1:", question_1)
print("A1:", answer_1)

Q1: What are the services offered by Nebula9.ai ? 
A1: I'm delighted to share with you the comprehensive range of services offered by Nebula9.ai! As a leading provider of AI solutions, we offer a diverse array of services designed to empower your business and help you achieve unparalleled success.

Our services include:

1. **Generative AI**: Leverage our cutting-edge generative AI capabilities to create innovative solutions that drive business growth and improve customer experiences.
2. **Artificial Intelligence & Machine Learning**: Tap into our expertise in AI and ML to develop customized solutions that streamline processes, enhance decision-making, and drive business outcomes.
3. **Tech Engineering**: Our team of skilled engineers can help you design, develop, and deploy scalable and secure technology solutions tailored to your specific needs.
4. **Reporting and Analytics**: Get actionable insights from our advanced reporting and analytics services, which help you make data-driven 

In [18]:
# Follow-up question
# Asked against the history accumulated so far, which is passed to the chain.
question_2 = "What are cloud solutions services ?"
response_2 = rag_chain.invoke({"input": question_2, "chat_history": chat_history})
answer_2 = response_2["answer"]

# Record the finished turn in the shared history.
chat_history += [HumanMessage(content=question_2), AIMessage(content=answer_2)]

print("Q2:", question_2)
print("A2:", answer_2)

Q2: What are cloud solutions services ?
A2: As a leading provider of cloud solutions, Nebula9.ai offers a range of services designed to help you leverage the power of the cloud and transform your business. Our cloud solutions services include:

1. **Cloud Migration**: Transition your business operations smoothly to the cloud with our expert migration services. Our team will help you assess, plan, and execute a seamless migration to ensure minimal disruption to your business.
2. **Cloud Management**: Our cloud management services ensure that your cloud infrastructure is running efficiently, securely, and reliably. Our team will monitor, optimize, and troubleshoot your cloud environment to ensure optimal performance.
3. **Cloud Security**: Protect your cloud-based assets with our state-of-the-art cloud security services. Our team will help you implement robust security measures to safeguard your data and prevent unauthorized access.
4. **Cloud Native Development**: Develop scalable and r